From c69d2bbeddea61acfb382ea53c40e6ebdfa5c85d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 26 Oct 2018 19:20:27 +0800 Subject: [PATCH 0001/2367] Add base impl --- .../operators/fused_embedding_seq_pool_op.cc | 158 +++++++++++++ .../operators/fused_embedding_seq_pool_op.h | 207 ++++++++++++++++++ 2 files changed, 365 insertions(+) create mode 100644 paddle/fluid/operators/fused_embedding_seq_pool_op.cc create mode 100644 paddle/fluid/operators/fused_embedding_seq_pool_op.h diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc new file mode 100644 index 00000000000..ea960782919 --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused_embedding_seq_pool_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class LookupTableOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LookupTableOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + + auto output_dims = + framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. 
" + "The last dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); + AddAttr("is_distributed", + "(boolean, default false) distributed lookup table.") + .SetDefault(false); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. " + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(kNoPadding); + AddComment(R"DOC( +Lookup Table Operator. + +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. + +)DOC"); + } +}; + +class LookupTableOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { return "lookup_table_grad"; } +}; + +class LookupTableOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + } + block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, + ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker); +REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, + ops::LookupTableOpGradVarTypeInference); + +// REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, +// ops::LookupTableKernel); +// REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, +// ops::LookupTableGradKernel); +REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel); +REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel); diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h new file mode 100644 index 00000000000..6dcf4f44a71 --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +constexpr int64_t kNoPadding = -1; + +template +class LookupTableKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *ids_t = context.Input("Ids"); // int tensor + auto *output_t = context.Output("Out"); // float tensor + auto *table_var = context.InputVar("W"); + + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); + + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], row_number); + PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); + } + } + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto *table = table_t.value().data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_GE(ids[i], 0); + auto id_index = table_t.Index(ids[i]); + PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); + // memcpy(output + i * row_width, table + id_index * row_width, + // row_width * sizeof(T)); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); + } + } + } + } +}; + +template +class LookupTableGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter W of a LookupTable " + "must be either LoDTensor or SelectedRows"); + } + + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings 
makes no sense and we don't deal with it in backward. + if (is_sparse) { + // auto start = std::chrono::system_clock::now(); + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + int64_t ids_num = ids->numel(); + // auto end = std::chrono::system_clock::now(); + // std::chrono::duration diff = end - start; + + // auto copy_start = std::chrono::system_clock::now(); + std::vector new_rows; + new_rows.resize(ids_num); + std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); + // for (int64_t i = 0; i < ids_num; i++) { + // new_rows.push_back(ids_data[i]); + // } + // auto copy_end = std::chrono::system_clock::now(); + // std::chrono::duration copy_diff = copy_end - copy_start; + // diff += copy_diff; + // LOG(ERROR) << "run emb_grad copy end, cost: " << copy_diff.count() << " + // " << ids_num; + + // copy_start = std::chrono::system_clock::now(); + d_table->set_rows(new_rows); + + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_num, table_dim[1]}); + d_table_value->ShareDataWith(*d_output); + // d_table_value->mutable_data(context.GetPlace()); + + // // copy_end = std::chrono::system_clock::now(); + // // copy_diff = copy_end - copy_start; + // // diff += copy_diff; + // // LOG(ERROR) << "run emb_grad resize table end, cost: " << + // // copy_diff.count() << " " << ids_num; + + // // copy_start = std::chrono::system_clock::now(); + // d_table->set_height(table_dim[0]); + + // auto *d_output_data = d_output->data(); + // auto *d_table_data = d_table_value->data(); + + // // copy_end = std::chrono::system_clock::now(); + // // copy_diff = copy_end - copy_start; + // // diff += copy_diff; + // // LOG(ERROR) << "run emb_grad set height end, cost: " << + // // copy_diff.count() << " " << ids_num; + + // auto d_output_dims = d_output->dims(); + // PADDLE_ENFORCE_EQ( + // d_table_value->dims(), + // framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); + // // copy_start = std::chrono::system_clock::now(); + // auto blas = math::GetBlas(context); + // blas.VCOPY(d_output->numel(), d_output_data, d_table_data); + // cblas_scopy(d_output->numel(), d_output_data, 1, d_table_data, 1); + // // for (int i = 0; i != d_output->numel(), ++i) { + // // *(d_table_data++) = *(d_output_data++); + // // } + // // memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); + // // copy_end = std::chrono::system_clock::now(); + // // copy_diff = copy_end - copy_start; + // // diff += copy_diff; + // // LOG(ERROR) << "run emb_grad core end, cost: " << copy_diff.count() + // << " + // // " << ids_num << " " << d_output->numel(); + + // // LOG(ERROR) << "run emb_grad end, cost: " << diff.count(); + } else { + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + + int N = table_dim[0]; + int D = table_dim[1]; + + auto *d_output_data = d_output->data(); + auto *d_table_data = d_table->mutable_data(context.GetPlace()); + + memset(d_table_data, 0, d_table->numel() * sizeof(T)); + + for (int64_t i = 0; i < ids->numel(); ++i) { + PADDLE_ENFORCE_LT(ids_data[i], N); + PADDLE_ENFORCE_GE(ids_data[i], 0); + for (int j = 0; j < D; ++j) { + d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle -- GitLab 
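The patches that follow evolve this copied lookup_table implementation into the fused embedding + sequence sum-pool operator. As a reference for the forward semantics the series is building toward, here is a minimal numpy sketch (illustrative only, not part of any patch; the helper name is hypothetical, and `lod` is assumed to hold the 1-level LoD offsets of Ids, e.g. [0, 3, 4]):

    import numpy as np

    def fused_emb_seq_pool_ref(table, ids, lod):
        # table: [vocab_size, emb_size] float; ids: [seq_len, 1] int64.
        # One output row per LoD segment; the "sum" combiner adds up
        # the embeddings of all ids inside a segment.
        out = np.zeros((len(lod) - 1, table.shape[1]), dtype=table.dtype)
        for i in range(len(lod) - 1):
            seg = ids[lod[i]:lod[i + 1]].flatten()
            out[i] = table[seg].sum(axis=0)
        return out

The unittest added later in the series (PATCH 0007) exercises a generalized variant of this computation in which each LoD position carries several ids.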
From 6db8c3bfeafca8b1522de32f56c450db473bd3e9 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Mon, 5 Nov 2018 15:31:19 +0800
Subject: [PATCH 0002/2367] Implement the infer shape and infer var type
---
 .../operators/fused_embedding_seq_pool_op.cc  | 116 +++++++++++-------
 .../operators/fused_embedding_seq_pool_op.h   |   2 -
 2 files changed, 70 insertions(+), 48 deletions(-)

diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc
index ea960782919..5ebaf865fcd 100644
--- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc
+++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc
@@ -18,34 +18,53 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class LookupTableOp : public framework::OperatorWithKernel {
+class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("W"),
-                   "Input(W) of LookupTableOp should not be null.");
+                   "Input W of FusedEmbeddingSeqPoolOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Ids"),
-                   "Input(Ids) of LookupTableOp should not be null.");
+                   "Input Ids of FusedEmbeddingSeqPoolOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of LookupTableOp should not be null.");
+                   "Output of FusedEmbeddingSeqPoolOp should not be null.");
 
     auto table_dims = ctx->GetInputDim("W");
     auto ids_dims = ctx->GetInputDim("Ids");
-    int ids_rank = ids_dims.size();
+    const std::string& combiner = ctx->Attrs().Get<std::string>("combiner");
 
     PADDLE_ENFORCE_EQ(table_dims.size(), 2);
-    PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
+    PADDLE_ENFORCE_GE(ids_dims.size(), 1u,
+                      "The dim size of the 'Ids' tensor must be at least 1.");
+    PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1,
                       "The last dimension of the 'Ids' tensor must be 1.");
+    // we only support sum now
+    PADDLE_ENFORCE_EQ(combiner, "sum");
 
-    auto output_dims =
-        framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
-    output_dims.push_back(table_dims[1]);
-    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
+    if (ctx->IsRuntime()) {
+      Variable* ids_var = boost::get<Variable*>(ctx->GetInputVarPtrs("Ids")[0]);
+      const auto& ids_lod = ids_var->Get<LoDTensor>().lod();
 
-    if (ctx->GetOutputsVarType("Out")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      ctx->ShareLoD("Ids", /*->*/ "Out");
+      // in run time, the LoD level of ids must be 1
+      PADDLE_ENFORCE_EQ(ids_lod.size(), 1u,
+                        "The LoD level of Input(Ids) must be 1");
+      PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
+
+      size_t batch_size = ids_lod[0].size() - 1;
+
+      // in run time, the shape from Ids -> output
+      // should be [seq_length, 1] -> [batch_size, embedding_size]
+      ctx->SetOutputDim("Out",
+                        framework::make_ddim({batch_size, table_dims[1]}));
+    } else {
+      // in compile time, the lod level of ids must be 1
+      VarDesc* ids_desc = boost::get<VarDesc*>(ctx->GetInputVarPtrs("Ids")[0]);
+      PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1);
+
+      // in compile time, the shape from Ids -> output
+      // should be [-1, 1] -> [-1, embedding_size]
+      ctx->SetOutputDim("Out", framework::make_ddim({-1, table_dims[1]}));
     }
   }
 
@@ -57,7 +76,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
   }
 };
 
-class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
+class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("W",
@@ -68,42 +87,44 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
           "contains the ids to be looked up in W. "
           "The last dimension size must be 1.");
     AddOutput("Out", "The lookup results, which have the same type as W.");
+    AddAttr<std::string>("combiner",
+                         "(string, default sum) "
+                         "A string specifying the reduction op. Currently only "
+                         "sum is supported: it computes the weighted sum of the "
+                         "embedding results for each row.")
+        .SetDefault("sum");
     AddAttr<bool>("is_sparse",
                   "(boolean, default false) "
                   "Sparse update.")
         .SetDefault(false);
-    AddAttr<bool>("is_distributed",
-                  "(boolean, default false) distributed lookup table.")
-        .SetDefault(false);
-    AddAttr<int64_t>("padding_idx",
-                     "(int64, default -1) "
-                     "If the value is -1, it has no effect on the lookup. "
-                     "Otherwise the given value indicates padding the output "
-                     "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(kNoPadding);
     AddComment(R"DOC(
-Lookup Table Operator.
+FusedEmbeddingSeqPool Operator.
+
+Computes embeddings for the given ids and weights.
 
 This operator performs lookups on the parameter W,
-then concatenates the results into a dense tensor.
+then computes the weighted sum of the lookup results for each row
+and concatenates them into a dense tensor.
 
-The input Ids can carry the LoD (Level of Details) information,
-or not. And the output only shares the LoD information with input Ids.
+The input Ids should carry the LoD (Level of Details) information,
+and the LoD of the output differs from that of the input Ids.
 
 )DOC");
   }
 };
 
-class LookupTableOpGradDescMaker
+class FusedEmbeddingSeqPoolOpGradDescMaker
     : public framework::DefaultGradOpDescMaker<true> {
   using ::paddle::framework::DefaultGradOpDescMaker<
      true>::DefaultGradOpDescMaker;
 
  protected:
-  virtual std::string GradOpType() const { return "lookup_table_grad"; }
+  virtual std::string GradOpType() const {
+    return "fused_embedding_seq_pool_grad";
+  }
 };
 
-class LookupTableOpGrad : public framework::OperatorWithKernel {
+class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -120,7 +141,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
   }
 };
 
-class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
+class FusedEmbeddingSeqPoolOpGradVarTypeInference
+    : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc& op_desc,
                   framework::BlockDesc* block) const override {
@@ -128,13 +150,13 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
     auto attr = op_desc.GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
-      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
-              << " is set to SelectedRows";
+      VLOG(3) << "fused_embedding_seq_pool_grad op "
+              << framework::GradVarName("W") << " is set to SelectedRows";
       block->Var(out_var_name)
           ->SetType(framework::proto::VarType::SELECTED_ROWS);
     } else {
-      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
-              << " is set to LoDTensor";
+      VLOG(3) << "fused_embedding_seq_pool_grad op "
+              << framework::GradVarName("W") << " is set to LoDTensor";
       block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
     }
     block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
@@ -145,14 +167,16 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(lookup_table, ops::LookupTableOp,
-                  ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker);
-REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad,
-                  ops::LookupTableOpGradVarTypeInference);
-
-// REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>,
-//                        ops::LookupTableKernel<double>);
-// REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>,
-//                        ops::LookupTableGradKernel<double>);
-REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>);
-REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>);
+REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp,
+                  ops::FusedEmbeddingSeqPoolOpGradDescMaker,
+                  ops::FusedEmbeddingSeqPoolOpMaker);
+REGISTER_OPERATOR(fused_embedding_seq_pool_grad,
+                  ops::FusedEmbeddingSeqPoolOpGrad,
+                  ops::FusedEmbeddingSeqPoolOpGradVarTypeInference);
+
+REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool,
+                       ops::FusedEmbeddingSeqPoolKernel<float>,
+                       ops::FusedEmbeddingSeqPoolKernel<double>);
+REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad,
+                       ops::FusedEmbeddingSeqPoolGradKernel<float>,
+                       ops::FusedEmbeddingSeqPoolGradKernel<double>);
diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h
index 6dcf4f44a71..24cffc60a80 100644
--- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h
@@ -31,8 +31,6 @@ using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
 using DDim = framework::DDim;
 
-constexpr int64_t kNoPadding = -1;
-
 template <typename T>
 class LookupTableKernel : public framework::OpKernel<T> {
  public:
-- 
GitLab

From 17c8014fcd2071920a605f12951d4f6ae1ddcab9 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Tue, 6 Nov 2018 17:42:43 +0800
Subject: [PATCH 0003/2367] Complete implementation

test=develop
---
 .../operators/fused_embedding_seq_pool_op.cc  |   6 +
 .../operators/fused_embedding_seq_pool_op.h   | 182 ++++++------------
 2 files changed, 63 insertions(+), 125 deletions(-)

diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc
index 5ebaf865fcd..e8627690517 100644
--- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc
+++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc
@@ -93,6 +93,12 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
                          "sum is supported: it computes the weighted sum of the "
                          "embedding results for each row.")
         .SetDefault("sum");
+    // NOTE(minqiyang): grad_inplace is a temporary attribute;
+    // please do NOT set this attribute in python layer.
+ AddAttr("grad_inplace", + "(boolean, default false) " + "If the grad op reuse the input's variable.") + .SetDefault(false); AddAttr("is_sparse", "(boolean, default false) " "Sparse update.") diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 24cffc60a80..5af234b9375 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -31,62 +31,54 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; +template +struct EmbeddingVSumFunctor { + void operator()(const DeviceContext &context, LoDTensor *table_t, + LoDTensor *ids_t, LoDTensor *output_t) { + auto *table = table_t->data(); + int64_t row_number = table->dims()[0]; + int64_t row_width = table->dims()[1]; + int64_t *ids = const_cast(ids_t->data()); + auto ids_lod = ids_t->LoD()[0]; + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { + size_t begin = ids_lod[i]; + + PADDLE_ENFORCE_LT(ids[begin], row_number); + PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); + blas.VCOPY(row_width, table + ids[begin] * row_width, + output + i * row_width); + + for (int64_t r = ids_lod[i] + 1; r < ids_lod[i + 1]; ++r) { + PADDLE_ENFORCE_LT(ids[r], row_number); + PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); + blas.AXPY(row_width, 1., table + ids[r] * row_width, + output + i * row_width); + } + } + } +}; + template -class LookupTableKernel : public framework::OpKernel { +class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *ids_t = context.Input("Ids"); // int tensor - auto *output_t = context.Output("Out"); // float tensor - auto *table_var = context.InputVar("W"); - - int64_t padding_idx = context.Attr("padding_idx"); - int64_t *ids = const_cast(ids_t->data()); - int64_t ids_numel = ids_t->numel(); - - if (table_var->IsType()) { - auto *table_t = context.Input("W"); - int64_t row_number = table_t->dims()[0]; - int64_t row_width = table_t->dims()[1]; - - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_LT(ids[i], row_number); - PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); - memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(T)); - } - } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); - int64_t row_width = table_t.value().dims()[1]; - const auto *table = table_t.value().data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - auto blas = math::GetBlas(context); - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_GE(ids[i], 0); - auto id_index = table_t.Index(ids[i]); - PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); - // memcpy(output + i * row_width, table + id_index * row_width, - // row_width * sizeof(T)); - blas.VCOPY(row_width, table + id_index * row_width, - output + i * row_width); - } - } + LoDTensor *ids_t = context.Input("Ids"); // int tensor + LoDTensor *output_t = context.Output("Out"); 
// float tensor + LoDTensor *table_var = context.Input("W"); + const std::string &combiner_type = context.Attr("combiner"); + + if (combiner_type == "sum") { + EmbeddingVSumFunctor functor; + functor(context.template device_context(), ids_t, output_t, table_var); } } }; template -class LookupTableGradKernel : public framework::OpKernel { +class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *table_var = context.InputVar("W"); @@ -106,97 +98,37 @@ class LookupTableGradKernel : public framework::OpKernel { // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - // auto start = std::chrono::system_clock::now(); auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); auto *d_table = context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); - // auto end = std::chrono::system_clock::now(); - // std::chrono::duration diff = end - start; + auto lod = ids->lod()[0]; + int64_t row_width = table_dim[1]; - // auto copy_start = std::chrono::system_clock::now(); - std::vector new_rows; + framework::Vector new_rows; new_rows.resize(ids_num); std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); - // for (int64_t i = 0; i < ids_num; i++) { - // new_rows.push_back(ids_data[i]); - // } - // auto copy_end = std::chrono::system_clock::now(); - // std::chrono::duration copy_diff = copy_end - copy_start; - // diff += copy_diff; - // LOG(ERROR) << "run emb_grad copy end, cost: " << copy_diff.count() << " - // " << ids_num; - - // copy_start = std::chrono::system_clock::now(); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_num, table_dim[1]}); - d_table_value->ShareDataWith(*d_output); - // d_table_value->mutable_data(context.GetPlace()); - - // // copy_end = std::chrono::system_clock::now(); - // // copy_diff = copy_end - copy_start; - // // diff += copy_diff; - // // LOG(ERROR) << "run emb_grad resize table end, cost: " << - // // copy_diff.count() << " " << ids_num; - - // // copy_start = std::chrono::system_clock::now(); - // d_table->set_height(table_dim[0]); - - // auto *d_output_data = d_output->data(); - // auto *d_table_data = d_table_value->data(); - - // // copy_end = std::chrono::system_clock::now(); - // // copy_diff = copy_end - copy_start; - // // diff += copy_diff; - // // LOG(ERROR) << "run emb_grad set height end, cost: " << - // // copy_diff.count() << " " << ids_num; - - // auto d_output_dims = d_output->dims(); - // PADDLE_ENFORCE_EQ( - // d_table_value->dims(), - // framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); - // // copy_start = std::chrono::system_clock::now(); - // auto blas = math::GetBlas(context); - // blas.VCOPY(d_output->numel(), d_output_data, d_table_data); - // cblas_scopy(d_output->numel(), d_output_data, 1, d_table_data, 1); - // // for (int i = 0; i != d_output->numel(), ++i) { - // // *(d_table_data++) = *(d_output_data++); - // // } - // // memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); - // // copy_end = std::chrono::system_clock::now(); - // // copy_diff = copy_end - copy_start; - // // diff += copy_diff; - // // LOG(ERROR) << "run emb_grad core end, cost: " << copy_diff.count() - // << " - // // " << ids_num << " " << d_output->numel(); - - // // LOG(ERROR) << "run 
emb_grad end, cost: " << diff.count(); - } else { - auto *ids = context.Input("Ids"); - auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); - - auto *ids_data = ids->data(); - - int N = table_dim[0]; - int D = table_dim[1]; - - auto *d_output_data = d_output->data(); - auto *d_table_data = d_table->mutable_data(context.GetPlace()); - - memset(d_table_data, 0, d_table->numel() * sizeof(T)); - - for (int64_t i = 0; i < ids->numel(); ++i) { - PADDLE_ENFORCE_LT(ids_data[i], N); - PADDLE_ENFORCE_GE(ids_data[i], 0); - for (int j = 0; j < D; ++j) { - d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + d_table_value->Resize({ids_num, row_width}); + T *d_table_data = d_table_value->mutable_data(context.GetPlace()); + const T *d_output_data = d_output->data(); + + auto blas = math::GetBlas(context); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t in_offset = lod[i] * row_width; + const T *out_pos = d_output_data + i * row_width; + T *in_pos = d_table_data + in_offset; + for (int r = 0; r != h; ++r) { + blas.VCOPY(row_width, out_pos, in_pos + r * row_width); } } + } else { + LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now"; } } }; -- GitLab From 8a412c0d3308a0c9b90e8e7295ac117b6735b533 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 20:04:05 +0800 Subject: [PATCH 0004/2367] Complete impl --- .../operators/fused_embedding_seq_pool_op.cc | 18 ++++--- .../operators/fused_embedding_seq_pool_op.h | 49 +++++++++++-------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc index e8627690517..6b6b898d4c7 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -42,8 +42,14 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { // we only support sum now PADDLE_ENFORCE_EQ(combiner, "sum"); + int64_t last_dim = table_dims[1]; + for (int i = 1; i != ids_dims.size(); ++i) { + last_dim *= ids_dims[i]; + } + if (ctx->IsRuntime()) { - Variable* ids_var = boost::get(ctx->GetInputVarPtrs("Ids")[0]); + framework::Variable* ids_var = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); const auto& ids_lod = ids_var->Get().lod(); // in run time, the LoD of ids must be 1 @@ -51,20 +57,20 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { "The LoD level of Input(Ids) must be 1"); PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); - size_t batch_size = ids_lod[0].size() - 1; + int64_t batch_size = ids_lod[0].size() - 1; // in run time, the shape from Ids -> output // should be [seq_length, 1] -> [batch_size, embedding_size] - ctx->SetOutputDim("Out", - framework::make_ddim({batch_size, table_dims[1]})); + ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim})); } else { // in compile time, the lod level of ids must be 1 - VarDesc* ids_desc = boost::get(ctx->GetInputVarPtrs("Ids")[0]); + framework::VarDesc* ids_desc = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); // in compile time, the shape from Ids -> output // should be [-1, 1] -> [-1, embedding_size] - ctx->SetOutputDim("Out", framework::make_ddim({-1, table_dims[1]})); + ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim})); } } diff --git 
a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 5af234b9375..7385c8da334 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -31,31 +31,38 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; -template +template struct EmbeddingVSumFunctor { - void operator()(const DeviceContext &context, LoDTensor *table_t, - LoDTensor *ids_t, LoDTensor *output_t) { + void operator()(const framework::ExecutionContext &context, + const LoDTensor *table_t, const LoDTensor *ids_t, + LoDTensor *output_t) { auto *table = table_t->data(); - int64_t row_number = table->dims()[0]; - int64_t row_width = table->dims()[1]; + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + int64_t last_dim = output_t->dims()[1]; int64_t *ids = const_cast(ids_t->data()); - auto ids_lod = ids_t->LoD()[0]; + auto ids_lod = ids_t->lod()[0]; + int64_t ids_count = ids_t->numel() / ids_lod.back(); + auto *output = output_t->mutable_data(context.GetPlace()); - auto blas = math::GetBlas(context); + auto blas = math::GetBlas(context); for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { - size_t begin = ids_lod[i]; + for (int64_t j = 0; j != ids_count; ++j) { + size_t begin = ids_lod[i] * ids_count; - PADDLE_ENFORCE_LT(ids[begin], row_number); - PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); - blas.VCOPY(row_width, table + ids[begin] * row_width, - output + i * row_width); + PADDLE_ENFORCE_LT(ids[begin], row_number); + PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); + blas.VCOPY(row_width, table + ids[begin] * row_width, + output + i * last_dim + j * row_width); + } - for (int64_t r = ids_lod[i] + 1; r < ids_lod[i + 1]; ++r) { + for (int64_t r = (ids_lod[i] + 1) * ids_count; + r < ids_lod[i + 1] * ids_count; ++r) { PADDLE_ENFORCE_LT(ids[r], row_number); PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); blas.AXPY(row_width, 1., table + ids[r] * row_width, - output + i * row_width); + output + i * row_width + (r % ids_count) * row_width); } } } @@ -65,14 +72,14 @@ template class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - LoDTensor *ids_t = context.Input("Ids"); // int tensor - LoDTensor *output_t = context.Output("Out"); // float tensor - LoDTensor *table_var = context.Input("W"); + const LoDTensor *ids_t = context.Input("Ids"); // int tensor + LoDTensor *output_t = context.Output("Out"); // float tensor + const LoDTensor *table_var = context.Input("W"); const std::string &combiner_type = context.Attr("combiner"); if (combiner_type == "sum") { EmbeddingVSumFunctor functor; - functor(context.template device_context(), ids_t, output_t, table_var); + functor(context, table_var, ids_t, output_t); } } }; @@ -105,7 +112,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); auto lod = ids->lod()[0]; - int64_t row_width = table_dim[1]; + int64_t row_width = d_output->dims()[1]; framework::Vector new_rows; new_rows.resize(ids_num); @@ -113,11 +120,11 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_num, row_width}); + d_table_value->Resize({ids_num, table_dim[1]}); T *d_table_data = 
d_table_value->mutable_data(context.GetPlace()); const T *d_output_data = d_output->data(); - auto blas = math::GetBlas(context); + auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); int64_t in_offset = lod[i] * row_width; -- GitLab From 3d784c27011a127de3c5730d8ee121102fadba6f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 20:05:18 +0800 Subject: [PATCH 0005/2367] Polish code --- paddle/fluid/operators/fused_embedding_seq_pool_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc index 6b6b898d4c7..966bdb4df5b 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -35,7 +35,7 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { const std::string& combiner = ctx->Attrs().Get("combiner"); PADDLE_ENFORCE_EQ(table_dims.size(), 2); - PADDLE_ENFORCE_GE(ids_dims.size(), 1u, + PADDLE_ENFORCE_GE(ids_dims.size(), 1, "The dim size of the 'Ids' tensor must greater than 1."); PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1, "The last dimension of the 'Ids' tensor must be 1."); -- GitLab From 0f91beefd1f70b1596e657ab4cbf77c3d2c9a574 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 23:23:09 +0800 Subject: [PATCH 0006/2367] Fix bug test=develop --- paddle/fluid/operators/fused_embedding_seq_pool_op.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 7385c8da334..f37c6883953 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -53,7 +53,7 @@ struct EmbeddingVSumFunctor { PADDLE_ENFORCE_LT(ids[begin], row_number); PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); - blas.VCOPY(row_width, table + ids[begin] * row_width, + blas.VCOPY(row_width, table + ids[begin + j] * row_width, output + i * last_dim + j * row_width); } @@ -62,7 +62,7 @@ struct EmbeddingVSumFunctor { PADDLE_ENFORCE_LT(ids[r], row_number); PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); blas.AXPY(row_width, 1., table + ids[r] * row_width, - output + i * row_width + (r % ids_count) * row_width); + output + i * last_dim + (r % ids_count) * row_width); } } } -- GitLab From 849fbc7327935cfbe43f85744e71db515efa760d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 23:23:33 +0800 Subject: [PATCH 0007/2367] Add unittest test=develop --- .../unittests/test_fused_emb_seq_pool_op.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py new file mode 100644 index 00000000000..584e309befc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +import paddle.compat as cpt + + +class TestFusedEmbeddingSeqPoolOp(OpTest): + def setUp(self): + self.op_type = "fused_embedding_seq_pool" + self.emb_size = 2 + table = np.random.random((17, self.emb_size)).astype("float32") + ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]], + [[16], [1]]]).astype("int64") + merged_ids = np.array([4, 2, 16]).astype("int64") + ids_expand = np.expand_dims(ids, axis=1) + self.lod = [[3, 1]] + self.attrs = {'is_sparse': True} + self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)} + self.outputs = { + 'Out': np.reshape( + np.array([ + table[[4, 3]] + table[[4, 3]] + table[[2, 1]], + table[[16, 1]] + ]), [len(self.lod[0]), 2 * self.emb_size]) + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() -- GitLab From b0afdc4e7d57b2122da6484421fde65a10e4c783 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 13 Nov 2018 15:59:34 +0800 Subject: [PATCH 0008/2367] Add CMake deps --- paddle/fluid/operators/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 919ad96f7ad..5e421803c3f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -269,6 +269,7 @@ else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() op_library(hash_op DEPS xxhash) +op_library(fused_hash_embedding_seq_pool DEPS xxhash) op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) -- GitLab From 32ebee9f077956046a310d6fe3ad194650f579fa Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 13 Nov 2018 16:05:06 +0800 Subject: [PATCH 0009/2367] Polish code --- paddle/fluid/operators/fused_embedding_seq_pool_op.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index f37c6883953..38dfae8ad6d 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -48,9 +48,8 @@ struct EmbeddingVSumFunctor { auto blas = math::GetBlas(context); for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { + size_t begin = ids_lod[i] * ids_count; for (int64_t j = 0; j != ids_count; ++j) { - size_t begin = ids_lod[i] * ids_count; - PADDLE_ENFORCE_LT(ids[begin], row_number); PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); blas.VCOPY(row_width, table + ids[begin + j] * row_width, @@ -114,10 +113,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto lod = ids->lod()[0]; int64_t row_width = d_output->dims()[1]; - framework::Vector new_rows; - new_rows.resize(ids_num); - std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); - d_table->set_rows(new_rows); + framework::Vector *new_rows = 
d_table->mutable_rows(); + new_rows->resize(ids_num); + std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t)); auto *d_table_value = d_table->mutable_value(); d_table_value->Resize({ids_num, table_dim[1]}); -- GitLab From 853878cbf218728608a783260ae74c408ef4b8a2 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Fri, 16 Nov 2018 02:05:56 -0800 Subject: [PATCH 0010/2367] fix the wrong format test=develop --- python/paddle/fluid/average.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py index 42cd3b36420..40a734af311 100644 --- a/python/paddle/fluid/average.py +++ b/python/paddle/fluid/average.py @@ -48,6 +48,7 @@ class WeightedAverage(object): Examples: .. code-block:: python + avg = fluid.average.WeightedAverage() avg.add(value=2.0, weight=1) avg.add(value=4.0, weight=2) -- GitLab From fe915901cdb7c0d55fe13890e3afafcce4cddbf9 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Wed, 28 Nov 2018 20:54:46 +0800 Subject: [PATCH 0011/2367] update Opdesc's HasAttr test=develop --- paddle/fluid/framework/op_desc.cc | 11 +++++++++++ paddle/fluid/framework/op_desc.h | 4 +--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index e8ecd905029..a31c5336a1a 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -237,6 +237,17 @@ void OpDesc::SetOutput(const std::string ¶m_name, this->outputs_[param_name] = args; } +bool OpDesc::HasAttr(const std::string &name) const { + const proto::OpProto &proto = OpInfoMap::Instance().Get(desc_.type()).Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; + } + } + return false; +} + proto::AttrType OpDesc::GetAttrType(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 30c8a26c3d2..3da7cdcef39 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -61,9 +61,7 @@ class OpDesc { void SetOutput(const std::string ¶m_name, const std::vector &args); - bool HasAttr(const std::string &name) const { - return attrs_.find(name) != attrs_.end(); - } + bool HasAttr(const std::string &name) const; proto::AttrType GetAttrType(const std::string &name) const; -- GitLab From 5db273d8740791622f123fe8c4a6bc3eef4f934d Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 29 Nov 2018 12:05:25 +0800 Subject: [PATCH 0012/2367] enhance HasAttr to fix ci test=develop --- paddle/fluid/framework/op_desc.cc | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index a31c5336a1a..ce7ba967303 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -238,11 +238,21 @@ void OpDesc::SetOutput(const std::string ¶m_name, } bool OpDesc::HasAttr(const std::string &name) const { - const proto::OpProto &proto = OpInfoMap::Instance().Get(desc_.type()).Proto(); - for (int i = 0; i != proto.attrs_size(); ++i) { - const proto::OpProto::Attr &attr = proto.attrs(i); - if (attr.name() == name) { - return true; + if (attrs_.find(name) != attrs_.end()) { + return true; + } else { + auto &op_info = OpInfoMap::Instance(); + if (op_info.Has(desc_.type())) { + auto op_info_ptr = op_info.Get(desc_.type()); + 
if (op_info_ptr.HasOpProtoAndChecker()) { + const proto::OpProto &proto = op_info_ptr.Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; + } + } + } } } return false; -- GitLab From 096673f67527b0fed1aab1843041b9d929fd0fb5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 29 Nov 2018 13:20:29 +0000 Subject: [PATCH 0013/2367] refactor eager deletion test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 12 +- .../details/computation_op_handle.cc | 6 +- .../framework/details/computation_op_handle.h | 6 +- .../details/eager_deletion_op_handle.cc | 117 ++++++++++ .../details/eager_deletion_op_handle.h | 64 ++++++ .../framework/details/eager_deletion_pass.cc | 96 ++++++++ .../framework/details/eager_deletion_pass.h | 32 +++ .../details/multi_devices_graph_pass.cc | 6 +- .../details/reference_count_op_handle.h | 138 ------------ .../framework/details/reference_count_pass.cc | 213 +++++------------- .../framework/details/reference_count_pass.h | 5 - .../details/reference_count_pass_helper.h | 49 ++++ .../scope_buffered_ssa_graph_executor.cc | 30 +-- .../scope_buffered_ssa_graph_executor.h | 4 + paddle/fluid/framework/garbage_collector.h | 12 +- paddle/fluid/framework/ir/graph.h | 11 +- paddle/fluid/framework/ir/pass.h | 11 +- paddle/fluid/framework/parallel_executor.cc | 106 ++++++--- paddle/fluid/framework/parallel_executor.h | 24 +- paddle/fluid/platform/CMakeLists.txt | 9 +- .../fluid/platform/stream_callback_manager.cc | 70 ++++++ .../fluid/platform/stream_callback_manager.h | 51 +---- 22 files changed, 631 insertions(+), 441 deletions(-) create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.h create mode 100644 paddle/fluid/framework/details/eager_deletion_pass.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_pass.h delete mode 100644 paddle/fluid/framework/details/reference_count_op_handle.h create mode 100644 paddle/fluid/framework/details/reference_count_pass_helper.h create mode 100644 paddle/fluid/platform/stream_callback_manager.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 93288936fea..8cf97d667d4 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -33,10 +33,9 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -if (WITH_GPU) - cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle - all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) -endif() +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) @@ -44,10 +43,7 @@ 
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass) -if (WITH_GPU) - list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) -endif() +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c600..7beb8c8de9f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -20,11 +20,13 @@ namespace paddle { namespace framework { namespace details { ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, - platform::Place place) + platform::Place place, + size_t scope_idx) : OpHandleBase(node), op_(framework::OpRegistry::CreateOp(*node->Op())), scope_(scope), - place_(place) {} + place_(place), + scope_idx_(scope_idx) {} void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 662a91d6b4d..601ae4f8c6d 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -28,7 +28,8 @@ namespace framework { namespace details { struct ComputationOpHandle : public OpHandleBase { public: - ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place); + ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, + size_t scope_idx); std::string Name() const override; @@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase { void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; } + size_t GetScopeIdx() const { return scope_idx_; } + protected: void RunImpl() override; @@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase { std::unique_ptr op_; Scope *scope_; platform::Place place_; + size_t scope_idx_; bool is_lock_and_record_event_free_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc new file mode 100644 index 00000000000..cd262033760 --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { +namespace details { + +EagerDeletionOpHandle::EagerDeletionOpHandle( + ir::Node *node, const Scope *scope, const platform::Place &place, + const std::vector &var_names, GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + dev_ctx_ = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + if (dynamic_cast *>(gc_)) { + platform::SetDeviceId(boost::get(place).device); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + } +#endif + + for (auto &name : var_names) AddVar(name); +} + +EagerDeletionOpHandle::~EagerDeletionOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + platform::SetDeviceId(gpu_place.device); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif +} + +std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } + +void EagerDeletionOpHandle::AddVar(const std::string &name) { + var_names_.insert(name); +} + +void EagerDeletionOpHandle::RunImpl() { + auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + std::vector tensors; + for (auto &name : var_names_) { + auto it = ref_cnts_->find(name); + if (it == ref_cnts_->end()) { + continue; + } + + auto *var = exec_scope->FindVar(name); + if (var == nullptr) { + continue; + } + + if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + tensors.emplace_back(var->GetMutable()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + tensors.emplace_back(var->GetMutable()->mutable_value()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + tensors.emplace_back(&t); + } + } + } + } + + if (!tensors.empty()) { + ClearTensors(tensors); + } +} + +void EagerDeletionOpHandle::ClearTensors(const std::vector &tensors) { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = + static_cast *>(gc_)->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { +#endif + gc_->Add(tensors); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h new file mode 100644 index 00000000000..8254f21bdfc --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace details { + +class EagerDeletionPass; + +class EagerDeletionOpHandle : public OpHandleBase { + public: + EagerDeletionOpHandle(ir::Node *node, const Scope *scope, + const platform::Place &place, + const std::vector &var_names, + GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts); + + ~EagerDeletionOpHandle(); + + std::string Name() const override; + + protected: + void RunImpl() override; + + private: + void ClearTensors(const std::vector &tensors); + + void AddVar(const std::string &name); + + const Scope *scope_; + std::unordered_set var_names_; + GarbageCollector *gc_; // not own + AtomicReferenceCountMap *ref_cnts_; // not own +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext *dev_ctx_{nullptr}; + cudaEvent_t event_{nullptr}; +#endif + + friend class EagerDeletionPass; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc new file mode 100644 index 00000000000..f877c2881cd --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
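EagerDeletionOpHandle::RunImpl above collects a variable's tensors only when its atomic reference count reaches zero; the fetch_sub(1) == 1 test works because fetch_sub returns the value held before the decrement. A minimal sketch of the idiom, assuming only standard <atomic> semantics (the Release helper is hypothetical, not Paddle code):

    #include <atomic>
    #include <cstddef>
    #include <cstdio>

    // Returns true iff this call released the last reference, i.e. the
    // caller should now hand the variable's buffers to the collector.
    bool Release(std::atomic<std::size_t>* ref_cnt) {
      // fetch_sub returns the value held *before* the decrement, so seeing
      // 1 here means the count has just dropped to zero.
      return ref_cnt->fetch_sub(1) == 1;
    }

    int main() {
      std::atomic<std::size_t> cnt{3};  // e.g. three ops still read the var
      for (int i = 1; i <= 3; ++i) {
        if (Release(&cnt)) std::printf("freed after release #%d\n", i);
      }
      return 0;
    }

Because the decrement is atomic, exactly one deletion op observes the transition to zero even when several run concurrently.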
+ +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, + ir::Graph *graph) { + auto it = std::find_if( + in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != in->Outputs().end()) { + out->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + in->AddOutput(dep_var); + out->AddInput(dep_var); + } + + // Add leaf node to eager_deletion_node + if (out->Outputs().empty()) { + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + out->AddOutput(dummy_leaf); + } +} + +std::unique_ptr EagerDeletionPass::ApplyImpl( + std::unique_ptr graph) const { + auto &vars = graph->Get(kGraphVars); + + auto &ref_cnts = + Get>(kCurReferenceCount); + auto &last_live_ops = Get>(kLastLiveOpsOfVars); + auto &gcs = Get(kGarbageCollector); + + ref_cnts = std::vector(vars.size()); + + std::unordered_map op_map; + for (auto &var_ops_map : last_live_ops) { + for (auto &var_ops_pair : var_ops_map) { + const std::string &var_name = var_ops_pair.first; + for (ComputationOpHandle *op : var_ops_pair.second) { + auto it = op_map.find(op); + if (it != op_map.end()) { + it->second->AddVar(var_name); + } else { + auto *eager_deletion_node = graph->CreateEmptyNode( + "eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), {var_name}, + gcs[op->GetScopeIdx()].get(), &(ref_cnts[op->GetScopeIdx()])); + AddDependencyBetween(op, eager_deletion_op, graph.get()); + op_map[op] = eager_deletion_op; + } + } + } + } + VLOG(10) << "Create " << op_map.size() << " EagerDeletionOpHandle(s)"; + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(eager_deletion_pass, + paddle::framework::details::EagerDeletionPass) + .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/details/eager_deletion_pass.h new file mode 100644 index 00000000000..d7a7a9709d9 --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
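EagerDeletionPass::ApplyImpl above keeps an op_map so each ComputationOpHandle owns at most one EagerDeletionOpHandle; when several variables share the same last user, they are folded into the existing handle through AddVar. A minimal sketch of that get-or-create pattern, with hypothetical stand-in types:

    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct ComputeOp {};  // stand-in for ComputationOpHandle

    struct DeleteOp {  // stand-in for EagerDeletionOpHandle
      std::vector<std::string> vars;
      void AddVar(const std::string& name) { vars.push_back(name); }
    };

    using DeleteOpMap =
        std::unordered_map<ComputeOp*, std::unique_ptr<DeleteOp>>;

    // One deletion op per compute op: fold further variables whose last
    // user is the same op into the existing handle.
    DeleteOp* GetOrCreate(ComputeOp* op, const std::string& var,
                          DeleteOpMap* m) {
      auto it = m->find(op);
      if (it == m->end()) {
        it = m->emplace(op, std::unique_ptr<DeleteOp>(new DeleteOp())).first;
      }
      it->second->AddVar(var);
      return it->second.get();
    }

One handle per op keeps the extra graph nodes and control-dependency edges proportional to the number of ops rather than the number of variables.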
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class EagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index a36ad259265..97830386e42 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -562,7 +562,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), - local_scopes_[dev_id], places_[dev_id])); + local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } @@ -685,8 +685,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get(kGraphOps).emplace_back( - new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); + result->Get(kGraphOps).emplace_back(new ComputationOpHandle( + result->CreateOpNode(node->Op()), s, p, scope_idx)); CreateOpHandleIOs(result, node, scope_idx); } } diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h deleted file mode 100644 index cc4ccfbdfc7..00000000000 --- a/paddle/fluid/framework/details/reference_count_op_handle.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace framework { -namespace details { - -using ReferenceCountMap = std::unordered_map; -using AtomicReferenceCountMap = - std::unordered_map>; -using DeviceReferenceCountMap = - std::unordered_map>; -using AtomicDeviceReferenceCountMap = - std::unordered_map>; -using DeviceGarbageCollectorMap = - std::unordered_map>>; - -class ReferenceCountOpHandle : public OpHandleBase { - public: - ReferenceCountOpHandle(ir::Node *node, const Scope *scope, - const platform::CUDAPlace &place, - const std::vector &var_names, - GarbageCollector *gc, - AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { - dev_ctx_ = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - if (IsStreamGarabageCollector()) { - platform::SetDeviceId(place.device); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - for (auto &name : var_names) AddVar(name); - } - - ~ReferenceCountOpHandle() { - if (IsStreamGarabageCollector()) { - auto gpu_place = boost::get(dev_ctx_->GetPlace()); - platform::SetDeviceId(gpu_place.device); - PADDLE_ENFORCE(cudaEventDestroy(event_)); - } - } - - std::string Name() const override { return "reference_count"; } - - void AddVar(const std::string &name) { - auto it = var_names_.find(name); - if (it != var_names_.end()) - ++(it->second); - else - var_names_[name] = 1; - } - - protected: - void RunImpl() override { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); - std::vector tensors; - for (auto &pair : var_names_) { - auto &name = pair.first; - auto it = ref_cnts_->find(name); - if (it == ref_cnts_->end()) continue; - - auto *var = exec_scope->FindVar(name); - if (var == nullptr) continue; - - if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back(var->GetMutable()); - } - } else if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back( - var->GetMutable()->mutable_value()); - } - } - } - - if (!tensors.empty()) { - ClearTensors(tensors); - } - } - - private: - void ClearTensors(const std::vector &tensors) { - auto *gc = dynamic_cast *>(gc_); - if (gc != nullptr) { - auto compute_stream = dev_ctx_->stream(); - auto callback_stream = gc->stream(); - auto callback_func = [=]() { - PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); - }; - gc_->Add(tensors, callback_func); - } else { - gc_->Add(tensors); - } - } - - bool IsStreamGarabageCollector() const { - return dynamic_cast *>(gc_) != nullptr; - } - - const Scope *scope_; - platform::CUDADeviceContext *dev_ctx_; - std::unordered_map var_names_; - GarbageCollector *gc_; // not own - AtomicReferenceCountMap *ref_cnts_; // not own - cudaEvent_t event_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 08783fb5f8b..f094c7afa9f 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -17,184 +17,96 @@ #include 
#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/reference_count_pass.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { namespace details { -static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) { - std::queue queue; - queue.push(var_in); +static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( + OpHandleBase *op, size_t scope_idx) { + std::queue q; + std::unordered_set visited; + q.push(op); do { - auto *var = queue.front(); - queue.pop(); - for (auto *op : var->PendingOps()) { - auto *compute_op = dynamic_cast(op); - if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { - return compute_op; - } - for (auto *out_var : op->Outputs()) { - queue.push(out_var); + auto *op = q.front(); + q.pop(); + auto *compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) { + return compute_op; + } + for (auto *out_var : op->Outputs()) { + for (auto *pending_op : out_var->PendingOps()) { + if (visited.count(pending_op)) continue; + visited.insert(pending_op); } } - } while (!queue.empty()); + } while (!q.empty()); return nullptr; } -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); - } -} - std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &ref_cnts = Get(kGlobalReferenceCount); - auto &cur_ref_cnts = Get(kCurReferenceCount); - auto &gcs = Get(kGarbageCollector); - - // It is not easy to find the right reference counts of varaibles in graph - // Step 1: Find all variables in computation ops - // Step 2: Find all variables in non-computation ops which refers to variables - // in computation ops - std::unordered_set names; - std::unordered_map - compute_ref_cnt_map; - - auto get_ref_cnts_from_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - std::vector var_names_in_op; - auto *compute_op = dynamic_cast(op); - if (compute_op == nullptr || - !platform::is_gpu_place(compute_op->GetPlace())) - return var_names_in_op; - auto place = boost::get(compute_op->GetPlace()); - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - if (!platform::is_gpu_place(var_handle->place_) || - boost::get(var_handle->place_) != place) + auto &vars = graph->Get(kGraphVars); + auto &ref_cnts = Get>(kGlobalReferenceCount); + auto &last_live_ops_of_vars = + Get>(kLastLiveOpsOfVars); + + last_live_ops_of_vars = std::vector(vars.size()); + ref_cnts = std::vector(vars.size()); + + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + if (name_var_pair.second.empty()) continue; + auto *last_ver_var = name_var_pair.second.back(); + + VarDesc *var_desc = nullptr; + std::find_if(name_var_pair.second.rbegin(), 
name_var_pair.second.rend(), + [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + + if (var_desc == nullptr || var_desc->Persistable()) { continue; - - VarDesc *var_desc = var_handle->Node()->Var(); - auto var_name = var_handle->Node()->Name(); - - // This is weird but there is really some variables without var_desc - // in computation_op - if (var_desc == nullptr) { - var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name); - if (var_desc == nullptr) continue; } - if (var_desc->Persistable()) continue; auto var_type = var_desc->Proto()->type().type(); if (var_type != proto::VarType::LOD_TENSOR && - var_type != proto::VarType::SELECTED_ROWS) { + var_type != proto::VarType::SELECTED_ROWS && + var_type != proto::VarType::LOD_TENSOR_ARRAY) { continue; } - // compute op only runs in one device - if (ref_cnts[place.device]->count(var_name)) - ++(*ref_cnts[place.device])[var_name]; - else - (*ref_cnts[place.device])[var_name] = 1; - - names.insert(var_name); - var_names_in_op.push_back(var_name); - } - return var_names_in_op; - }; - - auto update_ref_cnts_from_non_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - if (dynamic_cast(op) != nullptr) return; - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - auto var_name = var_handle->Node()->Name(); - auto var_place = var_handle->place_; - if (!platform::is_gpu_place(var_place)) continue; - auto place = boost::get(var_place); - if (names.count(var_name) == 0) continue; - if (ref_cnts.count(place.device) && - ref_cnts[place.device]->count(var_name)) { - ++(*ref_cnts[place.device])[var_name]; - - auto *next_compute_op = FindNextComputationOpHandle(var_handle); - if (next_compute_op != nullptr) { - if (compute_ref_cnt_map.count(next_compute_op)) { - compute_ref_cnt_map[next_compute_op]->AddVar(var_name); - VLOG(5) << "Add reference count of " << var_name << " to Operator " - << next_compute_op->Name(); - } else { - // Create new reference_count_op_handle - ir::Node *ref_cnt_node = graph->CreateEmptyNode( - "reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, next_compute_op->GetScope(), place, {var_name}, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[next_compute_op] = ref_cnt_handle; - } + std::unordered_set last_live_op; + auto add_last_live_op = [&](OpHandleBase *op) { + auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); + if (compute_op) { + last_live_op.insert(compute_op); + } + }; + const std::string &var_name = name_var_pair.first; + auto &pending_ops = last_ver_var->PendingOps(); + if (pending_ops.empty()) { + auto *generated_op = last_ver_var->GeneratedOp(); + if (generated_op) { + ref_cnts[i].emplace(var_name, 1); + add_last_live_op(generated_op); + } + } else { + ref_cnts[i].emplace(var_name, pending_ops.size()); + for (auto *pending_op : pending_ops) { + add_last_live_op(pending_op); } } - } - }; - auto all_ops = ir::FilterByNodeWrapper(*graph); - for (auto &op : all_ops) { - auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs()); - auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs()); - if (in_var_names.empty() && out_var_names.empty()) continue; - in_var_names.insert(in_var_names.end(), out_var_names.begin(), - 
out_var_names.end()); - auto *compute_op = dynamic_cast(op); - auto place = boost::get(compute_op->GetPlace()); - ir::Node *ref_cnt_node = - graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, compute_op->GetScope(), place, in_var_names, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[compute_op] = ref_cnt_handle; - } - - for (auto &op : all_ops) { - update_ref_cnts_from_non_compute_op(op, op->Inputs()); - update_ref_cnts_from_non_compute_op(op, op->Outputs()); - } - - std::vector new_all_ops; - new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size()); - for (auto &op : all_ops) { - new_all_ops.emplace_back(std::move(op)); - auto it = compute_ref_cnt_map.find(new_all_ops.back()); - if (it != compute_ref_cnt_map.end()) { - // Add LeafNode to ReferenceCountOpHandle - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - it->second->AddOutput(dummy_leaf); - new_all_ops.emplace_back(std::move(it->second)); + last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); } } - - all_ops.swap(new_all_ops); return graph; } @@ -205,5 +117,4 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( REGISTER_PASS(reference_count_pass, paddle::framework::details::ReferenceCountPass) .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) - .RequirePassAttr(paddle::framework::details::kGarbageCollector); + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars); diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h index 7081280b060..bcbef027354 100644 --- a/paddle/fluid/framework/details/reference_count_pass.h +++ b/paddle/fluid/framework/details/reference_count_pass.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/framework/details/reference_count_op_handle.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -22,10 +21,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kGlobalReferenceCount[] = "reference_count"; -constexpr char kCurReferenceCount[] = "current_reference_count"; -constexpr char kGarbageCollector[] = "garbage_collector"; - class ReferenceCountPass : public ir::Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h new file mode 100644 index 00000000000..77846f7bdfc --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
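ReferenceCountPass::ApplyImpl above seeds each variable's counter with the number of ops that consume its last version, falling back to 1 when only the generating op touches it, so such a variable can be collected right after it is produced. A minimal sketch of the seeding rule, with hypothetical Var/Op stand-ins:

    #include <cstddef>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct Op {};

    struct Var {
      std::vector<Op*> pending_ops;  // readers of the last version
      Op* generated_op = nullptr;    // writer, if any
    };

    using RefCntMap = std::unordered_map<std::string, std::size_t>;

    // Seed the initial reference count for one variable.
    void SeedRefCnt(const std::string& name, const Var& v,
                    RefCntMap* ref_cnts) {
      if (!v.pending_ops.empty()) {
        (*ref_cnts)[name] = v.pending_ops.size();  // one release per reader
      } else if (v.generated_op != nullptr) {
        (*ref_cnts)[name] = 1;  // only the writer; free right after it runs
      }  // otherwise nothing touches the variable and it is skipped
    }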
+ +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +namespace details { + +class ComputationOpHandle; + +using ReferenceCountMap = std::unordered_map; + +using AtomicReferenceCountMap = + std::unordered_map>; + +using GarbageCollectorList = + std::vector>>; + +const char kGlobalReferenceCount[] = "reference_count"; +const char kCurReferenceCount[] = "current_reference_count"; +const char kGarbageCollector[] = "garbage_collector"; + +using LastLiveOpsOfVars = + std::unordered_map>; +const char kLastLiveOpsOfVars[] = "last_live_ops_of_var"; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index e5b1eaa7318..f1bf6542a30 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -18,9 +18,6 @@ #include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/platform/profiler.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/reference_count_op_handle.h" -#endif namespace paddle { namespace framework { @@ -33,7 +30,11 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), var_infos_(std::move(var_infos)), - places_(std::move(places)) {} + places_(std::move(places)) { + if (Graph().Has(details::kGarbageCollector)) { + gc_ = &(Graph().Get(details::kGarbageCollector)); + } +} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { @@ -69,27 +70,16 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); drop_scope_counter_ += 1; -#ifdef PADDLE_WITH_CUDA - const std::string gc_name = "garbage_collector"; - DeviceGarbageCollectorMap *gc = - Graph().Has(gc_name) ? 
&(Graph().Get(gc_name)) - : nullptr; -#endif - if (!fetch_tensors.empty() || drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (auto p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); -#ifdef PADDLE_WITH_CUDA - if (gc != nullptr && platform::is_gpu_place(p)) { - auto gpu_place = boost::get(p); - auto &gc_at_place = gc->at(gpu_place.device); - gc_at_place->Wait(); - gc_at_place->Reset(); + for (size_t i = 0; i < places_.size(); ++i) { + platform::DeviceContextPool::Instance().Get(places_[i])->Wait(); + if (gc_) { + (*gc_)[i]->Wait(); + (*gc_)[i]->Reset(); } -#endif } for (auto &scope : local_scopes_) { auto &local_scope = diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 5e87e0bf50b..ce3061d6e61 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -21,9 +21,11 @@ #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" + namespace paddle { namespace framework { namespace details { @@ -55,6 +57,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector var_infos_; std::vector places_; + + GarbageCollectorList* gc_{nullptr}; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 818b3334ea4..cbe8f606efe 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -65,7 +65,7 @@ class GarbageCollector { if (clear_deque != nullptr) { callback(); - ClearCallback([=]() { + ClearCallback([clear_deque]() { for (auto *obj : *clear_deque) obj->clear(); }); } @@ -109,7 +109,6 @@ class DefaultStreamGarbageCollector : public GarbageCollector { } void Wait() const override { - this->dev_ctx_->Wait(); static_cast(this->dev_ctx_) ->WaitStreamCallback(); } @@ -127,14 +126,14 @@ class StreamGarbageCollector : public GarbageCollector { StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - PADDLE_ENFORCE(cudaSetDevice(place.device)); + platform::SetDeviceId(place.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } ~StreamGarbageCollector() { auto place = boost::get(this->dev_ctx_->GetPlace()); - PADDLE_ENFORCE(cudaSetDevice(place.device)); + platform::SetDeviceId(place.device); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); } @@ -148,8 +147,11 @@ class StreamGarbageCollector : public GarbageCollector { cudaStream_t stream() const { return stream_; } protected: + // ClearCallback and Wait()/Reset() cannot be call in multiple threads + // But it is not important, because they would not be called in multiple + // threads + // either in Executor or ParallelExecutor void ClearCallback(const std::function &callback) override { - std::lock_guard guard(this->mutex_); callback_manager_->AddCallback(callback); } diff --git 
a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 947c934f0ff..7a2560c14df 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -73,14 +73,21 @@ class Graph { } bool Has(const std::string &attr_name) const { - return attrs_.find(attr_name) != attrs_.end(); + return attrs_.count(attr_name) > 0; } template AttrType &Get(const std::string &attr_name) const { PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.", attr_name); - return *boost::any_cast(attrs_.at(attr_name)); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast &) { + PADDLE_THROW( + "Invalid attribute type of %s error, expected: %s, actual: %s", + attr_name, typeid(AttrType *).name(), + attrs_.at(attr_name).type().name()); + } } template diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index a3559247db6..27746ff1453 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -51,11 +51,18 @@ class Pass { AttrType &Get(const std::string &attr_name) const { PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), "%s attr not registered for pass.", attr_name); - return *boost::any_cast(attrs_.at(attr_name)); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast &) { + PADDLE_THROW( + "Invalid attribute type of %s error, expected: %s, actual: %s", + attr_name, typeid(AttrType *).name(), + attrs_.at(attr_name).type().name()); + } } bool Has(const std::string &attr_name) const { - return attrs_.find(attr_name) != attrs_.end(); + return attrs_.count(attr_name) > 0; } void Erase(const std::string &attr_name) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b98408ee772..e71f93beefc 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -49,6 +50,15 @@ class ParallelExecutorPrivate { } } } + + void ResetRuntimeReferenceCount() { + for (size_t i = 0; i < rt_ref_cnts_.size(); ++i) { + for (auto &pair : rt_ref_cnts_[i]) { + rt_cur_ref_cnts_[i][pair.first] = pair.second; + } + } + } + std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -60,6 +70,13 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; + + // rt_ref_cnts_ is only initialized when ParallelExecutor constructs, and then + // keeps unchanged + // Before each iteration, rt_cur_ref_cnts_ is reset to ref_cnts_ + std::vector rt_ref_cnts_; + std::vector rt_cur_ref_cnts_; + details::GarbageCollectorList gcs_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -128,35 +145,56 @@ ParallelExecutor::ParallelExecutor( std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); +#else + std::unique_ptr graph = + build_strategy.Apply(main_program, member_->places_, loss_var_name, + params, member_->local_scopes_, member_->use_cuda_); +#endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - for (auto &place : member_->places_) { - if (!platform::is_gpu_place(place)) continue; - auto gpu_place = boost::get(place); - if (gcs_[gpu_place.device] == nullptr) { - ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap()); - cur_ref_cnts_[gpu_place.device].reset( - new details::AtomicReferenceCountMap()); - gcs_[gpu_place.device].reset( - new StreamGarbageCollector(gpu_place, max_memory_size)); + size_t place_num = member_->places_.size(); + for (size_t i = 0; i < place_num; ++i) { + auto &place = member_->places_[i]; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + member_->gcs_.emplace_back(new StreamGarbageCollector( + boost::get(place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else if (platform::is_cpu_place(place)) { +#endif + member_->gcs_.emplace_back(new CPUGarbageCollector( + boost::get(place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#ifdef PADDLE_WITH_CUDA } - } - if (!gcs_.empty()) { - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = ref_cnt_pass->Apply(std::move(graph)); - graph->SetNotOwned("garbage_collector", &gcs_); +#endif } } -#else - std::unique_ptr graph = - build_strategy.Apply(main_program, member_->places_, loss_var_name, - params, member_->local_scopes_, member_->use_cuda_); -#endif + + if (!member_->gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &(member_->rt_ref_cnts_)); + 
ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + VLOG(10) << "ReferenceCountPass Applied"; + graph = ref_cnt_pass->Apply(std::move(graph)); + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kCurReferenceCount, + &(member_->rt_cur_ref_cnts_)); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, + &(member_->gcs_)); + eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars @@ -271,18 +309,16 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); -#ifdef PADDLE_WITH_CUDA - if (!gcs_.empty()) { - ResetReferenceCount(); - for (auto &pair : cur_ref_cnts_) { - auto &name_map = *(pair.second); + if (!member_->gcs_.empty()) { + member_->ResetRuntimeReferenceCount(); + size_t n = member_->rt_ref_cnts_.size(); + for (size_t i = 0; i < n; ++i) { for (auto &fetch_name : fetch_tensors) { - name_map.erase(fetch_name); + member_->rt_cur_ref_cnts_[i].erase(fetch_name); } - name_map.erase(fetched_var_name); + member_->rt_cur_ref_cnts_[i].erase(fetched_var_name); } } -#endif auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; @@ -326,13 +362,11 @@ ParallelExecutor::~ParallelExecutor() { for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - // member_ must be destructed before gcs_ since the destructor of - // ReferenceCountOpHandle use raw pointers of gcs_ inside. - member_.reset(); + delete member_; } } // namespace framework } // namespace paddle -#ifdef PADDLE_WITH_CUDA + USE_PASS(reference_count_pass); -#endif +USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ef09b98b2aa..1fc17a0d64d 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include #include #include @@ -29,10 +28,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/reference_count_pass.h" -#endif - namespace paddle { namespace framework { @@ -75,24 +70,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - std::unique_ptr member_; - -#ifdef PADDLE_WITH_CUDA - // ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_ - details::DeviceReferenceCountMap ref_cnts_; - details::AtomicDeviceReferenceCountMap cur_ref_cnts_; - details::DeviceGarbageCollectorMap gcs_; - - void ResetReferenceCount() { - for (auto &pair1 : ref_cnts_) { - for (auto &pair2 : *(pair1.second)) { - (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second; - } - } - } -#endif + ParallelExecutorPrivate *member_; }; } // namespace framework diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 93cb5eb2dc0..23c7ebe8422 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -56,9 +56,16 @@ ELSE() set(MKLDNN_CTX_DEPS) ENDIF() +nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +IF(WITH_GPU) + set(STREAM_CALLBACK_DEPS stream_callback_manager) +ELSE() + set(STREAM_CALLBACK_DEPS) +ENDIF() + # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc +cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc new file mode 100644 index 00000000000..ae915365f8c --- /dev/null +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/stream_callback_manager.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +struct StreamCallbackContext { + inline StreamCallbackContext(const StreamCallbackManager *manager, + std::function callback) + : manager_(manager), callback_(std::move(callback)) {} + + const StreamCallbackManager *manager_; // do not own + std::function callback_; +}; + +StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream) + : stream_(stream), thread_pool_(new ::ThreadPool(1)) {} + +void StreamCallbackManager::AddCallback(std::function callback) const { + auto *stream_callback_context = + new StreamCallbackContext(this, std::move(callback)); +#if CUDA_VERSION >= 10000 + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, + StreamCallbackManager::StreamCallbackFunc, + stream_callback_context)); +#else + PADDLE_ENFORCE( + cudaStreamAddCallback(stream_, StreamCallbackManager::StreamCallbackFunc, + stream_callback_context, 0)); +#endif +} + +void StreamCallbackManager::Wait() const { + thread_pool_.reset(new ::ThreadPool(1)); +} + +#if CUDA_VERSION >= 10000 +void CUDART_CB StreamCallbackManager::StreamCallbackFunc(void *user_data) +#else +void CUDART_CB StreamCallbackManager::StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, + void *user_data) +#endif +{ + auto *callback_context_ptr = + reinterpret_cast(user_data); + callback_context_ptr->manager_->thread_pool_->enqueue( + [callback_context_ptr]() { + std::unique_ptr callback_context( + callback_context_ptr); + callback_context->callback_(); + }); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index ed8734c98cb..eac4806d137 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -19,66 +19,29 @@ #include #include #include -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -class StreamCallbackManager; - -struct StreamCallbackContext { - template - inline StreamCallbackContext(const StreamCallbackManager *manager, - Callback &&callback) - : manager_(manager), callback_(callback) {} - - const StreamCallbackManager *manager_; // do not own - std::function callback_; -}; - +// NOTE(zjl): clean StreamCallback to make compilation faster class StreamCallbackManager { public: - explicit inline StreamCallbackManager(cudaStream_t stream = nullptr) - : stream_(stream), thread_pool_(new ThreadPool(1)) {} + explicit StreamCallbackManager(const cudaStream_t stream); - template - inline void AddCallback(Callback &&callback) const { - auto *stream_callback_context = - new StreamCallbackContext(this, std::forward(callback)); -#if CUDA_VERSION >= 10000 - PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context)); // NOLINT -#else - PADDLE_ENFORCE(cudaStreamAddCallback( - stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0)); // NOLINT -#endif - } + void AddCallback(std::function callback) const; - void Wait() const { thread_pool_.reset(new ThreadPool(1)); } + void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr thread_pool_; + mutable std::unique_ptr<::ThreadPool> thread_pool_; -// cudaStreamCallback cannot call CUDA API inside, so we have to use -// thread_pool here #if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data) + static void 
CUDART_CB StreamCallbackFunc(void *user_data); #else static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data) + cudaError_t status, void *user_data); #endif - { - auto *callback_context_ptr = - reinterpret_cast(user_data); - callback_context_ptr->manager_->thread_pool_->enqueue([=]() { - std::unique_ptr callback_context( - callback_context_ptr); - callback_context->callback_(); - }); - } }; } // namespace platform -- GitLab From 3c239cd640aca1fa8da71a9cdc319b8b4e4fb36c Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Sat, 1 Dec 2018 13:10:21 +0800 Subject: [PATCH 0014/2367] pslib --- CMakeLists.txt | 1 + cmake/external/pslib.cmake | 76 ++++++++++++++++++++++++ paddle/fluid/framework/async_executor.cc | 1 + 3 files changed, 78 insertions(+) create mode 100644 cmake/external/pslib.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index efa68c9ba24..5251fe286f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,6 +186,7 @@ endif() ######################################################################################## include(external/mklml) # download mklml package +include(external/pslib) # download mklml package include(external/xbyak) # download xbyak package include(external/libxsmm) # download, build, install libxsmm include(external/zlib) # download, build, install zlib diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake new file mode 100644 index 00000000000..812af5efa20 --- /dev/null +++ b/cmake/external/pslib.cmake @@ -0,0 +1,76 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_PSLIB}) + return() +ENDIF(NOT ${WITH_PSLIB}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with PSLIB in Paddle yet." 
+ "Force WITH_PSLIB=OFF") + SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(PSLIB_PROJECT "extern_pslib") +IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(PSLIB_VER "pslib" CACHE STRING "" FORCE) #todo pslib version + SET(PSLIB_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${PSLIB_VER}.tar.gz" CACHE STRING "" FORCE) #todo pslib url +ENDIF() +MESSAGE(STATUS "PSLIB_VER: ${PSLIB_VER}, PSLIB_URL: ${PSLIB_URL}") +SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") +SET(PSLIB_DOWNLOAD_DIR "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}") +SET(PSLIB_DST_DIR "pslib") +SET(PSLIB_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(PSLIB_INSTALL_DIR ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR}) +SET(PSLIB_ROOT ${PSLIB_INSTALL_DIR}) +SET(PSLIB_INC_DIR ${PSLIB_ROOT}/include) +SET(PSLIB_LIB_DIR ${PSLIB_ROOT}/lib) +SET(PSLIB_LIB ${PSLIB_LIB_DIR}/libps.so) +SET(PSLIB_IOMP_LIB ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib") + +INCLUDE_DIRECTORIES(${PSLIB_INC_DIR}) + +FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${PSLIB_VER}/include ${PSLIB_VER}/lib \n" + " DESTINATION ${PSLIB_DST_DIR})\n") + +ExternalProject_Add( + ${PSLIB_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_SOURCE_DIR} + DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_VER}.tar.gz + && tar zxvf ${PSLIB_VER}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} +) + +ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) +ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) +LIST(APPEND external_project_dependencies pslib) + +IF(WITH_C_API) + INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib) +ENDIF() diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index afb2dd2f064..aa76e03e838 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -29,6 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" +#include "pslib.h" namespace paddle { namespace framework { -- GitLab From 0e4709daddaf76e71a2de3f7490184453b2c1e17 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sat, 1 Dec 2018 13:14:03 +0800 Subject: [PATCH 0015/2367] add mpi4py helper --- python/paddle/fluid/distributed/helper.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 python/paddle/fluid/distributed/helper.py diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py new file mode 100644 index 00000000000..8e079b1e8d9 --- /dev/null +++ b/python/paddle/fluid/distributed/helper.py @@ -0,0 +1,20 @@ +from mpi4py import MPI + +class MPIHelper(object): + def __init__(self): + self.comm = MPI.COMM_WORLD + + def get_rank(self): + return self.comm.Get_rank() + + def get_size(self): + return self.comm.Get_size() + + def get_ip(self): + import socket + local_ip = socket.gethostbyname(socket.gethostname()) + return local_ip + + def get_hostname(self): + import socket + return socket.gethostname() -- GitLab From 038346c0c2053bbc0b051e7bb48de42d61af6958 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Sat, 1 Dec 2018 13:52:02 +0800 Subject: [PATCH 0016/2367] libmct --- cmake/external/libmct.cmake | 76 +++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 cmake/external/libmct.cmake diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake new file mode 100644 index 00000000000..351806f6e1a --- /dev/null +++ b/cmake/external/libmct.cmake @@ -0,0 +1,76 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_LIBMCT}) + return() +ENDIF(NOT ${WITH_LIBMCT}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with LIBMCT in Paddle yet." 
+ "Force WITH_LIBMCT=OFF") + SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(LIBMCT_PROJECT "extern_libmct") +IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(LIBMCT_VER "libmct" CACHE STRING "" FORCE) #todo libmct version + SET(LIBMCT_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${LIBMCT_VER}.tar.gz" CACHE STRING "" FORCE) #todo libmct url +ENDIF() +MESSAGE(STATUS "LIBMCT_VER: ${LIBMCT_VER}, LIBMCT_URL: ${LIBMCT_URL}") +SET(LIBMCT_SOURCE_DIR "${THIRD_PARTY_PATH}/libmct") +SET(LIBMCT_DOWNLOAD_DIR "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}") +SET(LIBMCT_DST_DIR "libmct") +SET(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR}) +SET(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR}) +SET(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include) +SET(LIBMCT_LIB_DIR ${LIBMCT_ROOT}/lib) +SET(LIBMCT_LIB ${LIBMCT_LIB_DIR}/libps.so) +SET(LIBMCT_IOMP_LIB ${LIBMCT_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib") + +INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR}) + +FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(LIBMCT)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${LIBMCT_VER}/include ${LIBMCT_VER}/lib \n" + " DESTINATION ${LIBMCT_DST_DIR})\n") + +ExternalProject_Add( + ${LIBMCT_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LIBMCT_SOURCE_DIR} + DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_VER}.tar.gz + && tar zxvf ${LIBMCT_VER}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT} +) + +ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET libmct PROPERTY IMPORTED_LOCATION ${LIBMCT_LIB}) +ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) +LIST(APPEND external_project_dependencies libmct) + +IF(WITH_C_API) + INSTALL(FILES ${LIBMCT_LIB} ${LIBMCT_IOMP_LIB} DESTINATION lib) +ENDIF() -- GitLab From 4798a8c7b848891c18cd5b23e8023b88d9f32643 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Sat, 1 Dec 2018 14:51:40 +0800 Subject: [PATCH 0017/2367] pslib_brpc --- cmake/external/pslib_brpc.cmake | 76 +++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 cmake/external/pslib_brpc.cmake diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake new file mode 100644 index 00000000000..7b4beeae65a --- /dev/null +++ b/cmake/external/pslib_brpc.cmake @@ -0,0 +1,76 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_PSLIB_BRPC}) + return() +ENDIF(NOT ${WITH_PSLIB_BRPC}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet." 
+ "Force WITH_PSLIB_BRPC=OFF") + SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(PSLIB_BRPC_PROJECT "extern_pslib_brpc") +IF((NOT DEFINED PSLIB_BRPC_VER) OR (NOT DEFINED PSLIB_BRPC_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(PSLIB_BRPC_VER "pslib_brpc" CACHE STRING "" FORCE) #todo pslib version + SET(PSLIB_BRPC_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${PSLIB_BRPC_VER}.tar.gz" CACHE STRING "" FORCE) #todo pslib_brpc url +ENDIF() +MESSAGE(STATUS "PSLIB_BRPC_VER: ${PSLIB_BRPC_VER}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") +SET(PSLIB_BRPC_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib_brpc") +SET(PSLIB_BRPC_DOWNLOAD_DIR "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}") +SET(PSLIB_BRPC_DST_DIR "pslib_brpc") +SET(PSLIB_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR}) +SET(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR}) +SET(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include) +SET(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib) +SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libps.so) +SET(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib") + +INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR}) + +FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB_BRPC)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${PSLIB_BRPC_VER}/include ${PSLIB_BRPC_VER}/lib \n" + " DESTINATION ${PSLIB_BRPC_DST_DIR})\n") + +ExternalProject_Add( + ${PSLIB_BRPC_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_BRPC_SOURCE_DIR} + DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_VER}.tar.gz + && tar zxvf ${PSLIB_BRPC_VER}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT} +) + +ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) +ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) +LIST(APPEND external_project_dependencies pslib_brpc) + +IF(WITH_C_API) + INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib) +ENDIF() -- GitLab From 7f07dfa1a4cdadbdfd4b24d342dafefb316ca7c1 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 1 Dec 2018 15:30:26 +0800 Subject: [PATCH 0018/2367] clean code --- .../fluid/operators/reader/ctr_reader_test.cc | 2 +- .../reader/lod_tensor_blocking_queue.h | 12 ++++-------- paddle/fluid/pybind/pybind.cc | 19 ++++++------------- python/paddle/fluid/layers/io.py | 2 +- 4 files changed, 12 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 8dba9baebce..5e672e9aa18 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -126,7 +126,7 @@ TEST(CTR_READER, read_data) { LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; - queue_holder.InitOnce(capacity, {}, false); + queue_holder.InitOnce(capacity, false); std::shared_ptr queue = queue_holder.GetQueue(); diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 3f041ff7e4e..5b53edff5d8 100644 --- 
a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -32,10 +32,8 @@ class LoDTensorBlockingQueue { friend class LoDTensorBlockingQueueHolder; private: - LoDTensorBlockingQueue(size_t capacity, - const std::vector& dims, - bool speed_test_mode = false) - : queue_(capacity, speed_test_mode), dims_(dims) {} + explicit LoDTensorBlockingQueue(size_t capacity, bool speed_test_mode = false) + : queue_(capacity, speed_test_mode) {} public: bool Push(const std::vector& lod_tensor_vec) { @@ -65,17 +63,15 @@ class LoDTensorBlockingQueue { private: BlockingQueue> queue_; - std::vector dims_; }; class LoDTensorBlockingQueueHolder { public: - void InitOnce(size_t capacity, const std::vector& dims, - bool speed_test_mode = false) { + void InitOnce(size_t capacity, bool speed_test_mode = false) { PADDLE_ENFORCE( queue_ == nullptr, "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode)); + queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode)); } inline const std::shared_ptr& GetQueue() const { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fc7991d2974..f0a5d1afc97 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -384,19 +384,12 @@ All parameter, weight, gradient are variables in Paddle. .def("is_closed", &LoDTensorBlockingQueue::IsClosed); m.def("init_lod_tensor_blocking_queue", - [](Variable &var, size_t capacity, - const std::vector> &shapes) - -> std::shared_ptr { - std::vector dims(shapes.size()); - std::transform(shapes.begin(), shapes.end(), dims.begin(), - [](const std::vector &shape) { - return make_ddim(shape); - }); - auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims, - FLAGS_reader_queue_speed_test_mode); - return holder->GetQueue(); - }, + [](Variable &var, + size_t capacity) -> std::shared_ptr { + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); + return holder->GetQueue(); + }, py::return_value_policy::copy); py::class_(m, "Scope", R"DOC( diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 3f47053961b..3016d8e3a4c 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -523,7 +523,7 @@ def _py_reader(capacity, double_buffer_name = "_".join([name, "double_buffer"]) var = global_scope().var(queue_name) - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=reader_name) -- GitLab From 978fd6800cb05ddbf7d912aa2ba776e29647b8ac Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 1 Dec 2018 16:49:04 +0800 Subject: [PATCH 0019/2367] update ctr_reader.py --- python/paddle/fluid/contrib/reader/ctr_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/reader/ctr_reader.py b/python/paddle/fluid/contrib/reader/ctr_reader.py index b8449e8d848..d7133562de4 100644 --- a/python/paddle/fluid/contrib/reader/ctr_reader.py +++ b/python/paddle/fluid/contrib/reader/ctr_reader.py @@ -90,7 +90,7 @@ def ctr_reader(feed_data, reader_name = "_".join([name, "reader"]) var = global_scope().var(queue_name) - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + feed_queue = 
core.init_lod_tensor_blocking_queue(var, capacity) startup_blk = default_startup_program().current_block() reader_var = startup_blk.create_var(name=reader_name) -- GitLab From 52a0be7bb437e574d7fda8d322c816e91029e438 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sat, 1 Dec 2018 13:54:44 +0800 Subject: [PATCH 0020/2367] add mct into CMakeLists.txt --- CMakeLists.txt | 5 ++++- paddle/fluid/framework/async_executor.h | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5251fe286f8..8c929396fff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,7 +186,6 @@ endif() ######################################################################################## include(external/mklml) # download mklml package -include(external/pslib) # download mklml package include(external/xbyak) # download xbyak package include(external/libxsmm) # download, build, install libxsmm include(external/zlib) # download, build, install zlib @@ -217,6 +216,9 @@ include(external/warpctc) # download, build, install warpctc include(cupti) include(external/gzstream) endif (NOT WIN32) +include(external/libmct) +include(external/pslib_brpc) +include(external/pslib) if(WITH_DISTRIBUTE) if(WITH_GRPC) @@ -277,6 +279,7 @@ set(EXTERNAL_LIBS protobuf zlib ${PYTHON_LIBRARIES} + pslib ) if(WITH_AMD_GPU) diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index f4d2a79ac59..6aa59c89dc4 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -40,6 +40,9 @@ class AsyncExecutor { const int thread_num, const std::vector& fetch_names, const bool debug = false); + void ConfigServer() {} + void ConfigWorker() {} + void StartServer() {} private: void CreateThreads(ExecutorThreadWorker* worker, -- GitLab From 2cd25794bd186600e79ac94eb9a93593e1d5fbb1 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 2 Dec 2018 11:47:37 +0800 Subject: [PATCH 0021/2367] add PlainFileReader --- paddle/fluid/operators/reader/ctr_reader.cc | 26 +++++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index d1d3ddc89dc..e2f8788a9a8 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -95,11 +95,27 @@ class GzipReader : public Reader { igzstream gzstream_; }; -class MultiGzipReader : public Reader { +class PlainFileReader : public Reader { public: - explicit MultiGzipReader(const std::vector& file_list) { + explicit PlainFileReader(const std::string& file_name) + : myfile_(file_name.c_str()) {} + + ~PlainFileReader() {} + + bool HasNext() override { return myfile_.peek() != EOF; } + + void NextLine(std::string* line) override { std::getline(myfile_, *line); } + + private: + std::ifstream myfile_; +}; + +template +class MultiFileReader : public Reader { + public: + explicit MultiFileReader(const std::vector& file_list) { for (auto& file : file_list) { - readers_.emplace_back(std::make_shared(file)); + readers_.emplace_back(std::make_shared(file)); } } @@ -119,7 +135,7 @@ class MultiGzipReader : public Reader { } private: - std::vector> readers_; + std::vector> readers_; size_t current_reader_index_ = 0; }; @@ -166,7 +182,7 @@ void ReadThread(const std::vector& file_list, std::vector>> batch_data; std::vector batch_label; - MultiGzipReader reader(file_list); + MultiFileReader reader(file_list); VLOG(30) << "reader inited"; -- GitLab From 
a05a948d89cb3abdb7f60c8ffbf74fdd59b35a7b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 2 Dec 2018 13:07:06 +0800 Subject: [PATCH 0022/2367] update readthread --- .../operators/reader/create_ctr_reader_op.cc | 24 ++++++--- paddle/fluid/operators/reader/ctr_reader.cc | 53 +++++++++++-------- paddle/fluid/operators/reader/ctr_reader.h | 29 +++++++--- .../fluid/operators/reader/ctr_reader_test.cc | 15 +++--- 4 files changed, 77 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc index 58a465d87a8..e66263fee11 100644 --- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -41,13 +41,16 @@ class CreateCTRReaderOp : public framework::OperatorBase { auto* queue_holder = queue_holder_var->template GetMutable(); - int thread_num = Attr("thread_num"); - std::vector slots = Attr>("slots"); - int batch_size = Attr("batch_size"); - std::vector file_list = - Attr>("file_list"); - out->Reset(std::make_shared(queue_holder->GetQueue(), batch_size, - thread_num, slots, file_list)); + auto thread_num = Attr("thread_num"); + auto sparse_slots = Attr>("sparse_slots"); + auto dense_slots = Attr>("dense_slots"); + auto batch_size = Attr("batch_size"); + auto file_type = Attr("file_type"); + auto file_format = Attr("file_format"); + auto file_list = Attr>("file_list"); + out->Reset(std::make_shared( + queue_holder->GetQueue(), batch_size, thread_num, file_type, + file_format, dense_slots, sparse_slots, file_list)); } }; @@ -58,10 +61,15 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase { "Name of the `LoDTensorBlockingQueueHolder` variable"); AddAttr("thread_num", "the thread num to read data"); AddAttr("batch_size", "the batch size of read data"); + AddAttr("file_type", "plain or gzip").SetDefault("plain"); + AddAttr("file_format", "svm or csv").SetDefault("csv"); AddAttr>("file_list", "The list of files that need to read"); AddAttr>( - "slots", "the slots that should be extract from file"); + "dense_slots", "the sparse slots id that should be extract from file") + .SetDefault({}); + AddAttr>( + "sparse_slots", "the sparse slots id that should be extract from file"); AddComment(R"DOC( Create CTRReader to support read ctr data with cpp. 
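For orientation, here is a minimal driver of the reader behind this op, modeled directly on ctr_reader_test.cc in this series (a sketch only; the gzip file name is a placeholder, the other literals are the test's own):

    // Sketch: consume CTR batches the way the unit test does.
    LoDTensorBlockingQueueHolder queue_holder;
    queue_holder.InitOnce(/*capacity=*/64, /*speed_test_mode=*/false);
    std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();

    CTRReader reader(queue, /*batch_size=*/3, /*thread_num=*/1,
                     /*file_type=*/"gzip", /*file_format=*/"plain",
                     /*dense_slots=*/{}, /*sparse_slots=*/{"6002", "6003"},
                     /*file_list=*/{"part-0.gz"});  // placeholder file name
    reader.Start();
    std::vector<framework::LoDTensor> out;
    reader.ReadNext(&out);  // an empty vector means all reader threads stopped
    reader.Shutdown();

Each element of out is one LoDTensor per declared sparse slot, followed by a trailing {batch_size, 1} label tensor, in the order ReadThread pushes them into the queue.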
diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index e2f8788a9a8..09939576568 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -141,40 +141,42 @@ class MultiFileReader : public Reader { void MonitorThread(std::vector* thread_status, std::shared_ptr queue) { - VLOG(30) << "monitor thread in"; + VLOG(3) << "monitor thread in"; bool reader_thread_is_running = true; while (reader_thread_is_running) { - VLOG(30) << "reader_thread_is_running"; + VLOG(3) << "reader_thread_is_running"; reader_thread_is_running = false; for (size_t i = 0; i < (*thread_status).size(); ++i) { if ((*thread_status)[i] == Running) { - VLOG(30) << "reader is running!"; + VLOG(3) << "reader is running!"; reader_thread_is_running = true; } } std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } - VLOG(30) << "all reader thread is stopped, push empty data into queue"; + VLOG(3) << "all reader thread is stopped, push empty data into queue"; queue->Push({}); - VLOG(30) << "monitor thread exited"; + VLOG(3) << "monitor thread exited"; } void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, + const std::string& file_type, const std::string& file_format, + const std::vector& dense_slots, + const std::vector& sparse_slots, int batch_size, int thread_id, std::vector* thread_status, std::shared_ptr queue) { - VLOG(30) << "[" << thread_id << "]" - << " reader thread start! thread_id = " << thread_id; + VLOG(3) << "[" << thread_id << "]" + << " reader thread start! thread_id = " << thread_id; for (auto& file : file_list) { - VLOG(30) << "[" << thread_id << "]" - << " file " << file; + VLOG(3) << "[" << thread_id << "]" + << " file " << file; } (*thread_status)[thread_id] = Running; - VLOG(30) << "set status to running"; + VLOG(3) << "set status to running"; std::unordered_map slot_to_index; - for (size_t i = 0; i < slots.size(); ++i) { - slot_to_index[slots[i]] = i; + for (size_t i = 0; i < sparse_slots.size(); ++i) { + slot_to_index[sparse_slots[i]] = i; } std::string line; @@ -182,11 +184,18 @@ void ReadThread(const std::vector& file_list, std::vector>> batch_data; std::vector batch_label; - MultiFileReader reader(file_list); + std::unique_ptr reader; + if (file_type == "gzip") { + reader.reset(new MultiFileReader(file_list)); + } else if (file_type == "plain") { + reader.reset(new MultiFileReader(file_list)); + } else { + PADDLE_THROW("do not support file format %s", file_type); + } - VLOG(30) << "reader inited"; + VLOG(3) << "reader inited"; - while (reader.HasNext()) { + while (reader->HasNext()) { batch_data.clear(); batch_data.reserve(batch_size); @@ -195,8 +204,8 @@ void ReadThread(const std::vector& file_list, // read batch_size data for (int i = 0; i < batch_size; ++i) { - if (reader.HasNext()) { - reader.NextLine(&line); + if (reader->HasNext()) { + reader->NextLine(&line); std::unordered_map> slot_to_data; int64_t label; parse_line(line, slot_to_index, &label, &slot_to_data); @@ -209,8 +218,8 @@ void ReadThread(const std::vector& file_list, std::vector lod_datas; - // first insert tensor for each slots - for (auto& slot : slots) { + // first insert tensor for each sparse_slots + for (auto& slot : sparse_slots) { std::vector lod_data{0}; std::vector batch_feasign; @@ -242,11 +251,11 @@ void ReadThread(const std::vector& file_list, lod_datas.push_back(label_tensor); queue->Push(lod_datas); - VLOG(40) << "push one data, queue_size=" << queue->Size(); + VLOG(4) << 
"push one data, queue_size=" << queue->Size(); } (*thread_status)[thread_id] = Stopped; - VLOG(30) << "set status to stopped, thread " << thread_id << " exited"; + VLOG(3) << "set status to stopped, thread " << thread_id << " exited"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae12..68d587bbfc4 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -36,7 +36,9 @@ namespace reader { enum ReaderThreadStatus { Running, Stopped }; void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, + const std::string& file_type, const std::string& file_format, + const std::vector& dense_slots, + const std::vector& sparse_slots, int batch_size, int thread_id, std::vector* thread_status, std::shared_ptr queue); @@ -47,11 +49,18 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: - explicit CTRReader(const std::shared_ptr& queue, - int batch_size, int thread_num, - const std::vector& slots, - const std::vector& file_list) - : batch_size_(batch_size), slots_(slots), file_list_(file_list) { + CTRReader(const std::shared_ptr& queue, + int batch_size, int thread_num, const std::string& file_type, + const std::string& file_format, + const std::vector& dense_slots, + const std::vector& sparse_slots, + const std::vector& file_list) + : batch_size_(batch_size), + file_type_(file_type), + file_format_(file_format), + dense_slots_(dense_slots), + sparse_slots_(sparse_slots), + file_list_(file_list) { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); @@ -97,7 +106,8 @@ class CTRReader : public framework::FileReader { VLOG(3) << "thread_num " << thread_num_; for (int thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread( - std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, + std::bind(&ReadThread, file_groups_[thread_id], file_type_, + file_format_, dense_slots_, sparse_slots_, batch_size_, thread_id, &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( @@ -119,7 +129,10 @@ class CTRReader : public framework::FileReader { private: size_t thread_num_; const int batch_size_; - const std::vector slots_; + const std::string file_type_; + const std::string file_format_; + const std::vector dense_slots_; + const std::vector sparse_slots_; const std::vector file_list_; std::shared_ptr queue_; std::vector> read_threads_; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 5e672e9aa18..734bf45383c 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -132,24 +132,27 @@ TEST(CTR_READER, read_data) { int batch_size = 3; int thread_num = 1; - std::vector slots = {"6002", "6003"}; + std::vector sparse_slots = {"6002", "6003"}; std::vector file_list; for (int i = 0; i < thread_num; ++i) { file_list.push_back(gz_file_name); } - CTRReader reader(queue, batch_size, thread_num, slots, file_list); + CTRReader reader(queue, batch_size, thread_num, "gzip", "plain", {}, + sparse_slots, file_list); reader.Start(); size_t batch_num = std::ceil(static_cast(ctr_data.size()) / batch_size) * thread_num; - check_all_data(ctr_data, 
slots, label_dims, label_value, data_slot_6002,
-                 data_slot_6003, batch_num, batch_size, queue, &reader);
+  check_all_data(ctr_data, sparse_slots, label_dims, label_value,
+                 data_slot_6002, data_slot_6003, batch_num, batch_size, queue,
+                 &reader);
   reader.Shutdown();
 
   reader.Start();
-  check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002,
-                 data_slot_6003, batch_num, batch_size, queue, &reader);
+  check_all_data(ctr_data, sparse_slots, label_dims, label_value,
+                 data_slot_6002, data_slot_6003, batch_num, batch_size, queue,
+                 &reader);
   reader.Shutdown();
 }
-- 
GitLab


From d7c8ebac2eafc87e887dcf9f4e38b9d3f7661d1d Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sun, 2 Dec 2018 14:40:43 +0800
Subject: [PATCH 0023/2367] add datadesc

---
 .../operators/reader/create_ctr_reader_op.cc  | 25 ++++---
 paddle/fluid/operators/reader/ctr_reader.cc   | 39 +++++++----
 paddle/fluid/operators/reader/ctr_reader.h    | 68 +++++++++++--------
 .../fluid/operators/reader/ctr_reader_test.cc |  7 +-
 4 files changed, 87 insertions(+), 52 deletions(-)

diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
index e66263fee11..5b9e2ba693f 100644
--- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
@@ -43,14 +43,16 @@ class CreateCTRReaderOp : public framework::OperatorBase {
 
     auto thread_num = Attr<int>("thread_num");
     auto sparse_slots = Attr<std::vector<std::string>>("sparse_slots");
-    auto dense_slots = Attr<std::vector<std::string>>("dense_slots");
+    auto dense_slot_index = Attr<std::vector<int>>("dense_slot_index");
+    auto sparse_slot_index = Attr<std::vector<int>>("sparse_slot_index");
     auto batch_size = Attr<int>("batch_size");
     auto file_type = Attr<std::string>("file_type");
     auto file_format = Attr<std::string>("file_format");
     auto file_list = Attr<std::vector<std::string>>("file_list");
-    out->Reset(std::make_shared<CTRReader>(
-        queue_holder->GetQueue(), batch_size, thread_num, file_type,
-        file_format, dense_slots, sparse_slots, file_list));
+    DataDesc data_desc(batch_size, file_list, file_type, file_format,
+                       dense_slot_index, sparse_slot_index, sparse_slots);
+    out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(),
+                                           thread_num, data_desc));
   }
 };
 
@@ -61,10 +63,17 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase {
     AddAttr<std::string>("file_type", "plain or gzip").SetDefault("plain");
     AddAttr<std::string>("file_format", "svm or csv").SetDefault("csv");
     AddAttr<std::vector<std::string>>("file_list",
                                       "The list of files that need to read");
-    AddAttr<std::vector<std::string>>(
-        "dense_slots", "the sparse slots id that should be extract from file")
-        .SetDefault({});
+    AddAttr<std::vector<int>>(
+        "dense_slot_index",
+        "the dense slot indexes that should be extracted from the file")
+        .SetDefault({});
+    AddAttr<std::vector<int>>(
+        "sparse_slot_index",
+        "the sparse slot indexes that should be extracted from the file")
+        .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "sparse_slots", "the sparse slots id that should be extract from file");
+    AddAttr<std::vector<std::string>>("sparse_slots",
+                                      "the sparse slot ids that should be "
+                                      "extracted from the file, used when "
+                                      "the file format is svm");
 
     AddComment(R"DOC(
Create CTRReader to support read ctr data with cpp.
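With this commit the same wiring travels as a single DataDesc bundle. A sketch that mirrors the csv test added later in this series (the file name is a placeholder; the indexes are the test's own):

    // Sketch: one line per sample, "label dense,dense sparse,sparse".
    DataDesc data_desc(/*batch_size=*/3,
                       /*file_names=*/{"data.csv"},  // placeholder path
                       /*file_type=*/"plain",        // or "gzip"
                       /*file_format=*/"csv",        // or "svm"
                       /*dense_slot_index=*/{1},
                       /*sparse_slot_index=*/{2},
                       /*sparse_slot_ids=*/{});      // only consulted for svm
    CTRReader reader(queue, /*thread_num=*/1, data_desc);

For csv input the queue then carries the label tensor first, dense slots next, and sparse slots last, matching ReadCsvData below.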
diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 09939576568..0af55b503e2 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -73,6 +73,21 @@ static inline void parse_line( } } +// label slot1:fea_sign slot2:fea_sign slot1:fea_sign +static inline void parse_svm_line(const std::string& line) {} + +// label,dense_fea,dense_fea,sparse_fea,sparse_fea +static inline void parse_csv_line(const std::string& line, + const std::vector& dense_slots, + const std::vector& sparse_slots, + int64_t* label, + std::vector* dense_datas, + std::vector* sparse_datas) { + std::vector ret; + string_split(line, ',', &ret); + *label = std::stoi(ret[2]) > 0; +} + class Reader { public: virtual ~Reader() {} @@ -160,10 +175,8 @@ void MonitorThread(std::vector* thread_status, } void ReadThread(const std::vector& file_list, - const std::string& file_type, const std::string& file_format, - const std::vector& dense_slots, - const std::vector& sparse_slots, int batch_size, - int thread_id, std::vector* thread_status, + const DataDesc& data_desc, int thread_id, + std::vector* thread_status, std::shared_ptr queue) { VLOG(3) << "[" << thread_id << "]" << " reader thread start! thread_id = " << thread_id; @@ -175,8 +188,8 @@ void ReadThread(const std::vector& file_list, VLOG(3) << "set status to running"; std::unordered_map slot_to_index; - for (size_t i = 0; i < sparse_slots.size(); ++i) { - slot_to_index[sparse_slots[i]] = i; + for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) { + slot_to_index[data_desc.sparse_slot_ids_[i]] = i; } std::string line; @@ -185,25 +198,25 @@ void ReadThread(const std::vector& file_list, std::vector batch_label; std::unique_ptr reader; - if (file_type == "gzip") { + if (data_desc.file_type_ == "gzip") { reader.reset(new MultiFileReader(file_list)); - } else if (file_type == "plain") { + } else if (data_desc.file_type_ == "plain") { reader.reset(new MultiFileReader(file_list)); } else { - PADDLE_THROW("do not support file format %s", file_type); + PADDLE_THROW("do not support file format %s", data_desc.file_type_); } VLOG(3) << "reader inited"; while (reader->HasNext()) { batch_data.clear(); - batch_data.reserve(batch_size); + batch_data.reserve(data_desc.batch_size_); batch_label.clear(); - batch_label.reserve(batch_size); + batch_label.reserve(data_desc.batch_size_); // read batch_size data - for (int i = 0; i < batch_size; ++i) { + for (int i = 0; i < data_desc.batch_size_; ++i) { if (reader->HasNext()) { reader->NextLine(&line); std::unordered_map> slot_to_data; @@ -219,7 +232,7 @@ void ReadThread(const std::vector& file_list, std::vector lod_datas; // first insert tensor for each sparse_slots - for (auto& slot : sparse_slots) { + for (auto& slot : data_desc.sparse_slot_ids_) { std::vector lod_data{0}; std::vector batch_feasign; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 68d587bbfc4..1f4663e3b89 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -35,11 +35,34 @@ namespace reader { enum ReaderThreadStatus { Running, Stopped }; +struct DataDesc { + DataDesc(int batch_size, const std::vector& file_names, + const std::string& file_type, const std::string& file_format, + const std::vector& dense_slot_index, + const std::vector& sparse_slot_index, + const std::vector& sparse_slot_ids) + : batch_size_(batch_size), + file_names_(file_names), + 
file_type_(file_type),
+        file_format_(file_format),
+        dense_slot_index_(dense_slot_index),
+        sparse_slot_index_(sparse_slot_index),
+        sparse_slot_ids_(sparse_slot_ids) {}
+
+  const int batch_size_;
+  const std::vector<std::string> file_names_;
+  const std::string file_type_;    // gzip or plain
+  const std::string file_format_;  // csv or svm
+  // used for csv data format
+  const std::vector<int> dense_slot_index_;
+  const std::vector<int> sparse_slot_index_;
+  // used for svm data format
+  const std::vector<std::string> sparse_slot_ids_;
+};
+
 void ReadThread(const std::vector<std::string>& file_list,
-                const std::string& file_type, const std::string& file_format,
-                const std::vector<std::string>& dense_slots,
-                const std::vector<std::string>& sparse_slots, int batch_size,
-                int thread_id, std::vector<ReaderThreadStatus>* thread_status,
+                const DataDesc& data_desc, int thread_id,
+                std::vector<ReaderThreadStatus>* thread_status,
                 std::shared_ptr<LoDTensorBlockingQueue> queue);
 
 // monitor all running thread, if they are all stopped,
@@ -50,22 +73,15 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
 
 class CTRReader : public framework::FileReader {
  public:
-  CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue,
-            int batch_size, int thread_num, const std::string& file_type,
-            const std::string& file_format,
-            const std::vector<std::string>& dense_slots,
-            const std::vector<std::string>& sparse_slots,
-            const std::vector<std::string>& file_list)
-      : batch_size_(batch_size),
-        file_type_(file_type),
-        file_format_(file_format),
-        dense_slots_(dense_slots),
-        sparse_slots_(sparse_slots),
-        file_list_(file_list) {
+  CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue,
+            int thread_num, const DataDesc& data_desc)
+      : data_desc_(data_desc) {
    PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger than 0!");
    PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
-    PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty");
-    thread_num_ =
-        file_list_.size() > thread_num ? thread_num : file_list_.size();
+    PADDLE_ENFORCE_GT(data_desc_.file_names_.size(), 0,
+                      "file list should not be empty");
+    thread_num_ = data_desc_.file_names_.size() > thread_num
+                      ?
thread_num + : data_desc_.file_names_.size(); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -106,9 +122,8 @@ class CTRReader : public framework::FileReader { VLOG(3) << "thread_num " << thread_num_; for (int thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread( - std::bind(&ReadThread, file_groups_[thread_id], file_type_, - file_format_, dense_slots_, sparse_slots_, batch_size_, - thread_id, &read_thread_status_, queue_))); + std::bind(&ReadThread, file_groups_[thread_id], data_desc_, thread_id, + &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( std::bind(&MonitorThread, &read_thread_status_, queue_))); @@ -118,8 +133,8 @@ class CTRReader : public framework::FileReader { private: void SplitFiles() { file_groups_.resize(thread_num_); - for (size_t i = 0; i < file_list_.size(); ++i) { - auto& file_name = file_list_[i]; + for (size_t i = 0; i < data_desc_.file_names_.size(); ++i) { + auto& file_name = data_desc_.file_names_[i]; std::ifstream f(file_name.c_str()); PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name); file_groups_[i % thread_num_].push_back(file_name); @@ -128,12 +143,7 @@ class CTRReader : public framework::FileReader { private: size_t thread_num_; - const int batch_size_; - const std::string file_type_; - const std::string file_format_; - const std::vector dense_slots_; - const std::vector sparse_slots_; - const std::vector file_list_; + const DataDesc data_desc_; std::shared_ptr queue_; std::vector> read_threads_; std::unique_ptr monitor_thread_; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 734bf45383c..a14e21bc8d2 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -36,6 +36,7 @@ using paddle::framework::LoD; using paddle::framework::DDim; using paddle::platform::CPUPlace; using paddle::framework::make_ddim; +using paddle::operators::reader::DataDesc; static void generatedata(const std::vector& data, const std::string& file_name) { @@ -138,8 +139,10 @@ TEST(CTR_READER, read_data) { file_list.push_back(gz_file_name); } - CTRReader reader(queue, batch_size, thread_num, "gzip", "plain", {}, - sparse_slots, file_list); + DataDesc data_desc(batch_size, file_list, "gzip", "plain", {}, {}, + sparse_slots); + + CTRReader reader(queue, thread_num, data_desc); reader.Start(); size_t batch_num = -- GitLab From fbd6f50148bb7eaf40ced1964737b2550ab746a1 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 2 Dec 2018 14:55:35 +0800 Subject: [PATCH 0024/2367] add ReadSvmData --- paddle/fluid/operators/reader/ctr_reader.cc | 67 +++++++++++-------- .../fluid/operators/reader/ctr_reader_test.cc | 2 +- 2 files changed, 40 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 0af55b503e2..9834d7183a3 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -78,14 +78,18 @@ static inline void parse_svm_line(const std::string& line) {} // label,dense_fea,dense_fea,sparse_fea,sparse_fea static inline void parse_csv_line(const std::string& line, - const std::vector& dense_slots, - const std::vector& sparse_slots, - int64_t* label, + const DataDesc& data_desc, int64_t* label, std::vector* dense_datas, std::vector* sparse_datas) { std::vector ret; string_split(line, ',', &ret); - *label = std::stoi(ret[2]) > 0; + *label 
= std::stol(ret[2]) > 0; + for (auto& idx : data_desc.dense_slot_index_) { + dense_datas->push_back(std::stof(ret[idx])); + } + for (auto& idx : data_desc.sparse_slot_index_) { + sparse_datas->push_back(std::stol(ret[idx])); + } } class Reader { @@ -174,19 +178,8 @@ void MonitorThread(std::vector* thread_status, VLOG(3) << "monitor thread exited"; } -void ReadThread(const std::vector& file_list, - const DataDesc& data_desc, int thread_id, - std::vector* thread_status, - std::shared_ptr queue) { - VLOG(3) << "[" << thread_id << "]" - << " reader thread start! thread_id = " << thread_id; - for (auto& file : file_list) { - VLOG(3) << "[" << thread_id << "]" - << " file " << file; - } - (*thread_status)[thread_id] = Running; - VLOG(3) << "set status to running"; - +void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { std::unordered_map slot_to_index; for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) { slot_to_index[data_desc.sparse_slot_ids_[i]] = i; @@ -197,17 +190,6 @@ void ReadThread(const std::vector& file_list, std::vector>> batch_data; std::vector batch_label; - std::unique_ptr reader; - if (data_desc.file_type_ == "gzip") { - reader.reset(new MultiFileReader(file_list)); - } else if (data_desc.file_type_ == "plain") { - reader.reset(new MultiFileReader(file_list)); - } else { - PADDLE_THROW("do not support file format %s", data_desc.file_type_); - } - - VLOG(3) << "reader inited"; - while (reader->HasNext()) { batch_data.clear(); batch_data.reserve(data_desc.batch_size_); @@ -266,6 +248,35 @@ void ReadThread(const std::vector& file_list, queue->Push(lod_datas); VLOG(4) << "push one data, queue_size=" << queue->Size(); } +} + +void ReadThread(const std::vector& file_list, + const DataDesc& data_desc, int thread_id, + std::vector* thread_status, + std::shared_ptr queue) { + VLOG(3) << "[" << thread_id << "]" + << " reader thread start! 
thread_id = " << thread_id; + for (auto& file : file_list) { + VLOG(3) << "[" << thread_id << "]" + << " file " << file; + } + (*thread_status)[thread_id] = Running; + VLOG(3) << "set status to running"; + + std::shared_ptr reader; + if (data_desc.file_type_ == "gzip") { + reader.reset(new MultiFileReader(file_list)); + } else if (data_desc.file_type_ == "plain") { + reader.reset(new MultiFileReader(file_list)); + } else { + PADDLE_THROW("do not support file format %s", data_desc.file_type_); + } + + VLOG(3) << "reader inited"; + + if (data_desc.file_format_ == "svm") { + ReadSvmData(data_desc, reader, queue); + } (*thread_status)[thread_id] = Stopped; VLOG(3) << "set status to stopped, thread " << thread_id << " exited"; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index a14e21bc8d2..dfdaae3a04b 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -139,7 +139,7 @@ TEST(CTR_READER, read_data) { file_list.push_back(gz_file_name); } - DataDesc data_desc(batch_size, file_list, "gzip", "plain", {}, {}, + DataDesc data_desc(batch_size, file_list, "gzip", "svm", {}, {}, sparse_slots); CTRReader reader(queue, thread_num, data_desc); -- GitLab From c583fd34acc9e02362fd2ddd4bf7adb53d8321e6 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 3 Dec 2018 09:53:24 +0800 Subject: [PATCH 0025/2367] add downpour sgd wrapper for pslib --- python/paddle/fluid/distributed/downpour.py | 34 ++++++++++++ python/paddle/fluid/distributed/node.py | 61 +++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 python/paddle/fluid/distributed/downpour.py create mode 100644 python/paddle/fluid/distributed/node.py diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py new file mode 100644 index 00000000000..523f6866687 --- /dev/null +++ b/python/paddle/fluid/distributed/downpour.py @@ -0,0 +1,34 @@ +import paddle.fluid as fluid +import pslib_pb2 as pslib +from .node import DownpourServer +from .node import DownpourWorker +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table + +class DownpourSGD(object): + def __init__(self, optimizer=opt, learning_rate=0.001, window=1): + # todo(guru4elephant): if optimizer is not None, will warning here + self.learning_rate_ = opt.learning_rate + self.window_ = window + + def minimize(self, loss, startup_program=None, + parameter_list=None, no_grad_set=None, + prefetch_slots=None, prefetch_slots_emb=None): + params_grads = sorted(append_backward(loss), key=lambda x:x[0].name) + table_name = fluid_distributed_lookup_table(loss.block.program) + server = DownpourServer() + worker = DownpourWorker() + server.add_sparse_table(0, learning_rate, + prefetch_slots, prefetch_slots_emb) + server.add_dense_table(1, learning_rate, params, grads) + worker.add_sparse_table(0, learning_rate, + prefetch_slots, prefetch_slots_emb) + worker.add_dense_table(1, learning_rate, params, grads) + + ps_param = pslib.PSParameter() + ps_param.server_param.CopyFrom(server.get_desc()) + ps_param.worker_param.CopyFrom(worker.get_desc()) + worker_skipped_ops = ["lookup_table", "lookup_table_grad"] + + return [solver_desc, parallel_desc] + + diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py new file mode 100644 index 00000000000..fc62d7220ca --- /dev/null +++ b/python/paddle/fluid/distributed/node.py @@ -0,0 +1,61 @@ +import paddle.fluid as fluid 
+import pslib_pb2 as pslib
+from functools import reduce
+
+class Server(object):
+    def __init__(self):
+        pass
+
+
+class Worker(object):
+    def __init__(self):
+        pass
+
+
+class DownpourServer(Server):
+    def __init__(self):
+        self.server_ = pslib.ServerParameter().downpour_server_param
+
+    def add_sparse_table(self, table_id, learning_rate,
+                         slot_key, slot_value_var, slot_grad_var=None):
+        table = self.server_.downpour_table_param.add()
+        table.table_id = table_id
+        table.type = pslib.PS_SPARSE_TABLE
+        table.accessor.accessor_class = "DownpourFeatureValueAccessor"
+        table.accessor.sparse_sgd_param.learning_rate = learning_rate
+        table.accessor.fea_dim = slot_value_var[0].shape[1]
+
+    def add_dense_table(self, table_id, learning_rate,
+                        param_var, grad_var):
+        table = self.server_.downpour_table_param.add()
+        table.table_id = table_id
+        table.type = pslib.PS_DENSE_TABLE
+        table.accessor.accessor_class = "DownpourDenseValueAccessor"
+        table.accessor.dense_sgd_param.adam.learning_rate = learning_rate
+        # fea_dim of a dense table is the total element count of all
+        # parameters placed in it
+        table.accessor.fea_dim = sum(
+            reduce(lambda x, y: x * y, p.shape, 1) for p in param_var)
+
+    def get_desc(self):
+        return self.server_
+
+
+class DownpourWorker(Worker):
+    def __init__(self, window):
+        self.window = window
+        self.worker_ = pslib.WorkerParameter().downpour_worker_param
+        self.worker_.pull_dense_per_batch = window
+        self.worker_.push_dense_per_batch = window
+
+    def add_sparse_table(self, table_id, learning_rate,
+                         slot_keys, slot_value_vars, slot_grad_vars=None):
+        # learning_rate is configured on the server side; it is accepted
+        # here so server and worker registration share one call shape
+        table = self.worker_.sparse_table.add()
+        table.table_id = table_id
+        table.slot.extend(slot_keys)
+        if slot_grad_vars:
+            # field name follows pslib's SparseTableParameter
+            table.slot_gradient.extend(
+                [grad.name for grad in slot_grad_vars])
+
+    def add_dense_table(self, table_id, param_vars, grad_vars):
+        table = self.worker_.dense_table.add()
+        table.table_id = table_id
+        table.dense_variable_name.extend([p.name for p in param_vars])
+        table.dense_gradient_variable_name.extend([g.name for g in grad_vars])
+
+    def get_desc(self):
+        return self.worker_
-- 
GitLab


From 9f53aad13ad840a2b49546bb5832eb74ee268687 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Mon, 3 Dec 2018 10:16:40 +0800
Subject: [PATCH 0026/2367] add test for read csv data

---
 paddle/fluid/operators/reader/ctr_reader.cc   | 144 ++++++++++++++++--
 .../fluid/operators/reader/ctr_reader_test.cc |  68 +++++++++
 2 files changed, 196 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index 9834d7183a3..3595d771b40 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -76,22 +76,6 @@ static inline void parse_line(
 // label slot1:fea_sign slot2:fea_sign slot1:fea_sign
 static inline void parse_svm_line(const std::string& line) {}
 
-// label,dense_fea,dense_fea,sparse_fea,sparse_fea
-static inline void parse_csv_line(const std::string& line,
-                                  const DataDesc& data_desc, int64_t* label,
-                                  std::vector<float>* dense_datas,
-                                  std::vector<int64_t>* sparse_datas) {
-  std::vector<std::string> ret;
-  string_split(line, ',', &ret);
-  *label = std::stol(ret[2]) > 0;
-  for (auto& idx : data_desc.dense_slot_index_) {
-    dense_datas->push_back(std::stof(ret[idx]));
-  }
-  for (auto& idx : data_desc.sparse_slot_index_) {
-    sparse_datas->push_back(std::stol(ret[idx]));
-  }
-}
-
 class Reader {
  public:
  virtual ~Reader() {}
 
@@ -234,6 +218,132 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
   }
 }
 
+// label dense_fea,dense_fea sparse_fea,sparse_fea
+static inline void parse_csv_line(
+    const std::string& line, const DataDesc& data_desc, int64_t* label,
+    std::vector<std::vector<float>>* dense_datas,
+    std::vector<std::vector<int64_t>>* sparse_datas) {
+ 
std::vector ret; + string_split(line, ' ', &ret); + *label = std::stol(ret[0]); + dense_datas->resize(data_desc.dense_slot_index_.size()); + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + int slot_idx = data_desc.dense_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(ret[slot_idx], ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + (*dense_datas)[i].push_back(std::stof(data_str)); + } + } + sparse_datas->resize(data_desc.sparse_slot_index_.size()); + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + int slot_idx = data_desc.sparse_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(ret[slot_idx], ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + (*sparse_datas)[i].push_back(std::stol(data_str)); + } + } +} + +void ReadCsvData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { + std::string line; + while (reader->HasNext()) { + std::vector batch_label; + batch_label.reserve(data_desc.batch_size_); + + std::vector>> batch_dense_data; + batch_dense_data.reserve(data_desc.batch_size_); + + std::vector>> batch_sparse_data; + batch_sparse_data.reserve(data_desc.batch_size_); + + // read batch_size data + for (int i = 0; i < data_desc.batch_size_; ++i) { + if (reader->HasNext()) { + reader->NextLine(&line); + int64_t label; + std::vector> dense_datas; + std::vector> sparse_datas; + parse_csv_line(line, data_desc, &label, &dense_datas, &sparse_datas); + batch_label.push_back(label); + if (!batch_dense_data.empty()) { + PADDLE_ENFORCE_EQ(batch_dense_data[0].size(), dense_datas.size(), + "dense data should have the same shape"); + } + batch_dense_data.push_back(dense_datas); + batch_sparse_data.push_back(sparse_datas); + } else { + break; + } + } + + // the order of output data is label, dense_datas, sparse_datas + std::vector lod_datas; + + // insert label tensor + framework::LoDTensor label_tensor; + auto* label_tensor_data = label_tensor.mutable_data( + framework::make_ddim({static_cast(batch_label.size()), 1}), + platform::CPUPlace()); + memcpy(label_tensor_data, batch_label.data(), + batch_label.size() * sizeof(int64_t)); + auto dim = + framework::make_ddim({static_cast(batch_label.size()), 1}); + lod_datas.push_back(label_tensor); + + // insert tensor for each dense_slots + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + framework::LoDTensor lod_tensor; + size_t width = batch_dense_data[0][i].size(); + auto* tensor_data = lod_tensor.mutable_data( + framework::make_ddim( + {static_cast(batch_dense_data.size()), // batch_size + static_cast(width)}), + platform::CPUPlace()); + + for (size_t j = 0; j < batch_dense_data.size(); ++j) { + auto& dense_data_row = batch_dense_data[j][i]; + memcpy(tensor_data + j * width, dense_data_row.data(), + width * sizeof(float)); + } + + lod_datas.push_back(lod_tensor); + } + + // insert tensor for each sparse_slots + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + std::vector lod_data{0}; + std::vector batch_feasign; + + for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) { + auto& sparse_ids = batch_sparse_data[row_idx][i]; + lod_data.push_back(lod_data.back() + sparse_ids.size()); + batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(), + sparse_ids.end()); + } + + framework::LoDTensor lod_tensor; + framework::LoD lod{lod_data}; + 
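// Worked example of the LoD being assembled here (values assumed): if
// the three rows of a batch carry 2, 0 and 3 ids in this sparse column,
// lod_data grows to {0, 2, 2, 5}, batch_feasign holds the five ids back
// to back, and the tensor built below gets shape {5, 1} with exactly
// that LoD attached.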
lod_tensor.set_lod(lod); + int64_t* tensor_data = lod_tensor.mutable_data( + framework::make_ddim({static_cast(batch_feasign.size()), 1}), + platform::CPUPlace()); + memcpy(tensor_data, batch_feasign.data(), + batch_feasign.size() * sizeof(int64_t)); + lod_datas.push_back(lod_tensor); + } + + queue->Push(lod_datas); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + } +} + void ReadThread(const std::vector& file_list, const DataDesc& data_desc, int thread_id, std::vector* thread_status, @@ -276,6 +386,8 @@ void ReadThread(const std::vector& file_list, if (data_desc.file_format_ == "svm") { ReadSvmData(data_desc, reader, queue); + } else if (data_desc.file_format_ == "csv") { + ReadCsvData(data_desc, reader, queue); } (*thread_status)[thread_id] = Stopped; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index dfdaae3a04b..9f3a254c84d 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -159,3 +159,71 @@ TEST(CTR_READER, read_data) { &reader); reader.Shutdown(); } + +static void GenereteCsvData(const std::string& file_name, + const std::vector& data) { + std::ofstream out(file_name.c_str()); + PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name); + for (auto& c : data) { + out << c; + } + out.close(); + PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name); +} + +static void CheckReadCsvOut(const std::vector& out) { + ASSERT_EQ(out.size(), 3); + ASSERT_EQ(out[0].dims()[1], 1); + ASSERT_EQ(out[1].dims()[1], 2); + ASSERT_EQ(out[2].dims()[1], 1); + for (size_t i = 0; i < out[0].numel(); ++i) { + int64_t label = out[0].data()[i]; + auto& dense_dim = out[1].dims(); + for (size_t j = 0; j < dense_dim[1]; ++j) { + ASSERT_EQ(out[1].data()[i * dense_dim[1] + j], + static_cast(label + 0.1)); + } + auto& sparse_lod = out[2].lod(); + for (size_t j = sparse_lod[0][i]; j < sparse_lod[0][i + 1]; ++j) { + ASSERT_EQ(out[2].data()[j], label); + } + } +} + +TEST(CTR_READER, read_csv_data) { + std::string file_name = "test_ctr_reader_data.csv"; + const std::vector csv_data = { + "0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n", + "3 3.1,3.1 3,3,3,3\n", + }; + GenereteCsvData(file_name, csv_data); + + LoDTensorBlockingQueueHolder queue_holder; + int capacity = 64; + queue_holder.InitOnce(capacity, false); + + std::shared_ptr queue = queue_holder.GetQueue(); + + int batch_size = 3; + int thread_num = 1; + std::vector file_list; + for (int i = 0; i < thread_num; ++i) { + file_list.push_back(file_name); + } + DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {}); + + CTRReader reader(queue, thread_num, data_desc); + + for (size_t i = 0; i < 2; ++i) { + reader.Start(); + std::vector out; + while (true) { + reader.ReadNext(&out); + if (out.empty()) { + break; + } + CheckReadCsvOut(out); + } + reader.Shutdown(); + } +} -- GitLab From a77fa67bbd11131f0c8c3683b903b2ceeeca41a0 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 3 Dec 2018 17:44:18 +0800 Subject: [PATCH 0027/2367] async_thread_trainer & libmct & pslib.cmake --- cmake/external/libmct.cmake | 17 +- cmake/external/pslib.cmake | 2 +- paddle/fluid/framework/async_executor.cc | 83 +++- paddle/fluid/framework/async_executor.h | 41 +- .../fluid/framework/executor_thread_worker.cc | 456 ++++++++++++++++++ .../fluid/framework/executor_thread_worker.h | 150 +++++- paddle/fluid/pybind/async_executor_py.cc | 6 +- python/paddle/fluid/async_executor.py | 13 + 8 files changed, 
745 insertions(+), 23 deletions(-) diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake index 351806f6e1a..239183cb6d0 100644 --- a/cmake/external/libmct.cmake +++ b/cmake/external/libmct.cmake @@ -40,9 +40,6 @@ SET(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR}) SET(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR}) SET(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include) -SET(LIBMCT_LIB_DIR ${LIBMCT_ROOT}/lib) -SET(LIBMCT_LIB ${LIBMCT_LIB_DIR}/libps.so) -SET(LIBMCT_IOMP_LIB ${LIBMCT_LIB_DIR}/libiomp5.so) #todo what is this SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib") INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR}) @@ -66,11 +63,15 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT} ) -ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET libmct PROPERTY IMPORTED_LOCATION ${LIBMCT_LIB}) +if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(libmct STATIC ${dummyfile}) +else() + add_library(libmct INTERFACE) +endif() + +#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL) ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) LIST(APPEND external_project_dependencies libmct) -IF(WITH_C_API) - INSTALL(FILES ${LIBMCT_LIB} ${LIBMCT_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 812af5efa20..586f66d6fdb 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -66,7 +66,7 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} ) -ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) +ADD_LIBRARY(pslib STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) LIST(APPEND external_project_dependencies pslib) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index aa76e03e838..94ed8c2fca4 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -48,6 +48,10 @@ void AsyncExecutor::CreateThreads( worker->SetDataFeed(reader); worker->SetFetchVarNames(fetch_var_names); worker->BindingDataFeedMemory(); + worker->SetPSlibPtr(_pslib_ptr); + worker->SetPullDenseThread(_pull_dense_thread); + worker->BindingSlotVariableMemory(); + worker->SetParamConfig(&_param_config); } void PrepareReaders(std::vector>& readers, // NOLINT @@ -61,6 +65,77 @@ void PrepareReaders(std::vector>& readers, // NOLINT readers[0]->SetFileList(filelist); } +void AsyncExecutor::ConfigPslib(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index) { + _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); + _pslib_ptr->init_and_config(dist_desc, host_sign_list, node_num, index);//TODO +} + +void AsyncExecutor::StartServer() { + _pslib_ptr->run_server(); +} + +void AsyncExecutor::InitModel() { + //TODO only rank = 0 do this + std::vector all_dense_table_id; //TODO + all_dense_table_id.push_back(0); + for (auto table_id: all_dense_table_id) { + std::vector regions; + std::vector variables; //TODO + for (auto& t : variables) { + Variable* var = root_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + + float* g = tensor->data(); + CHECK(g != nullptr) << "var[" << t << "] value not initialized"; + + float init_range = 0.2; + int 
rown = tensor->dims()[0]; + init_range /= sqrt(rown); + + std::normal_distribution ndistr(0.0, 1.0); + for (auto i = 0u; i < tensor->numel(); ++i) { + g[i] = ndistr(local_random_engine()) * init_range; + } + + paddle::ps::Region reg(g, tensor->numel()); + regions.emplace_back(std::move(reg)); + } + + auto push_status = _pslib_ptr->_worker_ptr->push_dense_param(regions.data(), regions.size(), table_id); + push_status.wait(); + auto status = push_status.get(); + if (status != 0) { + LOG(FATAL) << "push dense param failed, status[" << status << "]"; + exit(-1); + } + } +} + +void AsyncExecutor::SaveModel(const std::string& path) { + auto ret = _pslib_ptr->_worker_ptr->flush(); + ret.wait(); + ret = _pslib_ptr->_worker_ptr->save(path, 0); + ret.wait(); + int32_t feasign_cnt = ret.get(); + if (feasign_cnt == -1) { // TODO should be feasign_cnt < 0, because server bug + LOG(FATAL) << "save model failed"; + exit(-1); + } +} + +void AsyncExecutor::PrepareDenseThread() { + DensePullThreadParam param; + param.ps_client = _pslib_ptr->_worker_ptr;; + param.threshold = 1;//GlobalConfig::instance().pull_dense_per_batch; //TODO + param.training_thread_num = actual_thread_num; + param.root_scope = root_scope_; + //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO + + _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); + +} + void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::string& data_feed_desc_str, const std::vector& filelist, @@ -83,7 +158,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, google::protobuf::TextFormat::ParseFromString(data_feed_desc_str, &data_feed_desc); - int actual_thread_num = thread_num; + actual_thread_num = thread_num; int file_cnt = filelist.size(); PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty"); @@ -107,11 +182,11 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // todo: should be factory method for creating datafeed std::vector> readers; PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); - + PrepareDenseThread(); std::vector> workers; workers.resize(actual_thread_num); for (auto& worker : workers) { - worker.reset(new ExecutorThreadWorker); + worker.reset(new AsyncExecutorThreadWorker); } // prepare thread resource here @@ -129,7 +204,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& th : threads) { th.join(); } - + _pull_dense_thread->stop(); root_scope_->DropKids(); return; diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 6aa59c89dc4..67f4e5deeee 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -22,6 +22,8 @@ limitations under the License. */ #include // NOLINT #include #include +#include //local_random_engine +#include //local_random_engine #include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor_thread_worker.h" @@ -30,6 +32,26 @@ limitations under the License. 
*/ namespace paddle { namespace framework { + +inline double current_realtime() { + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return tp.tv_sec + tp.tv_nsec * 1e-9; +} + +inline std::default_random_engine& local_random_engine() { + struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, (unsigned long)(current_realtime() * 1000)}; + engine.seed(sseq); + } + }; + thread_local engine_wrapper_t r; + return r.engine; +} + class AsyncExecutor { public: AsyncExecutor(Scope* scope, const platform::Place& place); @@ -40,9 +62,12 @@ class AsyncExecutor { const int thread_num, const std::vector& fetch_names, const bool debug = false); - void ConfigServer() {} - void ConfigWorker() {} - void StartServer() {} + //void ConfigPslib(const char* dist_desc, uint64_t* host_sign_list, int node_num, int index); + void ConfigPslib(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index); + //void ConfigWorker() {} + void StartServer(); + void InitModel(); + void SaveModel(const std::string& path); private: void CreateThreads(ExecutorThreadWorker* worker, @@ -51,11 +76,19 @@ class AsyncExecutor { const std::vector& fetch_var_names, Scope* root_scope, const int thread_index, const bool debug); - + void PrepareDenseThread(); public: + std::shared_ptr _pslib_ptr; + std::shared_ptr _pull_dense_thread; Scope* root_scope_; platform::Place place_; + + AsyncWorkerParamConfig _param_config; + private: + int actual_thread_num; }; + + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 4e4001e979f..19d8818be74 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -31,6 +31,85 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +int DensePullThread::start() { + _running = true; + _t = std::thread(&DensePullThread::run, this); + return 0; +} + +void DensePullThread::run() { + while (_running) { + _pull_dense_status.resize(0); + for (auto& t : _dense_variable_name) { + if (check_update_param(t.first)) { + auto status = pull_dense(t.first); + _pull_dense_status.emplace_back(std::move(status)); + reset_thread_version(t.first); + } + } + if (_pull_dense_status.size() != 0) { + wait_all(); + } + + usleep(_sleep_time_ms * 1000); + } +} +bool DensePullThread::check_update_param(uint64_t table_id) { + { + std::lock_guard lock(_mutex_for_version); + auto& version = _training_versions[table_id]; + _current_version[table_id] = *(std::min_element(version.begin(), version.end())); + } + if (_current_version[table_id] - _last_versions[table_id] < _threshold) { + return false; + } + return true; +} + +void DensePullThread::reset_thread_version(uint64_t table_id) { + std::lock_guard lock(_mutex_for_version); + _last_versions[table_id] = _current_version[table_id]; +} +std::future DensePullThread::pull_dense(uint64_t table_id) { + auto& regions = _regions[table_id]; + regions.clear(); + auto& variables = _dense_variable_name[table_id]; + regions.resize(variables.size()); + + for (auto i = 0u; i < variables.size(); ++i) { + auto& t = variables[i]; + Variable* var = _root_scope->FindVar(t); + LoDTensor* tensor = var->GetMutable(); + + float* w = tensor->data(); + paddle::ps::Region reg(w, tensor->numel()); + regions[i] = std::move(reg); + } + return _ps_client->pull_dense(regions.data(), regions.size(), table_id); +} + +void DensePullThread::wait_all() { + for (auto& t : _pull_dense_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times; + } + } + + if (_pull_dense_fail_times > 20) { + LOG(FATAL) << "pull dense failed times more than 20 times"; + exit(-1); + } + + _pull_dense_status.resize(0); +} + +void DensePullThread::increase_thread_version(int thread_id, uint64_t table_id) { + std::lock_guard lock(_mutex_for_version); + _training_versions[table_id][thread_id]++; +} + void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { auto& block = program.Block(0); op_names_.clear(); @@ -90,6 +169,11 @@ void ExecutorThreadWorker::SetFetchVarNames( fetch_var_names.end()); } +void ExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { + +} + + void ExecutorThreadWorker::SetDevice() { #if defined _WIN32 || defined __APPLE__ return; @@ -219,5 +303,377 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { root_scope_ = g_scope; } +//AsyncExecutor +void AsyncExecutorThreadWorker::TrainFiles() { + SetDevice(); + + int fetch_var_num = fetch_var_names_.size(); + fetch_values_.clear(); + fetch_values_.resize(fetch_var_num); + + thread_reader_->Start(); + + int cur_batch; + int batch_cnt = 0; + while ((cur_batch = thread_reader_->Next()) > 0) { + // executor run here + TrainOneNetwork(); + + ++batch_cnt; + thread_scope_->DropKids(); + + if (debug_ == false || thread_id_ != 0) { + continue; + } + + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } // end for (int i = 0...) 
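      // Per-batch summary: Next() filled the feed slots, TrainOneNetwork()
      // pulled sparse parameters, ran the ops and pushed gradients, and
      // DropKids() released the per-batch scopes; the fetch printing above
      // is debug-only and limited to thread 0.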
+ } // end while () +} + +void AsyncExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { + _pslib_ptr = pslib_ptr; +} +void AsyncExecutorThreadWorker::SetPullDenseThread(std::shared_ptr dpt) { + _pull_dense_thread = dpt; +} +void AsyncExecutorThreadWorker::TrainOneNetwork() { + PrepareParams(); + + for (auto& op : ops_) { + if (op->Type().find("sgd") != std::string::npos) { + continue; + } + op->Run(*thread_scope_, place_); + } + + UpdateParams(); +} + +void AsyncExecutorThreadWorker::BindingSlotVariableMemory() { + /* + std::vector ins_slot_offset(batch_size + 1, 0); + for (auto i = 1u; i <= batch_size; ++i) { + ins_slot_offset[i] += ins_slot_offset[i - 1] + slot_dim; + } + + std::vector tensor_lod(batch_size + 1, 0); + for (auto i = 1u; i <= batch_size; ++i) { + tensor_lod[i] += tensor_lod[i - 1] + 1; + } + + auto& used_slots = reader->get_use_slot_alias(); + slot_input_vec.resize(used_slots.size() - 1); + for (auto slot_idx = 1u; slot_idx < used_slots.size(); ++slot_idx) { + auto var = slot_input_variable_name[slot_idx]; + + auto v = thread_scope->FindVar(var); + CHECK(v != nullptr) << "var[" << var << "] not found"; + + LoDTensor* tensor = v->GetMutable(); + float* tensor_ptr = tensor->mutable_data({batch_size, slot_dim}, platform::CPUPlace()); + memset(tensor_ptr, 0, sizeof(float) * ins_slot_offset.back()); + + LoD data_lod{tensor_lod}; + tensor->set_lod(data_lod); + + slot_input_vec[slot_idx - 1].reset(tensor); + } + */ +} +void AsyncExecutorThreadWorker::SetParamConfig(AsyncWorkerParamConfig* pc) { + _param_config = pc; +} + +void AsyncExecutorThreadWorker::PrepareParams() { + int table_id = 0; //TODO + PullSparse(table_id); + for (auto& t : _pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "pull sparse failed, status[" << status << "]"; + exit(-1); + } + } + _pull_sparse_status.resize(0); + + FillSparse(table_id); +} + +void AsyncExecutorThreadWorker::UpdateParams() { + //for (auto i = 0u; i < GlobalConfig::instance().dense_table_id.size(); ++i) {//TODO + for (int i = 0; i < 1; ++i) { + PushSparse(i); + } + //for (auto i = 0u; i < GlobalConfig::instance().dense_table_id.size(); ++i) {//TODO + for (int i = 1; i < 2; ++i) { + PushDense(i); + } + int32_t tmp_push_dense_wait_times = _param_config->tmp_push_dense_wait_times; //TODO + int32_t tmp_push_sparse_wait_times = _param_config->tmp_push_sparse_wait_times; //TODO + static uint32_t push_dense_wait_times = static_cast(tmp_push_dense_wait_times); + static uint32_t push_sparse_wait_times = static_cast(tmp_push_sparse_wait_times); + + if (_push_dense_status.size() >= push_dense_wait_times) { + for (auto& t : _push_dense_status) { + t.wait(); + } + _push_dense_status.resize(0); + } + if (tmp_push_dense_wait_times == -1) { + _push_dense_status.resize(0); + } + + if (_push_sparse_status.size() >= push_sparse_wait_times) { + for (auto& t : _push_sparse_status) { + t.wait(); + } + _push_sparse_status.resize(0); + } + if (tmp_push_sparse_wait_times == -1) { + _push_sparse_status.resize(0); + } + + //for (auto dense_table_id : GlobalConfig::instance().dense_table_id) {//TODO + int dense_table_id = 1; + _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + //} +} + +void AsyncExecutorThreadWorker::PushDense(int table_id) { + //auto table_id = GlobalConfig::instance().dense_table_id[table_id_index]; TODO + + std::vector regions; + //auto& variables = GlobalConfig::instance().dense_gradient_variable_name[table_id]; + std::vector variables; + for (auto& t : variables) { 
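    // Each dense gradient tensor is wrapped as a paddle::ps::Region that
    // aliases its buffer, so the push_dense() call after this loop ships
    // the whole table's gradients in one request, without copying.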
+ Variable* var = thread_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g = tensor->data(); + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + + auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(), regions.size(), table_id); + _push_dense_status.push_back(std::move(status)); + +} + +void AsyncExecutorThreadWorker::PullSparse(int table_id) { + + + auto& features = _features[table_id]; + auto& feature_value = _feature_value[table_id]; + auto fea_dim = _param_config->fea_dim; //TODO + // slot id starts from 1 + features.clear(); + features.resize(0); + features.reserve(MAX_FEASIGN_NUM); + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + //todo: current trick - filter feasign=use_slot_mod(bug: datafeed fill use_slot_mod for empty slot) + if (ids[i] == 0u) { + continue; + } + features.push_back(static_cast(ids[i])); + } + } + + check_pull_push_memory(features, feature_value, fea_dim); + + std::vector pull_feature_value; + for (auto i = 0u; i < features.size(); ++i) { + pull_feature_value.push_back(feature_value[i].data()); + } + + auto status = _pslib_ptr->_worker_ptr->pull_sparse( + pull_feature_value.data(), table_id, features.data(), features.size()); + _pull_sparse_status.push_back(std::move(status)); + + //to save time + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, push_g, fea_dim); + + //binding_slot_embed_with_concat(); TODO + collect_feasign_info(table_id); //TODO +} + +void AsyncExecutorThreadWorker::FillSparse(int table_id) { + auto slot_dim = _param_config->slot_dim; // TODO + auto fea_dim = _param_config->fea_dim; //TODO + auto& features = _features[table_id]; + auto& fea_value = _feature_value[table_id]; + + CHECK(features.size() > 0) << "feature size check failed"; + + auto fea_idx = 0u; + + std::vector init_value(fea_dim); + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + + Variable* var_emb = thread_scope_->FindVar(_param_config->slot_input_vec[slot_idx - 1]); + LoDTensor* tensor_emb = var_emb->GetMutable(); + float* ptr = tensor_emb->data(); + + for (auto index = 0u; index < len; ++index){ + //if (_current_train_job.use_cvm_feature()) { + // if (ids[index] == 0u) { + // memcpy(ptr + slot_dim * index, init_value.data(), sizeof(float) * slot_dim); + // continue; + // } + // memcpy(ptr + slot_dim * index, fea_value[fea_idx].data(), sizeof(float) * slot_dim); + // (ptr + slot_dim * index)[0] = log((ptr + slot_dim * index)[0] + 1); + // (ptr + slot_dim * index)[1] = log((ptr + slot_dim * index)[1] + 1) - (ptr + slot_dim * index)[0]; + // fea_idx++; + //} else { + if (ids[index] == 0u) { + memcpy(ptr + slot_dim * index, init_value.data() + 2, sizeof(float) * slot_dim); + continue; + } + memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); + fea_idx++; + //} + } + 
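      // Note on the "+ 2" offsets above: judging by the commented-out
      // use_cvm_feature branch, the first two floats of every value pulled
      // from pslib appear to be show/click (CVM) statistics, so the
      // slot_dim-wide embedding itself starts at element 2.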
+  }
+}
+
+void AsyncExecutorThreadWorker::PushSparse(int table_id) {
+  auto slot_dim = _param_config->slot_dim;  // TODO
+  auto fea_dim = _param_config->fea_dim;    // _current_train_job.fea_dim();  TODO
+  auto& features = _features[table_id];
+  //std::vector<std::string> gradient_var;
+  //auto& gradient_var = GlobalConfig::instance().input_gradient_variable_name;  // TODO
+  auto& push_g = _feature_push_value[table_id];
+  check_pull_push_memory(features, push_g, fea_dim);
+  uint64_t fea_idx = 0u;
+  auto& fea_info = _fea_info[table_id];  // TODO
+  int offset = 0;
+  //if (!_current_train_job.use_cvm_feature()) {  // TODO
+  offset = 2;
+  //}
+
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
+
+  // slot_idx = 0 is the label slot, so start from 1  TODO
+  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
+    if (_slot_alias_to_table[feed_vec[slot_idx]] != table_id) {
+      continue;
+    }
+    Variable* g_var =
+        thread_scope_->FindVar(_param_config->gradient_var[slot_idx - 1]);
+    LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
+    //int count = g_tensor->numel();
+    float* g = g_tensor->data<float>();
+    /*
+    if (FLAGS_scale_sparse_gradient_with_batch_size) {
+      Eigen::Map<Eigen::MatrixXf> g_mat(g, 1, tensor->numel());
+      g_mat *= _batch_size;
+    }
+    */
+
+    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int len = tensor->lod()[0].back();
+    //assert(slot_dim * len == count);
+    int64_t* ids = tensor->data<int64_t>();
+    for (auto id_idx = 0u; id_idx < len; ++id_idx) {
+      if (ids[id_idx] == 0) {
+        g += slot_dim;
+        continue;
+      }
+      memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim);
+      push_g[fea_idx][0] = 1.0f;
+      push_g[fea_idx][1] = static_cast<float>(fea_info[fea_idx].label);
+      g += slot_dim;
+      fea_idx++;
+    }
+  }
+  assert(fea_idx == features.size());
+  CHECK(features.size() > 0);
+
+  std::vector<float*> push_g_vec;
+  for (auto i = 0u; i < features.size(); ++i) {
+    push_g_vec.push_back(push_g[i].data());
+  }
+  auto status = _pslib_ptr->_worker_ptr->push_sparse(
+      table_id, features.data(), (const float**)push_g_vec.data(),
+      features.size());
+  _push_sparse_status.push_back(std::move(status));
+}
+
+void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) {
+  auto& fea_info = _fea_info[table_id];
+  auto& feature = _features[table_id];
+  fea_info.resize(feature.size());
+
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
+  Variable* var = thread_scope_->FindVar(feed_vec[0]);
+  LoDTensor* tensor = var->GetMutable<LoDTensor>();
+  int64_t* label = tensor->data<int64_t>();
+
+  int global_index = 0;
+  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
+    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int64_t* ids = tensor->data<int64_t>();
+
+    int fea_idx = 0;
+    for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) {
+      for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) {
+        if (ids[fea_idx] == 0u) {
+          continue;
+        }
+        FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]};
+        fea_info[global_index++] = std::move(info);
+      }
+    }
+  }
+  CHECK(global_index == feature.size())
+      << "expect fea info size: " << feature.size()
+      << " real: " << global_index;
+}
+
+void AsyncExecutorThreadWorker::check_pull_push_memory(
+    std::vector<uint64_t>& features,
+    std::vector<std::vector<float>>& push_g,
+    int dim) {
+  push_g.resize(features.size() + 1);
+  for (auto& t : push_g) {
+    t.resize(dim);
+  }
+}
+
+void AsyncExecutorThreadWorker::check_pull_push_memory(
+    std::vector<uint64_t>& features,
+    std::vector<float*>& push_g,
+    int dim) {
+  if (features.size() > push_g.size()) {
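+    // One dim-sized float buffer is needed per pulled feature sign; grow the
+    // raw-pointer list with one spare entry, mirroring the
+    // std::vector<std::vector<float>> overload above. The buffers allocated
+    // below are not freed here, so they are assumed to live as long as the
+    // worker itself.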
+    push_g.reserve(features.size() + 1);
+    auto size = features.size() - push_g.size() + 1;
+    for (auto i = 0u; i < size; ++i) {
+      float* ptr = new float[dim];
+      push_g.push_back(ptr);
+    }
+  }
+}
+
 }  // end namespace framework
 }  // end namespace paddle
diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h
index 13ec2442c46..63f383cd479 100644
--- a/paddle/fluid/framework/executor_thread_worker.h
+++ b/paddle/fluid/framework/executor_thread_worker.h
@@ -25,16 +25,107 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
+#include "pslib.h"
 
 namespace paddle {
 namespace framework {
+
+static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100;
+
 void CreateTensor(Variable* var, proto::VarType::Type var_type);
+
+struct AsyncWorkerParamConfig {
+  int slot_dim;
+  int fea_dim;
+  int32_t tmp_push_dense_wait_times;
+  int32_t tmp_push_sparse_wait_times;
+
+  std::vector<std::string> slot_input_vec;  // slot input variable names, e.g. 6048slot, 6050slot
+  std::vector<std::string> gradient_var;    // e.g. 6048slot_embed
+};
+
+struct DensePullThreadParam {
+  std::shared_ptr<paddle::ps::PSClient> ps_client;
+  int threshold;
+  int training_thread_num;
+  Scope* root_scope;
+  std::map<uint64_t, std::vector<std::string>>* dense_params;
+  int sleep_time_ms = 2;
+};
+
+class DensePullThread {
+ public:
+  explicit DensePullThread(DensePullThreadParam& param) : _running(false) {
+    _ps_client = param.ps_client;
+    _threshold = param.threshold;
+    _thread_num = param.training_thread_num;
+    _root_scope = param.root_scope;
+    _sleep_time_ms = param.sleep_time_ms;
+
+    for (auto& t : *param.dense_params) {
+      _dense_variable_name[t.first].insert(
+          _dense_variable_name[t.first].end(),
+          t.second.begin(), t.second.end());
+      _training_versions[t.first].resize(_thread_num, 0);
+      _last_versions[t.first] = 0;
+      _current_version[t.first] = 0;
+    }
+  }
+
+  int start();
+
+  void stop() {
+    if (_running) {
+      _running = false;
+      _t.join();
+    }
+  }
+
+  void increase_thread_version(int thread_id, uint64_t table_id);
+  void reset_thread_version(uint64_t table_id);
+  std::future<int32_t> pull_dense(uint64_t table_id);
+  void pull_dense2(uint64_t table_id);
+  void wait_all();
+
+ private:
+  void run();
+  bool check_update_param(uint64_t table_id);
+
+ private:
+  std::shared_ptr<paddle::ps::PSClient> _ps_client;
+  int _thread_num;
+  int _threshold;
+  int _sleep_time_ms;
+  Scope* _root_scope;
+  bool _running;
+
+  std::map<uint64_t, uint64_t> _last_versions;
+  std::map<uint64_t, uint64_t> _current_version;
+  std::mutex _mutex_for_version;
+  std::map<uint64_t, std::vector<uint64_t>> _training_versions;
+  std::map<uint64_t, std::vector<std::string>> _dense_variable_name;
+
+  std::thread _t;
+
+  std::vector<::std::future<int32_t>> _pull_dense_status;
+
+  std::map<uint64_t, std::vector<paddle::ps::Region>> _regions;
+  uint32_t _pull_dense_fail_times = 0;
+
+  std::vector<float> _base_norm_param;
+  std::vector<float> _mean;
+  std::vector<float> _scale;
+  float _squared_sum_epsilon = 1e-4;
+  std::mutex _mutex_for_mean_scale;
+
+  float _total_batch_num = 0;
+};
 
 class ExecutorThreadWorker {
  public:
   ExecutorThreadWorker()
       : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
-  ~ExecutorThreadWorker() {}
+  virtual ~ExecutorThreadWorker() {}
 
   void CreateThreadResource(const framework::ProgramDesc& program,
                             const paddle::platform::Place& place);
@@ -51,10 +142,13 @@ class ExecutorThreadWorker {
   // set data feed declared in executor
   void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
   // A multi-thread training function
-  void TrainFiles();
+  virtual void TrainFiles();
   // set fetch variable names from python interface assigned by users
   void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
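+  // The virtual hooks below default to no-ops; AsyncExecutorThreadWorker is
+  // expected to override them to inject pslib state (client handle,
+  // dense-pull thread, parameter config) without changing how callers drive
+  // the base worker.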
-
+  virtual void SetPSlibPtr(std::shared_ptr<paddle::distributed::PSlib> pslib_ptr);
+  virtual void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt) {}
+  virtual void BindingSlotVariableMemory() {}
+  virtual void SetParamConfig(AsyncWorkerParamConfig* pc) {}
 private:
  void CreateThreadScope(const framework::ProgramDesc& program);
  void CreateThreadOperators(const framework::ProgramDesc& program);
@@ -77,12 +171,58 @@ class ExecutorThreadWorker {
  Scope* root_scope_;
  // a thread scope, its parent is the global scope which is shared
  Scope* thread_scope_;
-
- private:
+  //private:
  std::vector<std::string> fetch_var_names_;
  std::vector<std::vector<float>> fetch_values_;
  bool debug_;
};
+
+class AsyncExecutorThreadWorker : public ExecutorThreadWorker {
+ public:
+  AsyncExecutorThreadWorker() {}
+  virtual ~AsyncExecutorThreadWorker() {}
+  void SetPSlibPtr(std::shared_ptr<paddle::distributed::PSlib> pslib_ptr);
+  void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt);
+  void BindingSlotVariableMemory();
+  void SetParamConfig(AsyncWorkerParamConfig* pc);
+  void TrainFiles();
+  void TrainOneNetwork();
+  void PrepareParams();
+  void UpdateParams();
+  void PullSparse(int table_id);
+  void FillSparse(int table_id);
+  void PushSparse(int table_id);
+  void PushDense(int table_id);
+
+  void check_pull_push_memory(std::vector<uint64_t>& features,
+                              std::vector<float*>& push_g, int dim);
+  void check_pull_push_memory(std::vector<uint64_t>& features,
+                              std::vector<std::vector<float>>& push_g,
+                              int dim);
+  void collect_feasign_info(int table_id);
+
+ private:
+  struct FeasignInfo {
+    uint32_t slot;
+    uint32_t ins;
+    int64_t label;
+  };
+
+  std::map<uint64_t, std::vector<uint64_t>> _features;
+  std::map<uint64_t, std::vector<FeasignInfo>> _fea_info;
+  std::map<uint64_t, std::vector<std::vector<float>>> _feature_value;
+  std::map<uint64_t, std::vector<std::vector<float>>> _feature_push_value;
+
+  std::unordered_map<std::string, uint64_t> _slot_alias_to_table;  // TODO
+
+  std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
+
+  std::shared_ptr<DensePullThread> _pull_dense_thread;
+
+  std::vector<::std::future<int32_t>> _pull_sparse_status;
+  std::vector<::std::future<int32_t>> _pull_dense_status;
+  std::vector<::std::future<int32_t>> _push_sparse_status;
+  std::vector<::std::future<int32_t>> _push_dense_status;
+
+  AsyncWorkerParamConfig* _param_config;
+};
+
 } // namespace framework
 } // namespace paddle
diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc
index 470e8b05080..63fd06224f0 100644
--- a/paddle/fluid/pybind/async_executor_py.cc
+++ b/paddle/fluid/pybind/async_executor_py.cc
@@ -47,7 +47,11 @@ void BindAsyncExecutor(py::module* m) {
        return std::unique_ptr<framework::AsyncExecutor>(
            new framework::AsyncExecutor(scope, place));
      }))
-      .def("run_from_files", &framework::AsyncExecutor::RunFromFile);
+      .def("run_from_files", &framework::AsyncExecutor::RunFromFile)
+      .def("config_pslib", &framework::AsyncExecutor::ConfigPslib)
+      .def("start_server", &framework::AsyncExecutor::StartServer)
+      .def("init_model", &framework::AsyncExecutor::InitModel)
+      .def("save_model", &framework::AsyncExecutor::SaveModel);
 }  // end BindAsyncExecutor
 }  // end namespace pybind
 }  // end namespace paddle
diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py
index 2664a7301db..2945e6e1436 100644
--- a/python/paddle/fluid/async_executor.py
+++ b/python/paddle/fluid/async_executor.py
@@ -149,3 +149,16 @@ class AsyncExecutor(object):
         self.executor.run_from_files(program_desc,
                                      data_feed.desc(), filelist, thread_num,
                                      fetch_var_names, debug)
+
+    def config_ps(self, dist_desc, host_sign_list, node_num, index):
+        self.executor.config_pslib(dist_desc, host_sign_list, node_num, index)
+
+    def start_server(self):
+        self.executor.start_server()
+
+    def init_model(self):
+        self.executor.init_model()
+
+    def save_model(self, save_path):
+        self.executor.save_model(save_path)
-- 
GitLab

From e650b42914eca57c8d5a9f743e10788d9cc39828 Mon Sep 17 00:00:00 2001
From: heqiaozhi
Date: Mon, 3 Dec 2018 17:47:13 +0800
Subject: [PATCH 0028/2367] async_thread_trainer & libmct & pslib.cmake

---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c929396fff..6fd8dd1dfaf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -217,7 +217,7 @@ include(cupti)
 include(external/gzstream)
 endif (NOT WIN32)
 include(external/libmct)
-include(external/pslib_brpc)
+#include(external/pslib_brpc)
 include(external/pslib)
 
 if(WITH_DISTRIBUTE)
@@ -280,6 +280,8 @@ set(EXTERNAL_LIBS
     zlib
     ${PYTHON_LIBRARIES}
     pslib
+    #pslib_brpc
+    libmct
 )
 
 if(WITH_AMD_GPU)
-- 
GitLab

From ee4c51a372be97076f06cc7c61f624c3b65b501e Mon Sep 17 00:00:00 2001
From: dongdaxiang
Date: Mon, 3 Dec 2018 18:00:21 +0800
Subject: [PATCH 0029/2367] refine downpour sgd API with pslib

---
 python/paddle/fluid/distributed/__init__.py |    0
 python/paddle/fluid/distributed/downpour.py |   30 +-
 python/paddle/fluid/distributed/node.py     |   30 +-
 python/paddle/fluid/distributed/ps_pb2.py   | 1491 +++++++++++++++++++
 4 files changed, 1526 insertions(+), 25 deletions(-)
 create mode 100644 python/paddle/fluid/distributed/__init__.py
 create mode 100644 python/paddle/fluid/distributed/ps_pb2.py

diff --git a/python/paddle/fluid/distributed/__init__.py b/python/paddle/fluid/distributed/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py
index 523f6866687..551a4714950 100644
--- a/python/paddle/fluid/distributed/downpour.py
+++ b/python/paddle/fluid/distributed/downpour.py
@@ -1,34 +1,32 @@
-import paddle.fluid as fluid
-import pslib_pb2 as pslib
 from .node import DownpourServer
 from .node import DownpourWorker
+from ..backward import append_backward
+import ps_pb2 as pslib
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
+from google.protobuf import text_format
 
 class DownpourSGD(object):
-    def __init__(self, optimizer=opt, learning_rate=0.001, window=1):
+    def __init__(self, learning_rate=0.001, window=1):
         # todo(guru4elephant): warn here if an optimizer is passed in
-        self.learning_rate_ = opt.learning_rate
+        self.learning_rate_ = learning_rate
         self.window_ = window
 
-    def minimize(self, loss, startup_program=None,
-                 parameter_list=None, no_grad_set=None,
+    def minimize(self, loss, startup_program=None,
+                 parameter_list=None, no_grad_set=None,
                  prefetch_slots=None, prefetch_slots_emb=None):
         params_grads = sorted(append_backward(loss), key=lambda x: x[0].name)
-        table_name = fluid_distributed_lookup_table(loss.block.program)
+        table_name = find_distributed_lookup_table(loss.block.program)
+        params = [p for p, g in params_grads]
+        grads = [g for p, g in params_grads]
         server = DownpourServer()
-        worker = DownpourWorker()
+        worker = DownpourWorker(self.window_)
-        server.add_sparse_table(0, learning_rate,
+        server.add_sparse_table(0, self.learning_rate_,
                                 prefetch_slots, prefetch_slots_emb)
         server.add_dense_table(1, self.learning_rate_, params, grads)
-        worker.add_sparse_table(0, learning_rate,
+        worker.add_sparse_table(0, self.learning_rate_,
                                 prefetch_slots, prefetch_slots_emb)
         worker.add_dense_table(1, self.learning_rate_, params, grads)
-
         ps_param = pslib.PSParameter()
         ps_param.server_param.CopyFrom(server.get_desc())
-        ps_param.worker_param.CopyFrom(worker.get_desc())
+        #ps_param.worker_param.CopyFrom(worker.get_desc())
         worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
-
-        return [solver_desc, parallel_desc]
-
-
+        ps_param_str = text_format.MessageToString(ps_param)
+        return [ps_param_str, worker_skipped_ops]
diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py
index fc62d7220ca..3344bba137e 100644
--- a/python/paddle/fluid/distributed/node.py
+++ b/python/paddle/fluid/distributed/node.py
@@ -1,5 +1,4 @@
-import paddle.fluid as fluid
-import pslib_pb2 as pslib
+import ps_pb2 as pslib
 
 class Server(object):
     def __init__(self):
@@ -13,11 +12,13 @@ class Worker(object):
 
 class DownpourServer(Server):
     def __init__(self):
-        self.server_ = pslib.ServerParameter().downpour_server_param
+        self.server_ = pslib.ServerParameter()
 
     def add_sparse_table(self, table_id, learning_rate,
                          slot_key, slot_value_var, slot_grad_var):
-        table = self.server_.downpour_table_param.add()
+        table = self.server_.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
         table.type = pslib.PS_SPARSE_TABLE
         table.accessor.accessor_class = "DownpourFeatureValueAccessor"
@@ -26,12 +27,14 @@ class DownpourServer(Server):
 
     def add_dense_table(self, table_id, learning_rate,
                         param_var, grad_var):
-        table = self.server_.downpour_table_param.add()
+        table = self.server_.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
         table.type = pslib.PS_DENSE_TABLE
         table.accessor.accessor_class = "DownpourDenseValueAccessor"
         table.accessor.sparse_sgd_param.learning_rate = learning_rate
-        table.accessor.fea_dim = reduce(lambda x, y: x.shape, 1 for x in param_var)
+        table.accessor.fea_dim = 1
 
     def get_desc(self):
         return self.server_
@@ -40,19 +43,28 @@ class DownpourServer(Server):
 
 class DownpourWorker(Worker):
     def __init__(self, window):
         self.window = window
-        self.worker_ = pslib.WorkerParameter().downpour_worker_param
+        self.worker_ = pslib.DownpourTrainerParameter()
         self.worker_.pull_dense_per_batch = window
         self.worker_.push_dense_per_batch = window
+        print(self.worker_)
 
     def add_sparse_table(self, table_id,
                          slot_keys, slot_value_vars, slot_grad_vars):
         table = self.worker_.sparse_table.add()
         table.table_id = table_id
-        table.slot.extend(slot_keys)
-        self.worker_.extend([grad.name for grad in slot_grad_vars])
+        table.slot_key.extend(slot_keys)
+        table.slot_gradient.extend([grad.name for grad in slot_grad_vars])
 
     def add_dense_table(self, table_id, param_vars, grad_vars):
         table = self.worker_.dense_table.add()
         table.table_id = table_id
         table.dense_variable_name.extend([p.name for p in param_vars])
         table.dense_gradient_variable_name.extend([g.name for g in grad_vars])
diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py
new file mode 100644
index 00000000000..355841aba8f
--- /dev/null
+++ b/python/paddle/fluid/distributed/ps_pb2.py
@@ -0,0 +1,1491 @@
+# Generated by the protocol buffer compiler. DO NOT EDIT!
+# source: ps.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='ps.proto', + package='paddle', + syntax='proto2', + serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\xe4\x01\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x02 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x03 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x04 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x05 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\x91\x01\n\x16ServerServiceParameter\x12\x14\n\x0cserver_class\x18\x01 \x01(\t\x12\x14\n\x0c\x63lient_class\x18\x02 \x01(\t\x12\x15\n\rservice_class\x18\x03 \x01(\t\x12\x19\n\x11start_server_port\x18\x04 \x01(\r\x12\x19\n\x11server_thread_num\x18\x05 \x01(\r\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 
\x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') +) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +_TABLETYPE = _descriptor.EnumDescriptor( + name='TableType', + 
full_name='paddle.TableType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_SPARSE_TABLE', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_DENSE_TABLE', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=3140, + serialized_end=3192, +) +_sym_db.RegisterEnumDescriptor(_TABLETYPE) + +TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE) +_PSCMDID = _descriptor.EnumDescriptor( + name='PsCmdID', + full_name='paddle.PsCmdID', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_PULL_DENSE_TABLE', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_TABLE', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PULL_SPARSE_TABLE', index=2, number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_SPARSE_TABLE', index=3, number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SHRINK_TABLE', index=4, number=4, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ONE_TABLE', index=5, number=5, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ALL_TABLE', index=6, number=6, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ONE_TABLE', index=7, number=7, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ALL_TABLE', index=8, number=8, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ONE_TABLE', index=9, number=9, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ALL_TABLE', index=10, number=10, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_PARAM', index=11, number=11, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_STOP_SERVER', index=12, number=12, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=3195, + serialized_end=3512, +) +_sym_db.RegisterEnumDescriptor(_PSCMDID) + +PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID) +PS_SPARSE_TABLE = 0 +PS_DENSE_TABLE = 1 +PS_PULL_DENSE_TABLE = 0 +PS_PUSH_DENSE_TABLE = 1 +PS_PULL_SPARSE_TABLE = 2 +PS_PUSH_SPARSE_TABLE = 3 +PS_SHRINK_TABLE = 4 +PS_SAVE_ONE_TABLE = 5 +PS_SAVE_ALL_TABLE = 6 +PS_LOAD_ONE_TABLE = 7 +PS_LOAD_ALL_TABLE = 8 +PS_CLEAR_ONE_TABLE = 9 +PS_CLEAR_ALL_TABLE = 10 +PS_PUSH_DENSE_PARAM = 11 +PS_STOP_SERVER = 12 + + +_FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( + name='FsApiType', + full_name='paddle.FsClientParameter.FsApiType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='HDFS', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='AFS', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=3108, + serialized_end=3138, +) +_sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) + + +_PSPARAMETER = _descriptor.Descriptor( + name='PSParameter', + full_name='paddle.PSParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='worker_class', full_name='paddle.PSParameter.worker_class', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, 
default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_class', full_name='paddle.PSParameter.server_class', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='instance_class', full_name='paddle.PSParameter.instance_class', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='worker_param', full_name='paddle.PSParameter.worker_param', index=3, + number=101, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_param', full_name='paddle.PSParameter.server_param', index=4, + number=102, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fs_client_param', full_name='paddle.PSParameter.fs_client_param', index=5, + number=501, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=21, + serialized_end=249, +) + + +_WORKERPARAMETER = _descriptor.Descriptor( + name='WorkerParameter', + full_name='paddle.WorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_worker_param', full_name='paddle.WorkerParameter.downpour_worker_param', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=251, + serialized_end=332, +) + + +_SERVERPARAMETER = _descriptor.Descriptor( + name='ServerParameter', + full_name='paddle.ServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_server_param', full_name='paddle.ServerParameter.downpour_server_param', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=334, + serialized_end=415, +) + + +_DOWNPOURWORKERPARAMETER = _descriptor.Descriptor( + 
name='DownpourWorkerParameter', + full_name='paddle.DownpourWorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', full_name='paddle.DownpourWorkerParameter.downpour_table_param', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=417, + serialized_end=496, +) + + +_DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( + name='DownpourTrainerParameter', + full_name='paddle.DownpourTrainerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='dense_table', full_name='paddle.DownpourTrainerParameter.dense_table', index=0, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_table', full_name='paddle.DownpourTrainerParameter.sparse_table', index=1, + number=3, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='pull_dense_per_batch', full_name='paddle.DownpourTrainerParameter.pull_dense_per_batch', index=2, + number=4, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_dense_per_batch', full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', index=3, + number=5, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=499, + serialized_end=687, +) + + +_DENSETABLEPARAMETER = _descriptor.Descriptor( + name='DenseTableParameter', + full_name='paddle.DenseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', full_name='paddle.DenseTableParameter.table_id', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_variable_name', full_name='paddle.DenseTableParameter.dense_variable_name', index=1, + number=2, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_gradient_variable_name', full_name='paddle.DenseTableParameter.dense_gradient_variable_name', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', full_name='paddle.DenseTableParameter.fea_dim', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=689, + serialized_end=812, +) + + +_SPARSETABLEPARAMETER = _descriptor.Descriptor( + name='SparseTableParameter', + full_name='paddle.SparseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', full_name='paddle.SparseTableParameter.table_id', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='feature_dim', full_name='paddle.SparseTableParameter.feature_dim', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_key', full_name='paddle.SparseTableParameter.slot_key', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_value', full_name='paddle.SparseTableParameter.slot_value', index=3, + number=4, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_gradient', full_name='paddle.SparseTableParameter.slot_gradient', index=4, + number=5, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=814, + serialized_end=936, +) + + +_DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( + name='DownpourServerParameter', + full_name='paddle.DownpourServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', full_name='paddle.DownpourServerParameter.downpour_table_param', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_param', full_name='paddle.DownpourServerParameter.service_param', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + 
nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=939, + serialized_end=1073, +) + + +_SERVERSERVICEPARAMETER = _descriptor.Descriptor( + name='ServerServiceParameter', + full_name='paddle.ServerServiceParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='server_class', full_name='paddle.ServerServiceParameter.server_class', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_class', full_name='paddle.ServerServiceParameter.client_class', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_class', full_name='paddle.ServerServiceParameter.service_class', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='start_server_port', full_name='paddle.ServerServiceParameter.start_server_port', index=3, + number=4, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_thread_num', full_name='paddle.ServerServiceParameter.server_thread_num', index=4, + number=5, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1076, + serialized_end=1221, +) + + +_TABLEPARAMETER = _descriptor.Descriptor( + name='TableParameter', + full_name='paddle.TableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', full_name='paddle.TableParameter.table_id', index=0, + number=1, type=4, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_class', full_name='paddle.TableParameter.table_class', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='shared_num', full_name='paddle.TableParameter.shared_num', index=2, + number=3, type=4, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='accessor', full_name='paddle.TableParameter.accessor', index=3, + 
number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', full_name='paddle.TableParameter.type', index=4, + number=5, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='compress_in_save', full_name='paddle.TableParameter.compress_in_save', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1224, + serialized_end=1415, +) + + +_TABLEACCESSORPARAMETER = _descriptor.Descriptor( + name='TableAccessorParameter', + full_name='paddle.TableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='accessor_class', full_name='paddle.TableAccessorParameter.accessor_class', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_sgd_param', full_name='paddle.TableAccessorParameter.sparse_sgd_param', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_sgd_param', full_name='paddle.TableAccessorParameter.dense_sgd_param', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', full_name='paddle.TableAccessorParameter.fea_dim', index=3, + number=4, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_dim', full_name='paddle.TableAccessorParameter.embedx_dim', index=4, + number=5, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_threshold', full_name='paddle.TableAccessorParameter.embedx_threshold', index=5, + number=6, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='downpour_accessor_param', full_name='paddle.TableAccessorParameter.downpour_accessor_param', index=6, + number=7, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, 
extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_accessor_save_param', full_name='paddle.TableAccessorParameter.table_accessor_save_param', index=7, + number=8, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1418, + serialized_end=1787, +) + + +_DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( + name='DownpourTableAccessorParameter', + full_name='paddle.DownpourTableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='nonclk_coeff', full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff', index=0, + number=1, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='click_coeff', full_name='paddle.DownpourTableAccessorParameter.click_coeff', index=1, + number=2, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='base_threshold', full_name='paddle.DownpourTableAccessorParameter.base_threshold', index=2, + number=3, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_threshold', full_name='paddle.DownpourTableAccessorParameter.delta_threshold', index=3, + number=4, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_keep_days', full_name='paddle.DownpourTableAccessorParameter.delta_keep_days', index=4, + number=5, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='show_click_decay_rate', full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate', index=5, + number=6, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delete_threshold', full_name='paddle.DownpourTableAccessorParameter.delete_threshold', index=6, + number=7, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1790, + serialized_end=1996, +) + + +_TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( + name='TableAccessorSaveParameter', + 
full_name='paddle.TableAccessorSaveParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='param', full_name='paddle.TableAccessorSaveParameter.param', index=0, + number=1, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='converter', full_name='paddle.TableAccessorSaveParameter.converter', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='deconverter', full_name='paddle.TableAccessorSaveParameter.deconverter', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1998, + serialized_end=2081, +) + + +_PSREQUESTMESSAGE = _descriptor.Descriptor( + name='PsRequestMessage', + full_name='paddle.PsRequestMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='cmd_id', full_name='paddle.PsRequestMessage.cmd_id', index=0, + number=1, type=13, cpp_type=3, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_id', full_name='paddle.PsRequestMessage.table_id', index=1, + number=2, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='params', full_name='paddle.PsRequestMessage.params', index=2, + number=3, type=12, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_id', full_name='paddle.PsRequestMessage.client_id', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', full_name='paddle.PsRequestMessage.data', index=4, + number=5, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2083, + serialized_end=2184, +) + + +_SPARSESGDRULEPARAMETER = _descriptor.Descriptor( + name='SparseSGDRuleParameter', + full_name='paddle.SparseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', 
full_name='paddle.SparseSGDRuleParameter.learning_rate', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='initial_g2sum', full_name='paddle.SparseSGDRuleParameter.initial_g2sum', index=1, + number=2, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='initial_range', full_name='paddle.SparseSGDRuleParameter.initial_range', index=2, + number=3, type=1, cpp_type=5, label=1, + has_default_value=True, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='weight_bounds', full_name='paddle.SparseSGDRuleParameter.weight_bounds', index=3, + number=4, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2186, + serialized_end=2305, +) + + +_DENSESGDRULEPARAMETER = _descriptor.Descriptor( + name='DenseSGDRuleParameter', + full_name='paddle.DenseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='paddle.DenseSGDRuleParameter.name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='adam', full_name='paddle.DenseSGDRuleParameter.adam', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='naive', full_name='paddle.DenseSGDRuleParameter.naive', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='summary', full_name='paddle.DenseSGDRuleParameter.summary', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='moving_average', full_name='paddle.DenseSGDRuleParameter.moving_average', index=4, + number=5, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2308, + serialized_end=2533, +) + + +_ADAMSGDPARAMETER = _descriptor.Descriptor( + 
name='AdamSGDParameter', + full_name='paddle.AdamSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', full_name='paddle.AdamSGDParameter.learning_rate', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', full_name='paddle.AdamSGDParameter.avg_decay_rate', index=1, + number=2, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_decay_rate', full_name='paddle.AdamSGDParameter.ada_decay_rate', index=2, + number=3, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_epsilon', full_name='paddle.AdamSGDParameter.ada_epsilon', index=3, + number=4, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mom_decay_rate', full_name='paddle.AdamSGDParameter.mom_decay_rate', index=4, + number=5, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2536, + serialized_end=2670, +) + + +_NAIVESGDPARAMETER = _descriptor.Descriptor( + name='NaiveSGDParameter', + full_name='paddle.NaiveSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', full_name='paddle.NaiveSGDParameter.learning_rate', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', full_name='paddle.NaiveSGDParameter.avg_decay_rate', index=1, + number=2, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2672, + serialized_end=2738, +) + + +_SUMMARYSGDPARAMETER = _descriptor.Descriptor( + name='SummarySGDParameter', + full_name='paddle.SummarySGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='summary_decay_rate', full_name='paddle.SummarySGDParameter.summary_decay_rate', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=True, default_value=float(0.999999), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, 
extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2740, + serialized_end=2799, +) + + +_MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( + name='MovingAverageRuleParameter', + full_name='paddle.MovingAverageRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='momentum', full_name='paddle.MovingAverageRuleParameter.momentum', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2801, + serialized_end=2847, +) + + +_PSRESPONSEMESSAGE = _descriptor.Descriptor( + name='PsResponseMessage', + full_name='paddle.PsResponseMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='err_code', full_name='paddle.PsResponseMessage.err_code', index=0, + number=1, type=5, cpp_type=1, label=2, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='err_msg', full_name='paddle.PsResponseMessage.err_msg', index=1, + number=2, type=9, cpp_type=9, label=2, + has_default_value=True, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', full_name='paddle.PsResponseMessage.data', index=2, + number=3, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2849, + serialized_end=2922, +) + + +_FSCLIENTPARAMETER = _descriptor.Descriptor( + name='FsClientParameter', + full_name='paddle.FsClientParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='fs_type', full_name='paddle.FsClientParameter.fs_type', index=0, + number=1, type=14, cpp_type=8, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='uri', full_name='paddle.FsClientParameter.uri', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='user', full_name='paddle.FsClientParameter.user', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='passwd', 
full_name='paddle.FsClientParameter.passwd', index=3, + number=4, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='buffer_size', full_name='paddle.FsClientParameter.buffer_size', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='hadoop_bin', full_name='paddle.FsClientParameter.hadoop_bin', index=5, + number=51, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='afs_conf', full_name='paddle.FsClientParameter.afs_conf', index=6, + number=101, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + _FSCLIENTPARAMETER_FSAPITYPE, + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2925, + serialized_end=3138, +) + +_PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER +_PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER +_PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER +_WORKERPARAMETER.fields_by_name['downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER +_SERVERPARAMETER.fields_by_name['downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER +_DOWNPOURWORKERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name['dense_table'].message_type = _DENSETABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name['sparse_table'].message_type = _SPARSETABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name['service_param'].message_type = _SERVERSERVICEPARAMETER +_TABLEPARAMETER.fields_by_name['accessor'].message_type = _TABLEACCESSORPARAMETER +_TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE +_TABLEACCESSORPARAMETER.fields_by_name['sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name['dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name['downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name['table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['summary'].message_type = _SUMMARYSGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER +_FSCLIENTPARAMETER.fields_by_name['fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE +_FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER +DESCRIPTOR.message_types_by_name['PSParameter'] = 
_PSPARAMETER +DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER +DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER +DESCRIPTOR.message_types_by_name['DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER +DESCRIPTOR.message_types_by_name['DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER +DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER +DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER +DESCRIPTOR.message_types_by_name['DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER +DESCRIPTOR.message_types_by_name['ServerServiceParameter'] = _SERVERSERVICEPARAMETER +DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER +DESCRIPTOR.message_types_by_name['TableAccessorParameter'] = _TABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name['DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name['TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER +DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE +DESCRIPTOR.message_types_by_name['SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name['DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER +DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER +DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER +DESCRIPTOR.message_types_by_name['MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER +DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE +DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER +DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE +DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID + +PSParameter = _reflection.GeneratedProtocolMessageType('PSParameter', (_message.Message,), dict( + DESCRIPTOR = _PSPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PSParameter) + )) +_sym_db.RegisterMessage(PSParameter) + +WorkerParameter = _reflection.GeneratedProtocolMessageType('WorkerParameter', (_message.Message,), dict( + DESCRIPTOR = _WORKERPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.WorkerParameter) + )) +_sym_db.RegisterMessage(WorkerParameter) + +ServerParameter = _reflection.GeneratedProtocolMessageType('ServerParameter', (_message.Message,), dict( + DESCRIPTOR = _SERVERPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerParameter) + )) +_sym_db.RegisterMessage(ServerParameter) + +DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType('DownpourWorkerParameter', (_message.Message,), dict( + DESCRIPTOR = _DOWNPOURWORKERPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter) + )) +_sym_db.RegisterMessage(DownpourWorkerParameter) + +DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType('DownpourTrainerParameter', (_message.Message,), dict( + DESCRIPTOR = _DOWNPOURTRAINERPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter) + )) +_sym_db.RegisterMessage(DownpourTrainerParameter) + +DenseTableParameter = _reflection.GeneratedProtocolMessageType('DenseTableParameter', (_message.Message,), dict( + DESCRIPTOR = _DENSETABLEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter) + )) 
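+# A minimal, illustrative usage sketch for the generated classes above
+# (assumes this generated module is importable as ps_pb2; values are made up):
+#
+#   import ps_pb2
+#   rule = ps_pb2.DenseSGDRuleParameter()
+#   rule.name = "adam"
+#   rule.adam.learning_rate = 5e-06
+#   rule.adam.avg_decay_rate = 0.999993
+#   text_config = str(rule)  # proto2 text format, e.g. for a config file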
+_sym_db.RegisterMessage(DenseTableParameter) + +SparseTableParameter = _reflection.GeneratedProtocolMessageType('SparseTableParameter', (_message.Message,), dict( + DESCRIPTOR = _SPARSETABLEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter) + )) +_sym_db.RegisterMessage(SparseTableParameter) + +DownpourServerParameter = _reflection.GeneratedProtocolMessageType('DownpourServerParameter', (_message.Message,), dict( + DESCRIPTOR = _DOWNPOURSERVERPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter) + )) +_sym_db.RegisterMessage(DownpourServerParameter) + +ServerServiceParameter = _reflection.GeneratedProtocolMessageType('ServerServiceParameter', (_message.Message,), dict( + DESCRIPTOR = _SERVERSERVICEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter) + )) +_sym_db.RegisterMessage(ServerServiceParameter) + +TableParameter = _reflection.GeneratedProtocolMessageType('TableParameter', (_message.Message,), dict( + DESCRIPTOR = _TABLEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableParameter) + )) +_sym_db.RegisterMessage(TableParameter) + +TableAccessorParameter = _reflection.GeneratedProtocolMessageType('TableAccessorParameter', (_message.Message,), dict( + DESCRIPTOR = _TABLEACCESSORPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter) + )) +_sym_db.RegisterMessage(TableAccessorParameter) + +DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType('DownpourTableAccessorParameter', (_message.Message,), dict( + DESCRIPTOR = _DOWNPOURTABLEACCESSORPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter) + )) +_sym_db.RegisterMessage(DownpourTableAccessorParameter) + +TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType('TableAccessorSaveParameter', (_message.Message,), dict( + DESCRIPTOR = _TABLEACCESSORSAVEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter) + )) +_sym_db.RegisterMessage(TableAccessorSaveParameter) + +PsRequestMessage = _reflection.GeneratedProtocolMessageType('PsRequestMessage', (_message.Message,), dict( + DESCRIPTOR = _PSREQUESTMESSAGE, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage) + )) +_sym_db.RegisterMessage(PsRequestMessage) + +SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseSGDRuleParameter', (_message.Message,), dict( + DESCRIPTOR = _SPARSESGDRULEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter) + )) +_sym_db.RegisterMessage(SparseSGDRuleParameter) + +DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('DenseSGDRuleParameter', (_message.Message,), dict( + DESCRIPTOR = _DENSESGDRULEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter) + )) +_sym_db.RegisterMessage(DenseSGDRuleParameter) + +AdamSGDParameter = _reflection.GeneratedProtocolMessageType('AdamSGDParameter', (_message.Message,), dict( + DESCRIPTOR = _ADAMSGDPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter) + )) +_sym_db.RegisterMessage(AdamSGDParameter) + +NaiveSGDParameter = _reflection.GeneratedProtocolMessageType('NaiveSGDParameter', (_message.Message,), dict( + DESCRIPTOR = 
_NAIVESGDPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter) + )) +_sym_db.RegisterMessage(NaiveSGDParameter) + +SummarySGDParameter = _reflection.GeneratedProtocolMessageType('SummarySGDParameter', (_message.Message,), dict( + DESCRIPTOR = _SUMMARYSGDPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter) + )) +_sym_db.RegisterMessage(SummarySGDParameter) + +MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType('MovingAverageRuleParameter', (_message.Message,), dict( + DESCRIPTOR = _MOVINGAVERAGERULEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter) + )) +_sym_db.RegisterMessage(MovingAverageRuleParameter) + +PsResponseMessage = _reflection.GeneratedProtocolMessageType('PsResponseMessage', (_message.Message,), dict( + DESCRIPTOR = _PSRESPONSEMESSAGE, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage) + )) +_sym_db.RegisterMessage(PsResponseMessage) + +FsClientParameter = _reflection.GeneratedProtocolMessageType('FsClientParameter', (_message.Message,), dict( + DESCRIPTOR = _FSCLIENTPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.FsClientParameter) + )) +_sym_db.RegisterMessage(FsClientParameter) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\200\001\001')) +# @@protoc_insertion_point(module_scope) -- GitLab From c47c451a007f33078bfb8f38be4a6cd50922f361 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 11:45:53 +0000 Subject: [PATCH 0030/2367] fix bug --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/computation_op_handle.cc | 2 + .../details/eager_deletion_op_handle.cc | 23 ++-- .../details/eager_deletion_op_handle.h | 8 +- .../framework/details/eager_deletion_pass.cc | 81 ++++++------ .../fluid/framework/details/op_graph_view.h | 29 +++- .../framework/details/reference_count_pass.cc | 125 ++++++++++++++++-- .../scope_buffered_ssa_graph_executor.cc | 21 ++- .../scope_buffered_ssa_graph_executor.h | 2 + paddle/fluid/framework/executor.cc | 104 +++++++++++---- paddle/fluid/framework/executor.h | 51 ++----- paddle/fluid/framework/garbage_collector.h | 44 +++--- paddle/fluid/framework/operator.cc | 2 + paddle/fluid/framework/parallel_executor.cc | 13 +- paddle/fluid/framework/scope.cc | 6 + paddle/fluid/framework/scope.h | 1 + paddle/fluid/framework/tensor.h | 2 +- .../fluid/operators/controlflow/while_op.cc | 44 +++++- paddle/fluid/operators/reader/ctr_reader.h | 12 +- paddle/fluid/platform/device_context.h | 10 +- .../fluid/platform/stream_callback_manager.cc | 67 +++++----- .../fluid/platform/stream_callback_manager.h | 20 +-- paddle/fluid/pybind/tensor_py.h | 12 +- python/paddle/fluid/__init__.py | 5 +- 24 files changed, 458 insertions(+), 228 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8cf97d667d4..8049f5d3f77 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -35,7 +35,7 @@ cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_e cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) 
-cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7beb8c8de9f..2bf43fd4e0f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -31,6 +31,8 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); + VLOG(10) << "Run Op" << Name(); + auto run_func = [this]() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index cd262033760..41f616035d7 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/cuda_device_guard.h" namespace paddle { namespace framework { @@ -23,28 +24,32 @@ namespace details { EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, const Scope *scope, const platform::Place &place, - const std::vector &var_names, GarbageCollector *gc, - AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { + const std::unordered_set &var_names, + GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), + scope_(scope), + var_names_(var_names), + gc_(gc), + ref_cnts_(ref_cnts) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { dev_ctx_ = static_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast *>(gc_)) { - platform::SetDeviceId(boost::get(place).device); + platform::CUDADeviceGuard guard( + boost::get(place).device); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + PADDLE_ENFORCE_NOT_NULL(event_); } } #endif - - for (auto &name : var_names) AddVar(name); } EagerDeletionOpHandle::~EagerDeletionOpHandle() { #ifdef PADDLE_WITH_CUDA if (event_) { auto gpu_place = boost::get(dev_ctx_->GetPlace()); - platform::SetDeviceId(gpu_place.device); + platform::CUDADeviceGuard guard(gpu_place.device); PADDLE_ENFORCE(cudaEventDestroy(event_)); } #endif @@ -52,10 +57,6 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } -void EagerDeletionOpHandle::AddVar(const std::string &name) { - var_names_.insert(name); -} - void EagerDeletionOpHandle::RunImpl() { auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); std::vector tensors; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 8254f21bdfc..d8de59cc4de 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -25,13 +25,11 @@ class Scope; namespace details { -class 
EagerDeletionPass; - class EagerDeletionOpHandle : public OpHandleBase { public: EagerDeletionOpHandle(ir::Node *node, const Scope *scope, const platform::Place &place, - const std::vector &var_names, + const std::unordered_set &var_names, GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts); @@ -45,8 +43,6 @@ class EagerDeletionOpHandle : public OpHandleBase { private: void ClearTensors(const std::vector &tensors); - void AddVar(const std::string &name); - const Scope *scope_; std::unordered_set var_names_; GarbageCollector *gc_; // not own @@ -55,8 +51,6 @@ class EagerDeletionOpHandle : public OpHandleBase { platform::CUDADeviceContext *dev_ctx_{nullptr}; cudaEvent_t event_{nullptr}; #endif - - friend class EagerDeletionPass; }; } // namespace details diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index f877c2881cd..3a1b37e5339 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -26,62 +26,61 @@ namespace paddle { namespace framework { namespace details { -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); - } - - // Add leaf node to eager_deletion_node - if (out->Outputs().empty()) { - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - out->AddOutput(dummy_leaf); - } -} - std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { - auto &vars = graph->Get(kGraphVars); + const auto &vars = graph->Get(kGraphVars); auto &ref_cnts = Get>(kCurReferenceCount); - auto &last_live_ops = Get>(kLastLiveOpsOfVars); + const auto &last_live_ops = + Get>(kLastLiveOpsOfVars); auto &gcs = Get(kGarbageCollector); ref_cnts = std::vector(vars.size()); - std::unordered_map op_map; + std::unordered_map> + op_vars_map; + for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; - for (ComputationOpHandle *op : var_ops_pair.second) { - auto it = op_map.find(op); - if (it != op_map.end()) { - it->second->AddVar(var_name); - } else { - auto *eager_deletion_node = graph->CreateEmptyNode( - "eager_deletion", ir::Node::Type::kOperation); - auto *eager_deletion_op = new EagerDeletionOpHandle( - eager_deletion_node, op->GetScope(), op->GetPlace(), {var_name}, - gcs[op->GetScopeIdx()].get(), &(ref_cnts[op->GetScopeIdx()])); - AddDependencyBetween(op, eager_deletion_op, graph.get()); - op_map[op] = eager_deletion_op; - } + for (auto *op : var_ops_pair.second) { + op_vars_map[op].insert(var_name); } } } - VLOG(10) << "Create " << op_map.size() << " EagerDeletionOpHandle(s)"; + + for (auto &pair : op_vars_map) { + auto *op = pair.first; + auto &var_names = pair.second; + + auto *eager_deletion_node = + graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), + std::move(var_names), gcs[op->GetScopeIdx()].get(), + &(ref_cnts[op->GetScopeIdx()])); + + auto it = std::find_if( + 
op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != op->Outputs().end()) { + eager_deletion_op->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + op->AddOutput(dep_var); + eager_deletion_op->AddInput(dep_var); + } + + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + eager_deletion_op->AddOutput(dummy_leaf); + } + + VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; return graph; } diff --git a/paddle/fluid/framework/details/op_graph_view.h b/paddle/fluid/framework/details/op_graph_view.h index afb3e8e5946..77aa02eba56 100644 --- a/paddle/fluid/framework/details/op_graph_view.h +++ b/paddle/fluid/framework/details/op_graph_view.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include #include @@ -34,6 +34,11 @@ class OpGraphView { bool HasOp(OpHandleBase *op) const; + // Use a visitor to visit all pending ops of op + // Stop when callback returns false + template + bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const; + private: void Build(const std::vector &ops); void EnforceHasOp(OpHandleBase *op) const; @@ -44,6 +49,28 @@ class OpGraphView { pending_ops_; }; +template +bool OpGraphView::VisitAllPendingOps(OpHandleBase *op, + Callback &&callback) const { + EnforceHasOp(op); + std::unordered_set visited; + std::queue q; + q.push(op); + do { + op = q.front(); + q.pop(); + for (auto &pending_op : pending_ops_.at(op)) { + if (visited.count(pending_op) == 0) { + visited.insert(pending_op); + if (!callback(pending_op)) { + return false; + } + } + } + } while (!q.empty()); + return true; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index f094c7afa9f..2320d3926ad 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -14,11 +14,13 @@ #include #include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/op_graph_view.h" #include "paddle/fluid/framework/details/reference_count_pass.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -27,6 +29,89 @@ namespace paddle { namespace framework { namespace details { +struct OpConnectionDetector { + public: + enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; + + explicit OpConnectionDetector(const std::vector &all_ops) + : graph_(all_ops) {} + + template + std::unordered_set MaxNoDepOps( + const OpSet &op_set) { + using KeyType = typename OpSet::key_type; + static_assert( + std::is_base_of::type>::value, + "Key type of OpSet must be or derived of OpHandleBase"); + + std::vector ops(op_set.begin(), op_set.end()); + std::unordered_set ret; + auto rels = GetRelations(ops); + auto not_before = [](RelationShip r) { return r != kBefore; }; + for (size_t i = 0; i < rels.size(); ++i) { + if (std::all_of(rels[i].begin(), rels[i].end(), not_before)) { + ret.insert(static_cast(ops[i])); + } + } + return ret; + } + + private: + std::vector> 
GetRelations( + const std::vector ops) { + std::unordered_map op_to_idx; + for (size_t i = 0; i < ops.size(); ++i) { + PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); + op_to_idx[ops[i]] = i; + } + + PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops"); + + std::vector> ret(ops.size()); + for (auto &e : ret) { + e.assign(ops.size(), kSame); + } + + size_t found_num = ops.size(); + size_t total_num = ops.size() * ops.size(); + auto visitor = [&](OpHandleBase *op, size_t i) { + auto it = op_to_idx.find(op); + if (it != op_to_idx.end()) { + size_t j = it->second; + if (ret[i][j] != kSame) { + ret[i][j] = kBefore; + ret[j][i] = kAfter; + found_num += 2; + if (found_num == total_num) { + return false; + } + } + } + return true; + }; + + for (size_t i = 0; i < ops.size(); ++i) { + auto sub_visitor = [&, i](OpHandleBase *op) { return visitor(op, i); }; + if (!graph_.VisitAllPendingOps(ops[i], sub_visitor)) { + break; + } + } + + for (size_t i = 0; i < ops.size(); ++i) { + for (size_t j = i + 1; j < ops.size(); ++j) { + if (ret[i][j] != kSame) continue; + ret[i][j] = kNoDeps; + ret[j][i] = kNoDeps; + } + } + + return ret; + } + + const OpGraphView graph_; +}; + static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( OpHandleBase *op, size_t scope_idx) { std::queue q; @@ -59,9 +144,15 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( last_live_ops_of_vars = std::vector(vars.size()); ref_cnts = std::vector(vars.size()); + OpConnectionDetector detector(ir::FilterByNodeWrapper(*graph)); + for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { - if (name_var_pair.second.empty()) continue; + if (name_var_pair.second.empty()) { + continue; + } + + const std::string &var_name = name_var_pair.first; auto *last_ver_var = name_var_pair.second.back(); VarDesc *var_desc = nullptr; @@ -83,30 +174,46 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( } std::unordered_set last_live_op; - auto add_last_live_op = [&](OpHandleBase *op) { + auto add_last_live_op = [&](OpHandleBase *op) -> bool { auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); if (compute_op) { last_live_op.insert(compute_op); + return true; + } else { + return false; } }; - const std::string &var_name = name_var_pair.first; + + bool can_delete = false; auto &pending_ops = last_ver_var->PendingOps(); if (pending_ops.empty()) { auto *generated_op = last_ver_var->GeneratedOp(); - if (generated_op) { - ref_cnts[i].emplace(var_name, 1); - add_last_live_op(generated_op); + if (generated_op && add_last_live_op(generated_op)) { + can_delete = true; } } else { - ref_cnts[i].emplace(var_name, pending_ops.size()); + can_delete = true; for (auto *pending_op : pending_ops) { - add_last_live_op(pending_op); + if (!add_last_live_op(pending_op)) { + can_delete = false; + break; + } } } - last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + if (can_delete) { + size_t original_size = last_live_op.size(); + last_live_op = detector.MaxNoDepOps(last_live_op); + if (last_live_op.size() != original_size) { + VLOG(10) << "Shrink last living op number of " << var_name << " from " + << original_size << " to " << last_live_op.size(); + } + ref_cnts[i].emplace(var_name, last_live_op.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + } } } + return graph; } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index f1bf6542a30..0cc3ac8bfb3 
100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -36,6 +36,15 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
   }
 }
 
+void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() {
+  if (gc_) {
+    for (auto &gc : *gc_) {
+      gc->Wait();
+      gc->Reset();
+    }
+  }
+}
+
 FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
   if (drop_scope_counter_ == 0) {
@@ -74,19 +83,19 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
     drop_scope_counter_ = 0;
     // Wait All computational streams
-    for (size_t i = 0; i < places_.size(); ++i) {
-      platform::DeviceContextPool::Instance().Get(places_[i])->Wait();
-      if (gc_) {
-        (*gc_)[i]->Wait();
-        (*gc_)[i]->Reset();
-      }
+    for (auto &p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
     }
+    WaitAllGarbageCollectors();
     for (auto &scope : local_scopes_) {
       auto &local_scope =
           *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
       scope->DeleteScope(local_scope);
     }
+  } else {
+    WaitAllGarbageCollectors();
   }
+
   if (eptr) {
     std::rethrow_exception(eptr);
   } else {
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index ce3061d6e61..4d52183a205 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -50,6 +50,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
   FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
 
  private:
+  void WaitAllGarbageCollectors();
+
   size_t drop_scope_counter_{0};
 
   ExecutionStrategy strategy_;
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 96132a2c182..02d1e4114ee 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -37,11 +37,49 @@ namespace {
 int kProgramId = -1;
 }  // namespace
 
+static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
+    const BlockDesc& block, const std::vector<std::string>& skip_var_list) {
+  std::unordered_map<std::string, size_t> ref_cnts;
+  std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
+                                            skip_var_list.end());
+
+  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
+    for (auto& name_pair : name_map) {
+      for (auto& name : name_pair.second) {
+        if (skip_vars.count(name)) continue;
+        auto* var_desc = block.FindVar(name);
+        if (var_desc == nullptr || var_desc->Persistable()) continue;
+        auto type = var_desc->Proto()->type().type();
+        if (type != proto::VarType::LOD_TENSOR &&
+            type != proto::VarType::SELECTED_ROWS &&
+            type != proto::VarType::LOD_TENSOR_ARRAY) {
+          continue;
+        }
+
+        auto it = ref_cnts.find(name);
+        if (it != ref_cnts.end()) {
+          ++it->second;
+        } else {
+          ref_cnts[name] = 1;
+        }
+      }
+    }
+  };
+
+  for (auto op_desc : block.AllOps()) {
+    update_ref_cnts(op_desc, op_desc->Inputs());
+    update_ref_cnts(op_desc, op_desc->Outputs());
+  }
+  return ref_cnts;
+}
+
 ExecutorPrepareContext::ExecutorPrepareContext(
-    const framework::ProgramDesc& prog, size_t block_id)
+    const framework::ProgramDesc& prog, size_t block_id,
+    const std::vector<std::string>& skip_ref_cnt_vars)
     : prog_(prog), block_id_(block_id) {
   if (GetEagerDeletionThreshold() >= 0) {
-    ref_cnts_ = GetNonPersistableReferenceCount(prog_, block_id_);
+    ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
+                                                 skip_ref_cnt_vars);
   }
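   // For illustration: given a block containing
   //     c = mul(a, b); d = relu(c)
   // with nothing persistable or skipped, ref_cnts_ would hold
   //     {a: 1, b: 1, c: 2, d: 1}
   // i.e. one count per appearance in any op's input or output list, so a
   // variable becomes collectible once its runtime count drops to zero.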
} @@ -49,10 +87,9 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } -template -static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, - GarbageCollector* gc, - RefCntMap* ref_cnts) { +static void DeleteUnusedTensors( + const Scope& scope, const OperatorBase* op, GarbageCollector* gc, + std::unordered_map* ref_cnts) { std::unordered_set erase_tensors; auto handler = [&](const VariableNameMap& name_map) { @@ -60,7 +97,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, for (auto& name : name_pair.second) { auto it = ref_cnts->find(name); if (it == ref_cnts->end()) continue; - if ((it->second)-- == 1) { + if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { VLOG(10) << "Erase tensor \'" << name << "\'"; @@ -69,6 +106,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, } else if (var->IsType()) { erase_tensors.insert( var->GetMutable()->mutable_value()); + } else if (var->IsType()) { + auto* lod_tensor_arr = var->GetMutable(); + for (auto& t : *lod_tensor_arr) { + erase_tensors.insert(&t); + } } } } @@ -351,9 +393,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } std::unique_ptr Executor::Prepare( - const ProgramDesc& program, int block_id) { + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars) { std::unique_ptr ctx( - new ExecutorPrepareContext(program, block_id)); + new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { @@ -364,16 +407,28 @@ std::unique_ptr Executor::Prepare( } std::vector> Executor::Prepare( - const ProgramDesc& program, const std::vector& block_ids) { + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars) { + PADDLE_ENFORCE( + skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), + "skip_ref_cnt_vars should be either empty or equals to block number %d", + block_ids.size()); std::vector> result; + size_t idx = 0; for (auto& bid : block_ids) { - auto* ctx = new ExecutorPrepareContext(program, bid); + ExecutorPrepareContext* ctx; + if (skip_ref_cnt_vars.empty()) { + ctx = new ExecutorPrepareContext(program, bid); + } else { + ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); + } PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } result.push_back(std::shared_ptr(ctx)); + ++idx; } return result; } @@ -392,18 +447,18 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr> gc; - // WhileOp would set keep_kids to true, - // because WhileGradOp needs the scopes created in WhileOp. - // Perhaps, we should not perform eager deletion in WhileOp - // The scopes and variables created by WhileOp would be deleted - // in WhileGradOp. 
- if (max_memory_size >= 0 && !keep_kids) { + if (max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { - gc.reset(new DefaultStreamGarbageCollector( - boost::get(place_), max_memory_size)); - } else { + if (IsFastEagerDeletionModeEnabled()) { + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place_), max_memory_size)); + } else { + gc.reset(new DefaultStreamGarbageCollector( + boost::get(place_), max_memory_size)); + } + } else if (platform::is_cpu_place(place_)) { #endif gc.reset(new CPUGarbageCollector( boost::get(place_), max_memory_size)); @@ -415,17 +470,14 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, for (auto& op : ctx->ops_) { op->Run(*local_scope, place_); - if (gc != nullptr) { + if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), &(ctx->cur_ref_cnts_)); } } - if (gc != nullptr) { - gc->Wait(); - } else { - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - } + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + if (gc) gc->Wait(); if (local_scope != scope) { scope->DeleteScope(local_scope); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 36b36d49c27..f00d4314b65 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -28,42 +28,11 @@ namespace paddle { namespace framework { extern void InitializeVariable(Variable* var, proto::VarType::Type var_type); -template -std::unordered_map GetNonPersistableReferenceCount( - const ProgramDesc& prog, size_t block_id) { - auto& block = prog.Block(block_id); - std::unordered_map ref_cnts; - - auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { - for (auto& name_pair : name_map) { - for (auto& name : name_pair.second) { - auto* var_desc = block.FindVar(name); - if (var_desc == nullptr || var_desc->Persistable()) continue; - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR && - type != proto::VarType::SELECTED_ROWS) { - continue; - } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } - } - } - }; - - for (auto op_desc : block.AllOps()) { - update_ref_cnts(op_desc, op_desc->Inputs()); - update_ref_cnts(op_desc, op_desc->Outputs()); - } - return ref_cnts; -} - struct ExecutorPrepareContext { - ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); + ~ExecutorPrepareContext(); void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } @@ -72,8 +41,8 @@ struct ExecutorPrepareContext { size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map ref_cnts_; + std::unordered_map cur_ref_cnts_; }; class Executor { @@ -109,10 +78,14 @@ class Executor { const std::string& fetch_holder_name = "fetch"); static std::unique_ptr Prepare( - const ProgramDesc& program, int block_id); + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); static std::vector> Prepare( - const ProgramDesc& program, const std::vector& block_ids); + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars = + std::vector>()); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git 
a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index cbe8f606efe..1382e0d4618 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -19,6 +19,9 @@
 #include <deque>
 #include <functional>
 #include <mutex>  // NOLINT
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
@@ -36,6 +39,11 @@ class GarbageCollector {
 
   virtual ~GarbageCollector() {}
 
+  size_t NumOfGarbages() const {
+    std::lock_guard<std::mutex> guard(mutex_);
+    return garbages_->size();
+  }
+
   void Reset() {
     std::lock_guard<std::mutex> guard(mutex_);
     garbages_.reset(new std::deque<T *>());
@@ -49,7 +57,7 @@
 
   template <typename Container, typename Callback>
   void Add(const Container &objs, Callback &&callback) {
-    std::shared_ptr<std::deque<T *>> clear_deque;
+    std::deque<T *> *clear_deque = nullptr;
     {
       std::lock_guard<std::mutex> guard(mutex_);
       for (auto *obj : objs) {
@@ -58,7 +66,7 @@
       }
       if (cur_memory_size_ >= max_memory_size_) {
         cur_memory_size_ = 0;
-        clear_deque = garbages_;
+        clear_deque = garbages_.release();
         garbages_.reset(new std::deque<T *>());
       }
     }
@@ -67,6 +75,7 @@
       callback();
       ClearCallback([clear_deque]() {
         for (auto *obj : *clear_deque) obj->clear();
+        delete clear_deque;
       });
     }
   }
@@ -77,7 +86,7 @@
   virtual void ClearCallback(const std::function<void()> &callback) = 0;
 
   platform::DeviceContext *dev_ctx_;
-  std::shared_ptr<std::deque<T *>> garbages_;
+  std::unique_ptr<std::deque<T *>> garbages_;
   mutable std::mutex mutex_;
   const size_t max_memory_size_;
   size_t cur_memory_size_ = 0;
@@ -96,6 +105,19 @@
 };
 
 #ifdef PADDLE_WITH_CUDA
+template <typename T>
+class UnsafeFastGPUGarbageCollector : public GarbageCollector<T> {
+ public:
+  UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
+                                size_t max_memory_size)
+      : GarbageCollector<T>(place, max_memory_size) {}
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override {
+    callback();
+  }
+};
+
 template <typename T>
 class DefaultStreamGarbageCollector : public GarbageCollector<T> {
  public:
@@ -109,7 +131,7 @@
   }
 
   void Wait() const override {
-    static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
+    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
         ->WaitStreamCallback();
   }
 
@@ -126,31 +148,23 @@
 class StreamGarbageCollector : public GarbageCollector<T> {
  public:
   StreamGarbageCollector(const platform::CUDAPlace &place,
                          size_t max_memory_size)
       : GarbageCollector<T>(place, max_memory_size) {
-    platform::SetDeviceId(place.device);
+    platform::CUDADeviceGuard guard(place.device);
     PADDLE_ENFORCE(cudaStreamCreate(&stream_));
     callback_manager_.reset(new platform::StreamCallbackManager(stream_));
   }
 
   ~StreamGarbageCollector() {
     auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
-    platform::SetDeviceId(place.device);
+    platform::CUDADeviceGuard guard(place.device);
     PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
     PADDLE_ENFORCE(cudaStreamDestroy(stream_));
   }
 
-  void Wait() const override {
-    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-    std::lock_guard<std::mutex> guard(this->mutex_);
-    callback_manager_->Wait();
-  }
+  void Wait() const override { callback_manager_->Wait(); }
 
   cudaStream_t stream() const { return stream_; }
 
  protected:
-  // ClearCallback and Wait()/Reset() cannot be call in multiple threads
-  // But it is not important, because they would not be called in multiple
-  // threads
-  // either in Executor or ParallelExecutor
   void ClearCallback(const std::function<void()> &callback) override {
     callback_manager_->AddCallback(callback);
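     // Note: the actual free happens in a host callback enqueued on the
     // collector's dedicated stream_, so it is deferred until the stream
     // reaches that point instead of running eagerly on the calling thread;
     // UnsafeFastGPUGarbageCollector above skips this deferral entirely.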
} diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8bfdf389120..a5f714fc89a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -873,6 +873,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { + PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", + ipt_name, DebugString()); int tmp = static_cast(ToDataType(t->type())); PADDLE_ENFORCE( tmp == data_type || data_type == -1, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e71f93beefc..3d466e44a19 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -158,8 +158,13 @@ ParallelExecutor::ParallelExecutor( auto &place = member_->places_[i]; #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { - member_->gcs_.emplace_back(new StreamGarbageCollector( - boost::get(place), max_memory_size)); + if (IsFastEagerDeletionModeEnabled()) { + member_->gcs_.emplace_back(new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size)); + } else { + member_->gcs_.emplace_back(new StreamGarbageCollector( + boost::get(place), max_memory_size)); + } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; } else if (platform::is_cpu_place(place)) { #endif @@ -181,8 +186,8 @@ ParallelExecutor::ParallelExecutor( &(member_->rt_ref_cnts_)); ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, &last_live_ops_of_vars); - VLOG(10) << "ReferenceCountPass Applied"; graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; auto eager_deletion_pass = ir::PassRegistry::Instance().Get("eager_deletion_pass"); @@ -194,6 +199,8 @@ ParallelExecutor::ParallelExecutor( &last_live_ops_of_vars); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; + + graph->SetNotOwned(details::kGarbageCollector, &(member_->gcs_)); } // Step 3. Create vars in each scope. Passes may also create new vars. diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7ccc..cb3b6cdc3ee 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,6 +38,10 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); +DEFINE_bool(fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory would release " + "immediately without waiting GPU kernel ends."); + // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads concurrently, and // the mutex will cause serious performance issue. 
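 // Illustrative only (hypothetical values): like the other gflags exposed via
 // python/paddle/fluid/__init__.py, the flags defined above can be set from
 // the environment before paddle.fluid is imported, e.g.
 //   FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=false \
 //   python train.py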
@@ -58,6 +62,8 @@ int64_t GetEagerDeletionThreshold() { (static_cast(1) << 30)); } +bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } + Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 1901ffbe57e..aded1f771ce 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -27,6 +27,7 @@ namespace paddle { namespace framework { int64_t GetEagerDeletionThreshold(); +bool IsFastEagerDeletionModeEnabled(); class Scope; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 71e8badd4b6..3a4c52410e9 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -153,7 +153,7 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } - void clear() { holder_ = nullptr; } + void clear() { holder_.reset(); } const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 6c1b2f329a5..d8410b40586 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -59,7 +59,21 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto ctx = executor.Prepare(*program, block->ID()); + auto &skip_eager_deletion_vars = + Attr>("skip_eager_deletion_vars"); + if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { + std::string debug_string = + "Skip " + std::to_string(skip_eager_deletion_vars.size()) + + " vars in eager deletion mode: "; + for (auto &var : skip_eager_deletion_vars) { + debug_string.append(var); + debug_string.push_back(' '); + } + VLOG(10) << debug_string; + } + + auto ctx = + executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); @@ -96,6 +110,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); + AddAttr>("skip_eager_deletion_vars", + "Vars that would skip eager deletion." + "Users should not set this manually.") + .SetDefault(std::vector()); AddComment(R"DOC( )DOC"); } @@ -341,6 +359,30 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. 
while_grad->SetAttr("original_output_grad", output_grads_list); + /* The following codes are used in eager deletion mode */ + if (framework::GetEagerDeletionThreshold() >= 0) { + std::unordered_set skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + // If input var of ops inside grad_block is not from grad_block, + // it cannot be deleted when forward while_op runs + if (in_arg_name != framework::kEmptyVarName && + !grad_block->HasVar(in_arg_name)) { + skip_vars.insert(in_arg_name); + } + } + } + + if (!skip_vars.empty()) { + // FIXME(zjl): ugly const_cast here, maybe we should find a better way + // to modify forward while_op + auto &fwd_while_op = const_cast(ForwardOp()); + fwd_while_op.SetAttr( + "skip_eager_deletion_vars", + std::vector(skip_vars.begin(), skip_vars.end())); + } + } + return std::unique_ptr(while_grad); } }; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae12..7fc07efe730 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -16,6 +16,7 @@ #include +#include #include // NOLINT #include #include @@ -55,8 +56,7 @@ class CTRReader : public framework::FileReader { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = - file_list_.size() > thread_num ? thread_num : file_list_.size(); + thread_num_ = std::min(file_list_.size(), thread_num); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -95,10 +95,10 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { - read_threads_.emplace_back(new std::thread( - std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, - thread_id, &read_thread_status_, queue_))); + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { + read_threads_.emplace_back(new std::thread(std::bind( + &ReadThread, file_groups_[thread_id], slots_, batch_size_, + static_cast(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( std::bind(&MonitorThread, &read_thread_status_, queue_))); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 3edd7279780..37453a8c29f 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -223,14 +223,10 @@ class CUDADeviceContext : public DeviceContext { template void AddStreamCallback(Callback&& callback) const { - std::lock_guard guard(callback_mtx_); callback_manager_->AddCallback(callback); } - void WaitStreamCallback() const { - std::lock_guard guard(callback_mtx_); - callback_manager_->Wait(); - } + void WaitStreamCallback() const { callback_manager_->Wait(); } #if CUDA_VERSION >= 9000 /*! 
\brief CublasCall may need to change cublas's config,
@@ -261,9 +257,7 @@
 
   mutable std::mutex mtx_;
 
-  // This lock is only used by callback
-  // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
-  mutable std::mutex callback_mtx_;
+  // StreamCallbackManager is thread-safe
   std::unique_ptr<StreamCallbackManager> callback_manager_;
 
   mutable std::mutex cublas_mtx_;
diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc
index ae915365f8c..58ec6f2f5d2 100644
--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ b/paddle/fluid/platform/stream_callback_manager.cc
@@ -18,52 +18,47 @@
 namespace paddle {
 namespace platform {
 
-struct StreamCallbackContext {
-  inline StreamCallbackContext(const StreamCallbackManager *manager,
-                               std::function<void()> callback)
-      : manager_(manager), callback_(std::move(callback)) {}
-
-  const StreamCallbackManager *manager_;  // do not own
-  std::function<void()> callback_;
-};
+#if CUDA_VERSION >= 10000
+static void CUDART_CB StreamCallbackFunc(void *user_data)
+#else
+static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
+                                         cudaError_t status, void *user_data)
+#endif
+{
+  std::unique_ptr<std::function<void()>> func(
+      reinterpret_cast<std::function<void()> *>(user_data));
+  (*func)();
+}
 
 StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream)
-    : stream_(stream), thread_pool_(new ::ThreadPool(1)) {}
+    : stream_(stream), thread_pool_(1) {}
 
 void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
-  auto *stream_callback_context =
-      new StreamCallbackContext(this, std::move(callback));
+  auto *callback_func = new std::function<void()>(std::move(callback));
+  auto *func = new std::function<void()>([this, callback_func] {
+    std::lock_guard<std::mutex> lock(mtx_);
+    last_future_ = thread_pool_.enqueue([callback_func] {
+      std::unique_ptr<std::function<void()>> releaser(callback_func);
+      (*callback_func)();
+    });
+  });
 #if CUDA_VERSION >= 10000
-  PADDLE_ENFORCE(cudaLaunchHostFunc(stream_,
-                                    StreamCallbackManager::StreamCallbackFunc,
-                                    stream_callback_context));
+  PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
 #else
-  PADDLE_ENFORCE(
-      cudaStreamAddCallback(stream_, StreamCallbackManager::StreamCallbackFunc,
-                            stream_callback_context, 0));
+  PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
 #endif
 }
 
-void StreamCallbackManager::Wait() const {
-  thread_pool_.reset(new ::ThreadPool(1));
-}
+StreamCallbackManager::~StreamCallbackManager() { Wait(); }
 
-#if CUDA_VERSION >= 10000
-void CUDART_CB StreamCallbackManager::StreamCallbackFunc(void *user_data)
-#else
-void CUDART_CB StreamCallbackManager::StreamCallbackFunc(cudaStream_t stream,
-                                                         cudaError_t status,
-                                                         void *user_data)
-#endif
-{
-  auto *callback_context_ptr =
-      reinterpret_cast<StreamCallbackContext *>(user_data);
-  callback_context_ptr->manager_->thread_pool_->enqueue(
-      [callback_context_ptr]() {
-        std::unique_ptr<StreamCallbackContext> callback_context(
-            callback_context_ptr);
-        callback_context->callback_();
-      });
+void StreamCallbackManager::Wait() const {
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+  {
+    std::lock_guard<std::mutex> lock(mtx_);
+    if (last_future_.valid()) {
+      last_future_.wait();
+    }
+  }
 }
 
 }  // namespace platform
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
index eac4806d137..0d5d85bf465 100644
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -18,30 +18,32 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <functional>
+#include <future>  // NOLINT
 #include <memory>
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -// NOTE(zjl): clean StreamCallback to make compilation faster +// NOTE(zjl): clean StreamCallbackManager to make compilation faster +// Make StreamCallbackManager thread-safe class StreamCallbackManager { public: explicit StreamCallbackManager(const cudaStream_t stream); + ~StreamCallbackManager(); + void AddCallback(std::function callback) const; void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr<::ThreadPool> thread_pool_; - -#if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data); -#else - static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data); -#endif + mutable ::ThreadPool thread_pool_; + mutable std::mutex mtx_; + mutable std::future last_future_; }; } // namespace platform diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 02a75236f6c..24800e17098 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -162,7 +162,7 @@ void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -200,7 +200,7 @@ void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -221,7 +221,7 @@ inline void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -240,7 +240,7 @@ void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -260,7 +260,7 @@ inline void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f7fefb3e5b7..2690149e9b4 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,8 +116,9 @@ def __bootstrap__(): 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", - 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir' + 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', + 'allocator_strategy', 'reader_queue_speed_test_mode', + 'print_sub_graph_dir' ] if 
'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') -- GitLab From 35a2578426840642acc0b2100be0b1c96c2cf1e9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 13:21:49 +0000 Subject: [PATCH 0031/2367] fix bug test=develop --- .../framework/details/computation_op_handle.cc | 2 -- .../framework/details/reference_count_pass.cc | 14 +++++++++----- paddle/fluid/platform/stream_callback_manager.cc | 2 -- paddle/fluid/platform/stream_callback_manager.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 2bf43fd4e0f..7beb8c8de9f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -31,8 +31,6 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); - VLOG(10) << "Run Op" << Name(); - auto run_func = [this]() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 2320d3926ad..0c096e09800 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,7 +29,7 @@ namespace paddle { namespace framework { namespace details { -struct OpConnectionDetector { +class OpConnectionDetector { public: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; @@ -37,8 +37,8 @@ struct OpConnectionDetector { : graph_(all_ops) {} template - std::unordered_set MaxNoDepOps( - const OpSet &op_set) { + OpSet MaxNoDepOps(const OpSet &op_set) { + if (op_set.size() <= 1) return op_set; using KeyType = typename OpSet::key_type; static_assert( std::is_base_of ops(op_set.begin(), op_set.end()); - std::unordered_set ret; + OpSet ret; auto rels = GetRelations(ops); auto not_before = [](RelationShip r) { return r != kBefore; }; for (size_t i = 0; i < rels.size(); ++i) { @@ -79,7 +79,7 @@ struct OpConnectionDetector { auto it = op_to_idx.find(op); if (it != op_to_idx.end()) { size_t j = it->second; - if (ret[i][j] != kSame) { + if (i != j && ret[i][j] == kSame) { ret[i][j] = kBefore; ret[j][i] = kAfter; found_num += 2; @@ -208,6 +208,10 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( VLOG(10) << "Shrink last living op number of " << var_name << " from " << original_size << " to " << last_live_op.size(); } + + PADDLE_ENFORCE(!last_live_op.empty(), + "Last living ops of %s cannot be empty", var_name); + ref_cnts[i].emplace(var_name, last_live_op.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); } diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 58ec6f2f5d2..466c77469ef 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -49,8 +49,6 @@ void StreamCallbackManager::AddCallback(std::function callback) const { #endif } -StreamCallbackManager::~StreamCallbackManager() { Wait(); } - void StreamCallbackManager::Wait() const { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); { diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 0d5d85bf465..8668bcb1131 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ 
b/paddle/fluid/platform/stream_callback_manager.h @@ -33,7 +33,7 @@ class StreamCallbackManager { public: explicit StreamCallbackManager(const cudaStream_t stream); - ~StreamCallbackManager(); + ~StreamCallbackManager() = default; void AddCallback(std::function callback) const; -- GitLab From 2301abc481bbcfce1f87a102c295df8eeb4ba6c4 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 4 Dec 2018 11:32:10 +0800 Subject: [PATCH 0032/2367] cc library add pslib --- paddle/fluid/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 9f5631b87cb..8556dcbc36c 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -180,7 +180,7 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib) cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) -- GitLab From daba57f752b72be55fc5cdad86de2d5f52bb261c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 4 Dec 2018 16:50:59 +0800 Subject: [PATCH 0033/2367] complete ctr_reader --- .../operators/reader/create_ctr_reader_op.cc | 5 ++- paddle/fluid/operators/reader/ctr_reader.cc | 6 +-- paddle/fluid/operators/reader/ctr_reader.h | 36 +++++++++++++++- paddle/fluid/operators/reader/read_op.cc | 41 +++++++++++------- .../operators/reader/reader_op_registry.cc | 34 +++++++++------ paddle/fluid/pybind/pybind.cc | 1 + python/paddle/fluid/contrib/__init__.py | 3 ++ .../paddle/fluid/contrib/reader/__init__.py | 19 +++++++++ .../paddle/fluid/contrib/reader/ctr_reader.py | 42 +++++++++++++++---- python/setup.py.in | 1 + 10 files changed, 143 insertions(+), 45 deletions(-) create mode 100644 python/paddle/fluid/contrib/reader/__init__.py diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc index 5b9e2ba693f..2a3e80c9152 100644 --- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -51,6 +51,7 @@ class CreateCTRReaderOp : public framework::OperatorBase { auto file_list = Attr>("file_list"); DataDesc data_desc(batch_size, file_list, file_type, file_format, dense_slot_index, sparse_slot_index, sparse_slots); + VLOG(1) << data_desc; out->Reset(std::make_shared(queue_holder->GetQueue(), thread_num, data_desc)); } @@ -69,10 +70,10 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase { "The list of files that need to read"); AddAttr>( "dense_slot_index", - "the sparse slots id that should be extract from file") + "the dense slots id that should be extract from file") .SetDefault({}); AddAttr>( - "dense_slot_index", + "sparse_slot_index", "the sparse slots id that should be extract from file") .SetDefault({}); AddAttr>("sparse_slots", "diff --git a/paddle/fluid/operators/reader/ctr_reader.cc
b/paddle/fluid/operators/reader/ctr_reader.cc index 3595d771b40..946f17750e1 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -157,8 +157,8 @@ void MonitorThread(std::vector* thread_status, } std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } - VLOG(3) << "all reader thread is stopped, push empty data into queue"; - queue->Push({}); + VLOG(3) << "all reader thread is stopped, close the queue"; + queue->Close(); VLOG(3) << "monitor thread exited"; } @@ -247,7 +247,7 @@ static inline void parse_csv_line( int slot_idx = data_desc.dense_slot_index_[i]; auto& slot_data = ret[slot_idx]; std::vector data_in_slot_str; - string_split(ret[slot_idx], ',', &data_in_slot_str); + string_split(slot_data, ',', &data_in_slot_str); std::vector data_in_slot; for (auto& data_str : data_in_slot_str) { (*dense_datas)[i].push_back(std::stof(data_str)); diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 1f4663e3b89..eef6c11aaad 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -60,6 +60,35 @@ struct DataDesc { const std::vector sparse_slot_ids_; }; +inline std::ostream& operator<<(std::ostream& os, const DataDesc& data_desc) { + os << "data_desc:\n"; + os << "\tbatch_size -> " << data_desc.batch_size_ << "\n"; + os << "\tfile_type -> " << data_desc.file_type_ << "\n"; + os << "\tfile_format -> " << data_desc.file_format_ << "\n"; + os << "\tfile_names -> {"; + for (auto& file_name : data_desc.file_names_) { + os << file_name << ","; + } + os << "}\n"; + os << "\tdense_slot_index -> {"; + for (auto& slot : data_desc.dense_slot_index_) { + os << slot << ","; + } + os << "}\n"; + os << "\tsparse_slot_index_ -> {"; + for (auto& slot : data_desc.sparse_slot_index_) { + os << slot << ","; + } + os << "}\n"; + os << "\tsparse_slot_ids_ -> {"; + for (auto& slot : data_desc.sparse_slot_ids_) { + os << slot << ","; + } + os << "}\n"; + + return os; +} + void ReadThread(const std::vector& file_list, const DataDesc& data_desc, int thread_id, std::vector* thread_status, @@ -89,7 +118,7 @@ class CTRReader : public framework::FileReader { } } - ~CTRReader() {} + ~CTRReader() { Shutdown(); } void ReadNext(std::vector* out) override { bool success; @@ -106,7 +135,10 @@ class CTRReader : public framework::FileReader { for (auto& read_thread : read_threads_) { read_thread->join(); } - monitor_thread_->join(); + + if (monitor_thread_) { + monitor_thread_->join(); + } read_threads_.clear(); monitor_thread_.reset(nullptr); diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index a0b70938d35..97faade0428 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -27,15 +27,16 @@ class ReadInferShape : public framework::InferShapeBase { "The ReadOp must take a reader as input."); PADDLE_ENFORCE(ctx->HasOutputs("Out"), "The ReadOp should be assigned with output."); - std::vector reader_dims = ctx->GetReaderDims("Reader"); - std::vector out_names = ctx->Outputs("Out"); - PADDLE_ENFORCE_EQ( - reader_dims.size(), out_names.size(), - "The reader's dim number doesn't match the output number."); - ctx->SetOutputsDim("Out", reader_dims); - if (!ctx->IsRuntime()) { + if (!ctx->IsRuntime() && ctx->Attrs().Get("infer_out")) { + std::vector reader_dims = ctx->GetReaderDims("Reader"); + std::vector out_names = ctx->Outputs("Out"); + PADDLE_ENFORCE_EQ( + reader_dims.size(), 
out_names.size(), + "The reader's dim number doesn't match the output number."); + ctx->SetOutputsDim("Out", reader_dims); auto in_desc = boost::get(ctx->GetInputVarPtrs("Reader")[0]); + std::cout << in_desc->Proto()->SerializeAsString() << std::endl; auto in_lod_levels = in_desc->GetLoDLevels(); auto out_var_ptrs = ctx->GetOutputVarPtrs("Out"); PADDLE_ENFORCE_EQ(in_lod_levels.size(), out_var_ptrs.size(), @@ -53,15 +54,18 @@ class ReadInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { - std::string reader_name = op_desc.Input("Reader")[0]; - std::vector out_names = op_desc.Output("Out"); - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - auto dtypes = reader->GetDataTypes(); - PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); - for (size_t i = 0; i < dtypes.size(); ++i) { - framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetDataType(dtypes[i]); + bool infer_out = boost::get(op_desc.GetAttr("infer_out")); + if (infer_out) { + std::string reader_name = op_desc.Input("Reader")[0]; + std::vector out_names = op_desc.Output("Out"); + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + auto dtypes = reader->GetDataTypes(); + PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); + for (size_t i = 0; i < dtypes.size(); ++i) { + framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); + out.SetType(framework::proto::VarType::LOD_TENSOR); + out.SetDataType(dtypes[i]); + } } } }; @@ -73,6 +77,7 @@ class ReadOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { + VLOG(3) << "read op in"; framework::ReaderHolder* reader = detail::Ref(scope.FindVar(Input("Reader")), "Cannot find reader variable %s", Input("Reader")) @@ -87,7 +92,9 @@ class ReadOp : public framework::OperatorBase { reader->ReadNext(&ins); if (ins.empty()) { + VLOG(3) << "read empty data in"; if (Attr("throw_eof_exp")) { + VLOG(3) << "throw_eof_exp"; PADDLE_THROW_EOF(); } else { ins.resize(out_arg_names.size()); @@ -96,6 +103,7 @@ class ReadOp : public framework::OperatorBase { tensor.mutable_data(framework::make_ddim({0}), dev_place); } } + VLOG(3) << "read empty data out"; } PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) { @@ -120,6 +128,7 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker { " only when the data-balance is enabled in ParallelExecutor" " and it is set by ParallelExecutor instance, not users.") .SetDefault(true); + AddAttr("infer_out", "").SetDefault(true); AddComment(R"DOC( Read Operator diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index b82aab12149..3921eedf94a 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -65,6 +65,10 @@ void FileReaderMakerBase::Make() { "It means the reader will generate two data each time," "whose shapes are [2,3,4] and [5,6] respectively."); AddAttr>("lod_levels", "The LoD levels of each data."); + AddAttr( + "use_data_config", + "Use the config of all datas like shape_concat/ranks/lod_levels") + .SetDefault(true); Apply(); } @@ -75,19 +79,23 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { 
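[Editor's note] The two new attributes introduced by this patch work as a pair: ctr_reader cannot describe its outputs statically, so the Python wrapper creates the reader op with use_data_config=false (which makes the shape inference continuing below skip the shape_concat/ranks/lod_levels checks) and emits its read op with infer_out=false, setting the reader variable's dtypes directly instead. The stray std::cout of the serialized reader proto in the read_op hunk above looks like leftover debugging output.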
PADDLE_ENFORCE(ctx->HasOutput("Out"), "The output file reader should not be null."); - const auto shape_concat = ctx->Attrs().Get>("shape_concat"); - const auto ranks = ctx->Attrs().Get>("ranks"); - std::vector shapes = RestoreShapes(shape_concat, ranks); - ctx->SetReaderDims("Out", shapes); - - const auto lod_levels = ctx->Attrs().Get>("lod_levels"); - PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), - "The number of 'lod_levels'(%d) doesn't match the number " - "of 'shapes'(%d).", - lod_levels.size(), shapes.size()); - framework::VarDesc* reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - reader->SetLoDLevels(lod_levels); + bool use_data_config = ctx->Attrs().Get("use_data_config"); + if (use_data_config) { + const auto shape_concat = + ctx->Attrs().Get>("shape_concat"); + const auto ranks = ctx->Attrs().Get>("ranks"); + std::vector shapes = RestoreShapes(shape_concat, ranks); + ctx->SetReaderDims("Out", shapes); + + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); + } } void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f0a5d1afc97..681b213b466 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -364,6 +364,7 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference); py::class_(m, "Reader", "") + .def("start", &framework::ReaderHolder::Start) .def("reset", &framework::ReaderHolder::ResetAll); using LoDTensorBlockingQueue = diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index 3bf2fe5db0c..5d4b1577275 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -22,9 +22,12 @@ from . import op_frequence from .op_frequence import * from . import quantize from .quantize import * +from . import reader +from .reader import * __all__ = [] __all__ += decoder.__all__ __all__ += memory_usage_calc.__all__ __all__ += op_frequence.__all__ __all__ += quantize.__all__ +__all__ += reader.__all__ diff --git a/python/paddle/fluid/contrib/reader/__init__.py b/python/paddle/fluid/contrib/reader/__init__.py new file mode 100644 index 00000000000..4cf85ffc166 --- /dev/null +++ b/python/paddle/fluid/contrib/reader/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . 
import ctr_reader + +__all__ = ctr_reader.__all__ diff --git a/python/paddle/fluid/contrib/reader/ctr_reader.py b/python/paddle/fluid/contrib/reader/ctr_reader.py index d7133562de4..aad8ded87da 100644 --- a/python/paddle/fluid/contrib/reader/ctr_reader.py +++ b/python/paddle/fluid/contrib/reader/ctr_reader.py @@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \ default_startup_program, Variable from paddle.fluid.unique_name import generate as unique_name +__all__ = ['ctr_reader'] + def monkey_patch_reader_methods(reader): def __get_reader__(): @@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader): def reset(): return __get_reader__().reset() + def start(): + return __get_reader__().start() + reader.reset = reset + reader.start = start reader.stop_gradient = True reader.persistable = True return reader @@ -44,13 +50,18 @@ def _copy_reader_var_(block, var): return new_var -def ctr_reader(feed_data, - capacity, - thread_num, - batch_size, - file_list, - slots, - name=None): +def ctr_reader( + feed_dict, + file_type, # gzip or plain + file_format, # csv or svm + dense_slot_indexs, + sparse_slot_indexs, + capacity, + thread_num, + batch_size, + file_list, + slots, + name=None): """ Create a CTR reader for data feeding in Python @@ -99,12 +110,22 @@ def ctr_reader(feed_data, inputs={'blocking_queue': [queue_name]}, outputs={'Out': [reader_var]}, attrs={ + 'use_data_config': False, 'thread_num': thread_num, 'batch_size': batch_size, 'file_list': file_list, - 'slots': slots, + 'file_type': file_type, + 'file_format': file_format, + 'dense_slot_index': dense_slot_indexs, + 'sparse_slot_index': sparse_slot_indexs, + 'sparse_slots': slots, + 'ranks': [], + 'lod_levels': [], + 'shape_concat': [] }) + dtypes = [data.dtype for data in feed_dict] + reader_var.desc.set_dtypes(dtypes) reader_var.persistable = True main_prog_reader_var = _copy_reader_var_( @@ -118,6 +139,9 @@ def ctr_reader(feed_data, main_blk = default_main_program().current_block() main_blk.append_op( - type='read', inputs={'Reader': [reader]}, outputs={'Out': feed_data}) + type='read', + inputs={'Reader': [reader]}, + attrs={'infer_out': False}, + outputs={'Out': feed_dict}) return reader diff --git a/python/setup.py.in b/python/setup.py.in index 200b96ec54e..d5d82f643e7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -107,6 +107,7 @@ packages=['paddle', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', + 'paddle.fluid.contrib.reader', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details'] -- GitLab From 7b7fe01cae07393430d6f3062497dff19233eeba Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 4 Dec 2018 16:58:02 +0800 Subject: [PATCH 0034/2367] optimize code --- paddle/fluid/operators/reader/ctr_reader.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 946f17750e1..65b300d152f 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -101,16 +101,16 @@ class GzipReader : public Reader { class PlainFileReader : public Reader { public: explicit PlainFileReader(const std::string& file_name) - : myfile_(file_name.c_str()) {} + : stream_(file_name.c_str()) {} ~PlainFileReader() {} - bool HasNext() override { return myfile_.peek() != EOF; } + bool HasNext() override { return stream_.peek() != EOF; } - void NextLine(std::string* line) override { std::getline(myfile_, *line); } + 
void NextLine(std::string* line) override { std::getline(stream_, *line); } private: - std::ifstream myfile_; + std::ifstream stream_; }; template -- GitLab From 2d0d037d8e9e1580d38e800fd1a0d0b0056422eb Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 09:45:50 +0000 Subject: [PATCH 0035/2367] fix while_op eager deletion bug add unittest test=develop --- paddle/fluid/framework/executor.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 84 +++++++++++++------ .../unittests/test_eager_deletion_mnist.py | 27 ++++++ .../test_eager_deletion_seresnext.py | 27 ++++++ .../test_eager_deletion_transformer.py | 27 ++++++ 5 files changed, 140 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5823f33034a..f443c2d8cf6 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ static void DeleteUnusedTensors( if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(2) << "Erase tensor \'" << name << "\'"; if (var->IsType()) { erase_tensors.insert(var->GetMutable()); } else if (var->IsType()) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8410b40586..da7cad82d8d 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,21 +73,12 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto &skip_eager_deletion_vars = - Attr>("skip_eager_deletion_vars"); - if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { - std::string debug_string = - "Skip " + std::to_string(skip_eager_deletion_vars.size()) + - " vars in eager deletion mode: "; - for (auto &var : skip_eager_deletion_vars) { - debug_string.append(var); - debug_string.push_back(' '); - } - VLOG(10) << debug_string; + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); } - auto ctx = - executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); @@ -110,7 +115,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. 
Some layers may run faster when this is true.") .SetDefault(false); - AddAttr>("skip_eager_deletion_vars", + AddAttr>(kSkipEagerDeletionVars, "Vars that would skip eager deletion." "Users should not set this manually.") .SetDefault(std::vector()); @@ -137,7 +142,12 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + } + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); @@ -359,29 +369,51 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ + /* The followi_ng codes are used in eager deletion mode */ + std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set skip_vars; + std::unordered_set fwd_skip_vars; for (auto *op_desc : grad_block->AllOps()) { + auto skippable = [&](const std::string &name) { + return !grad_block->HasVar(name) && + (fwd_block->HasVarRecursive(name) || + parent_block->HasVarRecursive(name)); + }; for (auto &in_arg_name : op_desc->InputArgumentNames()) { - // If input var of ops inside grad_block is not from grad_block, - // it cannot be deleted when forward while_op runs - if (in_arg_name != framework::kEmptyVarName && - !grad_block->HasVar(in_arg_name)) { - skip_vars.insert(in_arg_name); + if (skippable(in_arg_name)) { + fwd_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (skippable(out_arg_name)) { + fwd_skip_vars.insert(out_arg_name); } } } - if (!skip_vars.empty()) { + if (!fwd_skip_vars.empty()) { // FIXME(zjl): ugly const_cast here, maybe we should find a better way // to modify forward while_op auto &fwd_while_op = const_cast(ForwardOp()); - fwd_while_op.SetAttr( - "skip_eager_deletion_vars", - std::vector(skip_vars.begin(), skip_vars.end())); + fwd_while_op.SetAttr(kSkipEagerDeletionVars, + std::vector(fwd_skip_vars.begin(), + fwd_skip_vars.end())); + } + + // Find backward skip vars + auto fwd_input = Input(kX); + for (size_t i = 0; i < igs.size(); ++i) { + if (igs[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(igs[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); } } + while_grad->SetAttr( + kSkipEagerDeletionVars, + std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end())); return std::unique_ptr(while_grad); } diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 00000000000..7ec1f0ae753 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py new file mode 100644 index 00000000000..2dcdbdb8f13 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_seresnext import TestResnet + + +class EagerDeletionTestSEResNext(TestResnet): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 00000000000..754d5fd4095 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
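[Editor's note] All three tests added by this patch follow the same recipe, and the ordering is deliberate: FLAGS_eager_delete_tensor_gb is injected via os.environ before the first paddle.fluid import, because __bootstrap__ (see the python/paddle/fluid/__init__.py hunk earlier in this series) evaluates the flag list only once at import time; each test class then simply subclasses an existing suite (TestMNIST, TestResnet, TestTransformer), so every inherited case re-runs with eager deletion switched on.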
+ +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() -- GitLab From e694d0c2e487a854103e0cc4796f92af6d27ccfd Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 09:45:50 +0000 Subject: [PATCH 0036/2367] fix while_op eager deletion bug add unittest test=develop --- .../details/eager_deletion_op_handle.cc | 2 + paddle/fluid/framework/executor.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 84 +++++++++++++------ .../unittests/test_eager_deletion_mnist.py | 27 ++++++ .../test_eager_deletion_seresnext.py | 27 ++++++ .../test_eager_deletion_transformer.py | 27 ++++++ 6 files changed, 142 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 41f616035d7..54715fed8d9 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,9 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5823f33034a..f443c2d8cf6 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ static void DeleteUnusedTensors( if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(2) << "Erase tensor \'" << name << "\'"; if (var->IsType()) { erase_tensors.insert(var->GetMutable()); } else if (var->IsType()) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8410b40586..da7cad82d8d 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,21 +73,12 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto &skip_eager_deletion_vars = - Attr>("skip_eager_deletion_vars"); - if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { - std::string debug_string = - "Skip " + std::to_string(skip_eager_deletion_vars.size()) + - " vars in eager deletion mode: "; - for (auto &var : 
skip_eager_deletion_vars) { - debug_string.append(var); - debug_string.push_back(' '); - } - VLOG(10) << debug_string; + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); } - auto ctx = - executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); @@ -110,7 +115,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); - AddAttr>("skip_eager_deletion_vars", + AddAttr>(kSkipEagerDeletionVars, "Vars that would skip eager deletion." "Users should not set this manually.") .SetDefault(std::vector()); @@ -137,7 +142,12 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + } + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); @@ -359,29 +369,51 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ + /* The followi_ng codes are used in eager deletion mode */ + std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set skip_vars; + std::unordered_set fwd_skip_vars; for (auto *op_desc : grad_block->AllOps()) { + auto skippable = [&](const std::string &name) { + return !grad_block->HasVar(name) && + (fwd_block->HasVarRecursive(name) || + parent_block->HasVarRecursive(name)); + }; for (auto &in_arg_name : op_desc->InputArgumentNames()) { - // If input var of ops inside grad_block is not from grad_block, - // it cannot be deleted when forward while_op runs - if (in_arg_name != framework::kEmptyVarName && - !grad_block->HasVar(in_arg_name)) { - skip_vars.insert(in_arg_name); + if (skippable(in_arg_name)) { + fwd_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (skippable(out_arg_name)) { + fwd_skip_vars.insert(out_arg_name); } } } - if (!skip_vars.empty()) { + if (!fwd_skip_vars.empty()) { // FIXME(zjl): ugly const_cast here, maybe we should find a better way // to modify forward while_op auto &fwd_while_op = const_cast(ForwardOp()); - fwd_while_op.SetAttr( - "skip_eager_deletion_vars", - std::vector(skip_vars.begin(), skip_vars.end())); + fwd_while_op.SetAttr(kSkipEagerDeletionVars, + std::vector(fwd_skip_vars.begin(), + fwd_skip_vars.end())); + } + + // Find backward skip vars + auto fwd_input = Input(kX); + for (size_t i = 0; i < igs.size(); ++i) { + if (igs[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(igs[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); } } + while_grad->SetAttr( + kSkipEagerDeletionVars, + std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end())); return std::unique_ptr(while_grad); } diff --git 
a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 00000000000..7ec1f0ae753 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py new file mode 100644 index 00000000000..2dcdbdb8f13 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_seresnext import TestResnet + + +class EagerDeletionTestSEResNext(TestResnet): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 00000000000..754d5fd4095 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
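[Editor's note] Patch 0036 is a re-land of patch 0035 rather than an accidental duplicate: the commit message, while_op changes, and test files are identical, but it adds one hunk to framework/details/eager_deletion_op_handle.cc that wraps the cuda_device_guard.h include in #ifdef PADDLE_WITH_CUDA, so CPU-only builds keep compiling.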
+ +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() -- GitLab From 208f9125121bf5e7e314de787aa0802aa2e2f2bd Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Fri, 30 Nov 2018 15:24:04 +0100 Subject: [PATCH 0037/2367] Implement MKL-DNN Concat test=develop --- paddle/fluid/operators/concat_mkldnn_op.cc | 217 ++++++++++++++++++ paddle/fluid/operators/concat_op.cc | 20 ++ .../tests/unittests/test_concat_mkldnn_op.py | 56 +++++ 3 files changed, 293 insertions(+) create mode 100644 paddle/fluid/operators/concat_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/concat_mkldnn_op.cc new file mode 100644 index 00000000000..c6652b78851 --- /dev/null +++ b/paddle/fluid/operators/concat_mkldnn_op.cc @@ -0,0 +1,217 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::concat; +using mkldnn::stream; +using platform::to_void_cast; + +// Generate keys for storing/retriving primitives for this operator +// TODO(jczaja): Make hashing function more optimial +static std::string gethash(const memory::dims& input_dims, + const std::string& pooling_type, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string& suffix) { + auto dims2str = [](const memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + }; + return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) + + dims2str(paddings) + pooling_type + suffix; +} + +static void EnforceLayouts(const std::vector inputs) { + for (auto* input : inputs) { + const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN; + const bool is_format_defined = input->format() != + memory::format::format_undef; + PADDLE_ENFORCE(is_layout_correct && is_format_defined, + "Wrong layout/format set for Input tensor"); + } +} + +static memory::primitive_desc CreateMemPrimDesc( + const framework::Tensor& input, const mkldnn::engine& engine) { + constexpr auto data_type = mkldnn::memory::f32; + const auto dims = paddle::framework::vectorize2int(input.dims()); + const auto format = input.format(); + auto description = memory::desc(dims, data_type, format); + auto mem_prim_desc = memory::primitive_desc(description, engine); + return mem_prim_desc; +} + +static platform::CPUPlace GetCpuPlace( + const paddle::framework::ExecutionContext& ctx) { + auto place = 
ctx.GetPlace(); + PADDLE_ENFORCE(paddle::platform::is_cpu_place(place), + "It must use CPUPlace."); + return boost::get(place); +} + +template +class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto place = GetCpuPlace(ctx); + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto multi_input = ctx.MultiInput("X"); + framework::Tensor* output = ctx.Output("Out"); + int64_t concat_axis = static_cast(ctx.Attr("axis")); + + EnforceLayouts(multi_input); + + std::vector srcs_pd; + std::vector srcs; + for (size_t i = 0; i < multi_input.size(); i++) { + auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); + srcs_pd.push_back(mem_prim_desc); + srcs.push_back(memory(mem_prim_desc, to_void_cast(multi_input[i]->data()))); + } + auto dst_dims = paddle::framework::vectorize2int(output->dims()); + auto dst_desc = memory::desc(dst_dims, mkldnn::memory::f32, memory::format::any); + auto concat_pd = concat::primitive_desc(dst_desc, static_cast(concat_axis), srcs_pd); + auto dst_mem = memory(concat_pd.dst_primitive_desc(), output->mutable_data(place)); + + std::vector inputs; //= {srcs}; + inputs.reserve(srcs.size()); + for (size_t i = 0; i < srcs.size(); i++) { + inputs.push_back(srcs[i]); + } + auto concat_prim = concat(concat_pd, inputs, dst_mem); + + std::vector pipeline; + pipeline.push_back(concat_prim); + stream(stream::kind::eager).submit(pipeline).wait(); // TODO(mgallus): When this is not workin' split into decl and def + + /* + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + auto input_format = input->format(); + memory::format output_format{memory::format::format_undef}; + + const std::string key = gethash(src_tz, pooling_type, ksize, strides, + paddings, ctx.op().Output("Out")); + const std::string key_pool_p = key + "@pool_p"; + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; + const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; + + auto pool_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); + if (pool_p == nullptr) { + const std::vector& padding_left_top(paddings); + std::vector padding_right_bottom(paddings); + bool ceil_mode = ctx.Attr("ceil_mode"); + if (ceil_mode) { + CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, + padding_right_bottom); + } + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), input_format); + + auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::any); + + std::shared_ptr pool_pd = + CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, + padding_right_bottom, ksize, pooling_type, + mkldnn_engine, ceil_mode, is_test); + + // save pool_pd into global device context to be referred in backward path + if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd); + + auto src_memory = std::make_shared(pool_pd->src_primitive_desc(), + to_void_cast(input_data)); + auto dst_memory = + std::make_shared(pool_pd->dst_primitive_desc(), output_data); + + dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); + dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); + + if 
(is_test) { + pool_p = std::make_shared(*pool_pd, *src_memory, + *dst_memory); + } else { + std::shared_ptr workspace_memory = + CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); + + // save pool_workspace_memory to be referred in backward path + dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); + + pool_p = std::make_shared( + *pool_pd, *src_memory, *dst_memory, *workspace_memory); + } + + dev_ctx.SetBlob(key_pool_p, pool_p); + + output_format = + (memory::format)dst_memory->get_primitive_desc().desc().data.format; + } else { + // Primitives already exist + auto pool_src_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); + PADDLE_ENFORCE(pool_src_memory_p != nullptr, + "Fail to find pooling src mem_p in device context"); + auto pool_dst_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); + PADDLE_ENFORCE(pool_dst_memory_p != nullptr, + "Fail to find pooling dst mem_p in device context"); + pool_src_memory_p->set_data_handle(to_void_cast(input_data)); + pool_dst_memory_p->set_data_handle(output_data); + + output_format = (memory::format)pool_dst_memory_p->get_primitive_desc() + .desc() + .data.format; + } + + // push primitive to stream and wait until it's executed + std::vector pipeline{*(pool_p.get())}; + stream(stream::kind::eager).submit(pipeline).wait(); + */ + output->mutable_data(place); + output->set_layout(DataLayout::kMKLDNN); + output->set_format((memory::format)dst_mem.get_primitive_desc().desc() + .data.format); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConcatMKLDNNOpKernel) diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 57817da71ad..7e58f9cde13 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include +#include namespace paddle { namespace operators { @@ -59,6 +60,21 @@ class ConcatOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", /*->*/ "Out"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]); + + #ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } + #endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { @@ -66,6 +82,9 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Input tensors of concat operator.").AsDuplicable(); AddOutput("Out", "Output tensor of concat operator."); + AddAttr("use_mkldnn", + "(bool, default false) Indicates if MKL-DNN kernel will be used") + .SetDefault(false); AddAttr("axis", "The axis along which the input tensors will be concatenated.") .SetDefault(0); @@ -82,6 +101,7 @@ Examples: [5,6]] )DOC"); + } }; diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py new file mode 100644 index 00000000000..c590687a24d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
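[Editor's note] The MKL-DNN tests below deliberately reuse the reference concat cases (TestConcatOp and friends) and only flip the use_mkldnn attribute, so the existing numpy concatenation remains the ground truth; test_check_grad is stubbed out because this patch registers a forward MKL-DNN kernel only, leaving concat_grad on the plain CPU kernel.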
+ +from __future__ import print_function + +import unittest +from test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3 + + +class TestMKLDNNConcatOp(TestConcatOp): + def setUp(self): + super(TestMKLDNNConcatOp, self).setUp() + self.attrs["use_mkldnn"] = True + + def test_check_grad(self): + pass + + def init_kernel_type(self): + self.use_mkldnn = True + +class TestMKLDNNConcatOp2(TestConcatOp2): + def setUp(self): + super(TestMKLDNNConcatOp2, self).setUp() + self.attrs["use_mkldnn"] = True + + def test_check_grad(self): + pass + + def init_kernel_type(self): + self.use_mkldnn = True + +class TestMKLDNNConcatOp3(TestConcatOp3): + def setUp(self): + super(TestMKLDNNConcatOp3, self).setUp() + self.attrs["use_mkldnn"] = True + + def test_check_grad(self): + pass + + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + unittest.main() -- GitLab From 738069e491d5649b39706aed2526622a1594332c Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Mon, 3 Dec 2018 14:45:27 +0100 Subject: [PATCH 0038/2367] Refactor MKL-DNN Concat test=develop --- paddle/fluid/operators/concat_mkldnn_op.cc | 209 +++++++-------------- 1 file changed, 72 insertions(+), 137 deletions(-) diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/concat_mkldnn_op.cc index c6652b78851..37b2788d63b 100644 --- a/paddle/fluid/operators/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/concat_mkldnn_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -26,25 +27,6 @@ using mkldnn::concat; using mkldnn::stream; using platform::to_void_cast; -// Generate keys for storing/retriving primitives for this operator -// TODO(jczaja): Make hashing function more optimial -static std::string gethash(const memory::dims& input_dims, - const std::string& pooling_type, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string& suffix) { - auto dims2str = [](const memory::dims& operand_dims) { - std::string dstr = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - dstr += std::to_string(operand_dims[i]) + "-"; - } - return dstr; - }; - return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) + - dims2str(paddings) + pooling_type + suffix; -} - static void EnforceLayouts(const std::vector inputs) { for (auto* input : inputs) { const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN; @@ -56,7 +38,7 @@ static void EnforceLayouts(const std::vector inputs) { } static memory::primitive_desc CreateMemPrimDesc( - const framework::Tensor& input, const mkldnn::engine& engine) { + const Tensor& input, const mkldnn::engine& engine) { constexpr auto data_type = mkldnn::memory::f32; const auto dims = paddle::framework::vectorize2int(input.dims()); const auto format = input.format(); @@ -65,6 +47,11 @@ static memory::primitive_desc CreateMemPrimDesc( return mem_prim_desc; } +static mkldnn::memory::format GetDstMemFormat( + const concat::primitive_desc& concat_pd) { + return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; +} + static platform::CPUPlace GetCpuPlace( const paddle::framework::ExecutionContext& ctx) { auto place = ctx.GetPlace(); @@ -73,139 +60,87 @@ static platform::CPUPlace GetCpuPlace( return boost::get(place); } -template -class 
ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto place = GetCpuPlace(ctx); +static const mkldnn::engine& GetMKLDNNEngine( + const paddle::framework::ExecutionContext& ctx) { auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); + return dev_ctx.GetEngine(); +} - auto multi_input = ctx.MultiInput("X"); - framework::Tensor* output = ctx.Output("Out"); - int64_t concat_axis = static_cast(ctx.Attr("axis")); +template +class ConcatPrimitiveFactory { + public: + concat::primitive_desc CreateConcatPrimDescriptor( + const std::vector multi_input, Tensor* output, + int concat_axis, const mkldnn::engine& mkldnn_engine) { + CreateSourcesDescriptors(multi_input, mkldnn_engine); + auto dst_desc = CreateDstMemDescriptor(output); + return concat::primitive_desc(dst_desc, concat_axis, srcs_pd); + } - EnforceLayouts(multi_input); + concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd, + Tensor* output, platform::CPUPlace place) { + CreateSourcePrimitiveAts(); + auto dst_mem = CreateDstMemory(concat_pd, output, place); + return concat(concat_pd, inputs, dst_mem); + } + + private: + memory::desc CreateDstMemDescriptor(Tensor* output) { + auto dst_dims = paddle::framework::vectorize2int(output->dims()); + return memory::desc(dst_dims, platform::MKLDNNGetDataType(), + memory::format::any); + } + + mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd, + Tensor* output, platform::CPUPlace place) { + return memory(concat_pd.dst_primitive_desc(), + output->mutable_data(place)); + } - std::vector srcs_pd; - std::vector srcs; + void CreateSourcesDescriptors(const std::vector multi_input, + const mkldnn::engine& mkldnn_engine) { for (size_t i = 0; i < multi_input.size(); i++) { auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); srcs_pd.push_back(mem_prim_desc); - srcs.push_back(memory(mem_prim_desc, to_void_cast(multi_input[i]->data()))); + srcs.push_back(memory(mem_prim_desc, + to_void_cast(multi_input[i]->data()))); } - auto dst_dims = paddle::framework::vectorize2int(output->dims()); - auto dst_desc = memory::desc(dst_dims, mkldnn::memory::f32, memory::format::any); - auto concat_pd = concat::primitive_desc(dst_desc, static_cast(concat_axis), srcs_pd); - auto dst_mem = memory(concat_pd.dst_primitive_desc(), output->mutable_data(place)); + } - std::vector inputs; //= {srcs}; + void CreateSourcePrimitiveAts() { inputs.reserve(srcs.size()); for (size_t i = 0; i < srcs.size(); i++) { inputs.push_back(srcs[i]); } - auto concat_prim = concat(concat_pd, inputs, dst_mem); - - std::vector pipeline; - pipeline.push_back(concat_prim); - stream(stream::kind::eager).submit(pipeline).wait(); // TODO(mgallus): When this is not workin' split into decl and def - - /* - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - - auto input_format = input->format(); - memory::format output_format{memory::format::format_undef}; - - const std::string key = gethash(src_tz, pooling_type, ksize, strides, - paddings, ctx.op().Output("Out")); - const std::string key_pool_p = key + "@pool_p"; - const std::string key_pool_pd = key + "@pool_pd"; - const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; - const std::string key_pool_dst_mem_p = key + 
"@pool_dst_mem_p"; - const std::string key_pool_workspace_memory = - key + "@pool_workspace_memory"; - - auto pool_p = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); - if (pool_p == nullptr) { - const std::vector& padding_left_top(paddings); - std::vector padding_right_bottom(paddings); - bool ceil_mode = ctx.Attr("ceil_mode"); - if (ceil_mode) { - CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, - padding_right_bottom); - } - auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), input_format); - - auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, - mkldnn::memory::format::any); - - std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, - padding_right_bottom, ksize, pooling_type, - mkldnn_engine, ceil_mode, is_test); - - // save pool_pd into global device context to be referred in backward path - if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd); - - auto src_memory = std::make_shared(pool_pd->src_primitive_desc(), - to_void_cast(input_data)); - auto dst_memory = - std::make_shared(pool_pd->dst_primitive_desc(), output_data); - - dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); - dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); - - if (is_test) { - pool_p = std::make_shared(*pool_pd, *src_memory, - *dst_memory); - } else { - std::shared_ptr workspace_memory = - CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); - - // save pool_workspace_memory to be referred in backward path - dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); - - pool_p = std::make_shared( - *pool_pd, *src_memory, *dst_memory, *workspace_memory); - } - - dev_ctx.SetBlob(key_pool_p, pool_p); - - output_format = - (memory::format)dst_memory->get_primitive_desc().desc().data.format; - } else { - // Primitives already exist - auto pool_src_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); - PADDLE_ENFORCE(pool_src_memory_p != nullptr, - "Fail to find pooling src mem_p in device context"); - auto pool_dst_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); - PADDLE_ENFORCE(pool_dst_memory_p != nullptr, - "Fail to find pooling dst mem_p in device context"); - pool_src_memory_p->set_data_handle(to_void_cast(input_data)); - pool_dst_memory_p->set_data_handle(output_data); - - output_format = (memory::format)pool_dst_memory_p->get_primitive_desc() - .desc() - .data.format; - } + } + + private: + std::vector srcs_pd; + std::vector srcs; + std::vector inputs; +}; + +template +class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto place = GetCpuPlace(ctx); + const auto& mkldnn_engine = GetMKLDNNEngine(ctx); + + auto multi_input = ctx.MultiInput("X"); + EnforceLayouts(multi_input); + Tensor* output = ctx.Output("Out"); + int64_t concat_axis = static_cast(ctx.Attr("axis")); + + ConcatPrimitiveFactory prim_creator; + auto concat_pd = prim_creator.CreateConcatPrimDescriptor(multi_input, + output, static_cast(concat_axis), mkldnn_engine); + auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place); + stream(stream::kind::eager).submit({concat}).wait(); - // push primitive to stream and wait until it's executed - std::vector pipeline{*(pool_p.get())}; - stream(stream::kind::eager).submit(pipeline).wait(); - */ - output->mutable_data(place); output->set_layout(DataLayout::kMKLDNN); - 
output->set_format((memory::format)dst_mem.get_primitive_desc().desc() - .data.format); + output->set_format(GetDstMemFormat(concat_pd)); } }; } // namespace operators -- GitLab From f2a880421ebdfb6e0c9b2b3809d8ba8449b09ea2 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 4 Dec 2018 10:10:02 +0100 Subject: [PATCH 0039/2367] Fix style @ concat integration and tests test=develop --- paddle/fluid/operators/concat_mkldnn_op.cc | 33 +++++++++---------- paddle/fluid/operators/concat_op.cc | 33 ++++++++++--------- .../tests/unittests/test_concat_mkldnn_op.py | 2 ++ 3 files changed, 35 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/concat_mkldnn_op.cc index 37b2788d63b..b8456aac9da 100644 --- a/paddle/fluid/operators/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/concat_mkldnn_op.cc @@ -30,15 +30,15 @@ using platform::to_void_cast; static void EnforceLayouts(const std::vector inputs) { for (auto* input : inputs) { const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN; - const bool is_format_defined = input->format() != - memory::format::format_undef; + const bool is_format_defined = + input->format() != memory::format::format_undef; PADDLE_ENFORCE(is_layout_correct && is_format_defined, "Wrong layout/format set for Input tensor"); } } -static memory::primitive_desc CreateMemPrimDesc( - const Tensor& input, const mkldnn::engine& engine) { +static memory::primitive_desc CreateMemPrimDesc(const Tensor& input, + const mkldnn::engine& engine) { constexpr auto data_type = mkldnn::memory::f32; const auto dims = paddle::framework::vectorize2int(input.dims()); const auto format = input.format(); @@ -48,8 +48,8 @@ static memory::primitive_desc CreateMemPrimDesc( } static mkldnn::memory::format GetDstMemFormat( - const concat::primitive_desc& concat_pd) { - return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; + const concat::primitive_desc& concat_pd) { + return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; } static platform::CPUPlace GetCpuPlace( @@ -61,10 +61,9 @@ static platform::CPUPlace GetCpuPlace( } static const mkldnn::engine& GetMKLDNNEngine( - const paddle::framework::ExecutionContext& ctx) { - auto& dev_ctx = - ctx.template device_context(); - return dev_ctx.GetEngine(); + const paddle::framework::ExecutionContext& ctx) { + auto& dev_ctx = ctx.template device_context(); + return dev_ctx.GetEngine(); } template @@ -89,7 +88,7 @@ class ConcatPrimitiveFactory { memory::desc CreateDstMemDescriptor(Tensor* output) { auto dst_dims = paddle::framework::vectorize2int(output->dims()); return memory::desc(dst_dims, platform::MKLDNNGetDataType(), - memory::format::any); + memory::format::any); } mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd, @@ -101,10 +100,10 @@ class ConcatPrimitiveFactory { void CreateSourcesDescriptors(const std::vector multi_input, const mkldnn::engine& mkldnn_engine) { for (size_t i = 0; i < multi_input.size(); i++) { - auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); - srcs_pd.push_back(mem_prim_desc); - srcs.push_back(memory(mem_prim_desc, - to_void_cast(multi_input[i]->data()))); + auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); + srcs_pd.push_back(mem_prim_desc); + srcs.push_back( + memory(mem_prim_desc, to_void_cast(multi_input[i]->data()))); } } @@ -134,8 +133,8 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { int64_t concat_axis = 
static_cast(ctx.Attr("axis")); ConcatPrimitiveFactory prim_creator; - auto concat_pd = prim_creator.CreateConcatPrimDescriptor(multi_input, - output, static_cast(concat_axis), mkldnn_engine); + auto concat_pd = prim_creator.CreateConcatPrimDescriptor( + multi_input, output, static_cast(concat_axis), mkldnn_engine); auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place); stream(stream::kind::eager).submit({concat}).wait(); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 7e58f9cde13..7466107cf2a 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" +#include #include #include -#include namespace paddle { namespace operators { @@ -63,18 +63,19 @@ class ConcatOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]); - - #ifdef PADDLE_WITH_MKLDNN - if (platform::CanMKLDNNBeUsed(ctx)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } - #endif - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]); + +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { @@ -82,9 +83,10 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Input tensors of concat operator.").AsDuplicable(); AddOutput("Out", "Output tensor of concat operator."); - AddAttr("use_mkldnn", - "(bool, default false) Indicates if MKL-DNN kernel will be used") - .SetDefault(false); + AddAttr( + "use_mkldnn", + "(bool, default false) Indicates if MKL-DNN kernel will be used") + .SetDefault(false); AddAttr("axis", "The axis along which the input tensors will be concatenated.") .SetDefault(0); @@ -101,7 +103,6 @@ Examples: [5,6]] )DOC"); - } }; diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py index c590687a24d..0ea44c0e4e4 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py @@ -29,6 +29,7 @@ class TestMKLDNNConcatOp(TestConcatOp): def init_kernel_type(self): self.use_mkldnn = True + class TestMKLDNNConcatOp2(TestConcatOp2): def setUp(self): super(TestMKLDNNConcatOp2, self).setUp() @@ -40,6 +41,7 @@ class TestMKLDNNConcatOp2(TestConcatOp2): def init_kernel_type(self): self.use_mkldnn = True + class TestMKLDNNConcatOp3(TestConcatOp3): def setUp(self): super(TestMKLDNNConcatOp3, self).setUp() -- GitLab From 6fdbb365ce8e726fdc22665bd65c9c2e4ae43859 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 4 Dec 2018 13:07:51 +0100 Subject: [PATCH 0040/2367] Include MKL-DNN header to concat op only when flag is set test=develop --- paddle/fluid/operators/concat_op.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 
deletions(-)

diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 7466107cf2a..194f9cf5033 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/concat_op.h"
-
-#include
 #include
 #include
 
+#ifdef PADDLE_WITH_MKLDNN
+#include
+#endif
+
 namespace paddle {
 namespace operators {
 using framework::Tensor;
-- 
GitLab

From 49130f9b8f41cda0ac50e5c57f4b033c260c7541 Mon Sep 17 00:00:00 2001
From: dongdaxiang
Date: Tue, 4 Dec 2018 20:15:52 +0800
Subject: [PATCH 0041/2367] refine downpour sgd API and adapt to pslib proto
 desc

---
 paddle/fluid/framework/CMakeLists.txt       |  2 +-
 .../paddle/fluid/distribute_lookup_table.py | 18 ++++++++
 python/paddle/fluid/distributed/downpour.py | 19 +++++---
 python/paddle/fluid/distributed/node.py     | 45 +++++++++----------
 4 files changed, 52 insertions(+), 32 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 9f5631b87cb..8556dcbc36c 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -180,7 +180,7 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         graph build_strategy
         fast_threaded_ssa_graph_executor variable_helper)
 
-cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
+cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib)
 
 cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py
index 52d9ce75f8d..a903257fa94 100644
--- a/python/paddle/fluid/distribute_lookup_table.py
+++ b/python/paddle/fluid/distribute_lookup_table.py
@@ -15,6 +15,24 @@
 LOOKUP_TABLE_TYPE = "lookup_table"
 
 
+def find_distributed_lookup_table_inputs(program, table_name):
+    local_vars = program.current_block().vars
+    inputs = []
+    for op in program.global_block().ops:
+        if op.type == LOOKUP_TABLE_TYPE:
+            if table_name == op.input("W")[0]:
+                inputs.extend([local_vars[name] for name in op.input("Ids")])
+    return inputs
+
+def find_distributed_lookup_table_outputs(program, table_name):
+    local_vars = program.current_block().vars
+    outputs = []
+    for op in program.global_block().ops:
+        if op.type == LOOKUP_TABLE_TYPE:
+            if table_name == op.input("W")[0]:
+                outputs.extend([local_vars[name] for name in op.output("Out")])
+    return outputs
+
 def find_distributed_lookup_table(program):
     """
     Find distribute lookup table in program.
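The two helpers added above are how the refactored DownpourSGD (next diff)
discovers the slot Ids feeding the shared table and the embeddings it
produces, so callers no longer pass prefetch lists in by hand. A minimal
usage sketch follows; it is illustrative only, not part of this patch, and
assumes a program that already contains a distributed lookup_table op:

    import paddle.fluid as fluid
    from paddle.fluid.distribute_lookup_table import (
        find_distributed_lookup_table, find_distributed_lookup_table_inputs,
        find_distributed_lookup_table_outputs)

    prog = fluid.default_main_program()
    table_name = find_distributed_lookup_table(prog)   # e.g. an embedding param
    slot_ids = find_distributed_lookup_table_inputs(prog, table_name)
    slot_embs = find_distributed_lookup_table_outputs(prog, table_name)
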
diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py
index 551a4714950..3fe4afdbffb 100644
--- a/python/paddle/fluid/distributed/downpour.py
+++ b/python/paddle/fluid/distributed/downpour.py
@@ -3,6 +3,8 @@ from .node import DownpourWorker
 from ..backward import append_backward
 import ps_pb2 as pslib
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_inputs
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs
 from google.protobuf import text_format
 
 class DownpourSGD(object):
@@ -12,21 +14,24 @@ class DownpourSGD(object):
         self.window_ = window
 
     def minimize(self, loss, startup_program=None,
-                 parameter_list=None, no_grad_set=None,
-                 prefetch_slots=None, prefetch_slots_emb=None):
+                 parameter_list=None, no_grad_set=None):
         params_grads = sorted(append_backward(loss), key=lambda x:x[0].name)
         table_name = find_distributed_lookup_table(loss.block.program)
+        prefetch_slots = find_distributed_lookup_table_inputs(
+            loss.block.program, table_name)
+        prefetch_slots_emb = find_distributed_lookup_table_outputs(
+            loss.block.program, table_name)
         server = DownpourServer()
         worker = DownpourWorker(self.window_)
-        server.add_sparse_table(0, learning_rate,
+        server.add_sparse_table(0, self.learning_rate_,
                                 prefetch_slots, prefetch_slots_emb)
-        server.add_dense_table(1, learning_rate, params, grads)
-        worker.add_sparse_table(0, learning_rate,
+        server.add_dense_table(1, self.learning_rate_, params_grads[0], params_grads[1])
+        worker.add_sparse_table(0, self.learning_rate_,
                                 prefetch_slots, prefetch_slots_emb)
-        worker.add_dense_table(1, learning_rate, params, grads)
+        worker.add_dense_table(1, self.learning_rate_, params_grads[0], params_grads[1])
         ps_param = pslib.PSParameter()
         ps_param.server_param.CopyFrom(server.get_desc())
         #ps_param.worker_param.CopyFrom(worker.get_desc())
         worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
         ps_param_str = text_format.MessageToString(ps_param)
-        return [ps_param_str, worker_skipped_ops]
+        return [ps_param_str, worker_skipped_ops, text_format.MessageToString(worker.get_desc())]
diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py
index 3344bba137e..7c9a76efb69 100644
--- a/python/paddle/fluid/distributed/node.py
+++ b/python/paddle/fluid/distributed/node.py
@@ -16,25 +16,26 @@ class DownpourServer(Server):
         self.server_ = pslib.ServerParameter()
 
     def add_sparse_table(self, table_id, learning_rate,
-                         slot_key, slot_value_var, slot_grad_var):
-        #table = self.server_.downpour_table_param.add()
+                         slot_key_vars, slot_value_var):
         table = self.server_.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
-        table.type = PS_SPARSE_TABLE
+        table.type = pslib.PS_SPARSE_TABLE
         table.accessor.accessor_class = "DownpourFeatureValueAccessor"
         table.accessor.dense_sgd_param.adam.learning_rate = learning_rate
-        table.accessor.fea_dim = slot_value_var[0].shape[1]
+        table.accessor.fea_dim = abs(reduce(lambda x, y: x * y,
+                                            slot_value_var[0].shape, 1))
 
     def add_dense_table(self, table_id, learning_rate,
                         param_var, grad_var):
-        #table = self.server_.downpour_table_param.add()
         table = self.server_.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
-        table.type = PS_DENSE_TABLE
+        table.type = pslib.PS_DENSE_TABLE
         table.accessor.accessor_class = "DownpourDenseValueAccessor"
         table.accessor.sparse_sgd_param.learning_rate = learning_rate
-        table.accessor.fea_dim = 1
-        #table.accessor.fea_dim = reduce(lambda x, y: x.shape, 1 for x in param_var)
+        fea_dim = 0
+        for param in param_var:
+            fea_dim += reduce(lambda x, y: x * y, param.shape, 1)
+        table.accessor.fea_dim = fea_dim
 
     def get_desc(self):
         return self.server_
@@ -43,28 +44,24 @@ class DownpourServer(Server):
 
 class DownpourWorker(Worker):
     def __init__(self, window):
         self.window = window
-        #self.worker_ = pslib.WorkerParameter().downpour_worker_param
-        #self.worker_ = pslib.WorkerParameter()
         self.worker_ = pslib.DownpourTrainerParameter()
-        #self.worker_.pull_dense_per_batch = window
-        #self.worker_.push_dense_per_batch = window
-        #self.worker_.downpour_worker_param.pull_dense_per_batch = window
-        #self.worker_.downpour_worker_param.push_dense_per_batch = window
         self.worker_.pull_dense_per_batch = window
         self.worker_.push_dense_per_batch = window
-        print(self.worker_)
 
-    def add_sparse_table(self, table_id,
-                         slot_keys, slot_value_vars, slot_grad_vars):
-        #table = self.worker_.sparse_table.add()
-        table = self.worker_.downpour_worker_param.sparse_table.add()
+    def add_sparse_table(self, table_id, learning_rate,
+                         slot_key_vars, slot_value_vars):
+        table = self.worker_.sparse_table.add()
         table.table_id = table_id
-        table.slot.extend(slot_keys)
-        self.worker_.extend([grad.name for grad in slot_grad_vars])
+        table.slot_key.extend(
+            [var.name for var in slot_key_vars])
+        table.slot_value.extend(
+            [var.name for var in slot_value_vars])
+        table.slot_gradient.extend(
+            [var.name + "@GRAD" for var in slot_value_vars])
 
-    def add_dense_table(self, table_id, param_vars, grad_vars):
-        #table = self.worker_.dense_table.add()
-        table = self.worker_.downpour_worker_param.dense_table.add()
+    def add_dense_table(self, table_id, learning_rate,
+                        param_vars, grad_vars):
+        table = self.worker_.dense_table.add()
         table.table_id = table_id
         table.dense_variable_name.extend([p.name for p in param_vars])
         table.dense_gradient_variable_name.extend([g.name for g in grad_vars])
-- 
GitLab

From 87eb8b0e28a4b95d8427bd66cbff33f107ad069e Mon Sep 17 00:00:00 2001
From: Michal Gallus
Date: Tue, 4 Dec 2018 14:44:01 +0100
Subject: [PATCH 0042/2367] Set cpu only for MKL-DNN concat UTs

test=develop
---
 python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
index 0ea44c0e4e4..0f2130f9049 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
@@ -22,6 +22,7 @@ class TestMKLDNNConcatOp(TestConcatOp):
     def setUp(self):
         super(TestMKLDNNConcatOp, self).setUp()
         self.attrs["use_mkldnn"] = True
+        self._cpu_only = True
 
     def test_check_grad(self):
         pass
@@ -34,6 +35,7 @@ class TestMKLDNNConcatOp2(TestConcatOp2):
     def setUp(self):
         super(TestMKLDNNConcatOp2, self).setUp()
         self.attrs["use_mkldnn"] = True
+        self._cpu_only = True
 
     def test_check_grad(self):
         pass
@@ -46,6 +48,7 @@ class TestMKLDNNConcatOp3(TestConcatOp3):
     def setUp(self):
         super(TestMKLDNNConcatOp3, self).setUp()
         self.attrs["use_mkldnn"] = True
+        self._cpu_only = True
 
     def test_check_grad(self):
         pass
-- 
GitLab

From 419506f510d258fa858c75a05cdcaa780105deca Mon Sep 17 00:00:00 2001
From: heqiaozhi
Date: Tue, 4 Dec 2018 22:23:01 +0800
Subject: [PATCH 0043/2367] refine for compile pslib.so

---
 cmake/external/pslib.cmake      | 2 +-
 cmake/external/pslib_brpc.cmake | 2 +-
 paddle/fluid/CMakeLists.txt     | 
1 + paddle/fluid/framework/CMakeLists.txt | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 586f66d6fdb..812af5efa20 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -66,7 +66,7 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} ) -ADD_LIBRARY(pslib STATIC IMPORTED GLOBAL) +ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) LIST(APPEND external_project_dependencies pslib) diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake index 7b4beeae65a..92019eef26f 100644 --- a/cmake/external/pslib_brpc.cmake +++ b/cmake/external/pslib_brpc.cmake @@ -41,7 +41,7 @@ SET(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR}) SET(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR}) SET(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include) SET(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib) -SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libps.so) +SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libbrpc.a) SET(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib") diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 6b526f0103a..d980b36d9be 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) +#add_subdirectory(distributed) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8556dcbc36c..6fdc73e93ae 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -180,7 +180,7 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib) +cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) -- GitLab From d0c8b9b9b350f774a7b195bf6c807b90b5f895f9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 12:00:28 +0000 Subject: [PATCH 0044/2367] remove timeout unittest test=develop --- paddle/fluid/framework/tensor.h | 2 +- .../test_eager_deletion_seresnext.py | 27 ------------------- 2 files changed, 1 insertion(+), 28 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 3a4c52410e9..71e8badd4b6 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -153,7 +153,7 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } - void clear() { holder_.reset(); } + void clear() { holder_ = nullptr; } const 
std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py deleted file mode 100644 index 2dcdbdb8f13..00000000000 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest -os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" - -from test_parallel_executor_seresnext import TestResnet - - -class EagerDeletionTestSEResNext(TestResnet): - pass - - -if __name__ == '__main__': - unittest.main() -- GitLab From 06213b798116f7aadb2ab95f83931c10b67a5942 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Tue, 4 Dec 2018 23:06:42 +0800 Subject: [PATCH 0045/2367] add hadoop helper function for distributed training --- python/paddle/fluid/async_executor.py | 9 +- .../paddle/fluid/distribute_lookup_table.py | 6 +- python/paddle/fluid/distributed/downpour.py | 45 +++++-- python/paddle/fluid/distributed/helper.py | 24 ++++ python/paddle/fluid/distributed/node.py | 1 - python/paddle/fluid/distributed/ps_pb2.py | 118 ++++++++++-------- 6 files changed, 134 insertions(+), 69 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 2945e6e1436..c5863eb9e05 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -150,8 +150,13 @@ class AsyncExecutor(object): data_feed.desc(), filelist, thread_num, fetch_var_names, debug) - def config_ps(self, dist_desc, host_sign_list, node_num, index): - self.executor.config_pslib(dist_desc, host_sign_list, node_num, index) + def config_distributed_nodes(self, dist_opt): + # get total rank + # get rank index + # get iplists + # get hadoop info + return + def start_server(self): self.executor.start_server() diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py index a903257fa94..243d806c41a 100644 --- a/python/paddle/fluid/distribute_lookup_table.py +++ b/python/paddle/fluid/distribute_lookup_table.py @@ -21,7 +21,8 @@ def find_distributed_lookup_table_inputs(program, table_name): for op in program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: if table_name == op.input("W")[0]: - inputs.extend([local_vars[name] for name in op.input("Ids")]) + inputs.extend( + [local_vars[name] for name in op.input("Ids")]) return inputs def find_distributed_lookup_table_outputs(program, table_name): @@ -30,7 +31,8 @@ def find_distributed_lookup_table_outputs(program, table_name): for op in program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: if table_name == op.input("W")[0]: - outputs.extend([local_vars[name] for name in op.output("Out")]) + outputs.extend( + [local_vars[name] for name in op.output("Out")]) return outputs def find_distributed_lookup_table(program): 
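Taken together, the async_executor hooks above and the DownpourSGD refactor
in the next diff are meant to be driven roughly as follows. This is an
illustrative sketch only, not part of the patch; the network, dictionary
size, and variable names are made up for the example:

    import paddle.fluid as fluid
    from paddle.fluid.distributed.downpour import DownpourSGD

    ids = fluid.layers.data(name='ids', shape=[1], dtype='int64', lod_level=1)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64',
                              lod_level=1)
    # is_distributed=True is what find_distributed_lookup_table() keys on
    emb = fluid.layers.embedding(input=ids, size=[100000, 9],
                                 is_sparse=True, is_distributed=True)
    pool = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    pred = fluid.layers.fc(input=pool, size=2, act='softmax')
    avg_cost = fluid.layers.mean(
        fluid.layers.cross_entropy(input=pred, label=label))

    opt = DownpourSGD(learning_rate=0.001, window=1)
    # returns the serialized PSParameter plus the ops each trainer must skip
    ps_param_str, worker_skipped_ops = opt.minimize(avg_cost)
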
diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 3fe4afdbffb..093792d5d60 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -8,30 +8,57 @@ from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_o from google.protobuf import text_format class DownpourSGD(object): + """ + Distributed optimizer of downpour stochastic gradient descent + Standard implementation of Google's Downpour SGD + in Large Scale Distributed Deep Networks + + Args: + learning_rate (float): the learning rate used to update parameters. \ + Can be a float value + Examples: + .. code-block:: python + + downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2) + downpour_sgd.minimize(cost) + """ def __init__(self, learning_rate=0.001, window=1): - # todo(guru4elephant): if optimizer is not None, will warning here + # todo(guru4elephant): add more optimizers here as argument + # todo(guru4elephant): make learning_rate as a variable self.learning_rate_ = learning_rate self.window_ = window - + self.type = "downpour" + def minimize(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): - params_grads = sorted(append_backward(loss), key=lambda x:x[0].name) + params_grads = sorted(append_backward( + loss, parameter_list, no_grad_set), key=lambda x:x[0].name) table_name = find_distributed_lookup_table(loss.block.program) prefetch_slots = find_distributed_lookup_table_inputs( loss.block.program, table_name) prefetch_slots_emb = find_distributed_lookup_table_outputs( loss.block.program, table_name) server = DownpourServer() + # window is communication strategy worker = DownpourWorker(self.window_) - server.add_sparse_table(0, self.learning_rate_, + # Todo(guru4elephant): support multiple tables definitions + # currently support one big sparse table + sparse_table_index = 0 + # currently merge all dense parameters into one dense table + dense_table_index = 1 + server.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - server.add_dense_table(1, self.learning_rate_, params_grads[0], params_grads[1]) - worker.add_sparse_table(0, self.learning_rate_, + server.add_dense_table(dense_table_index, self.learning_rate_, + params_grads[0], params_grads[1]) + worker.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - worker.add_dense_table(1, self.learning_rate_, params_grads[0], params_grads[1]) + worker.add_dense_table(dense_table_index, self.learning_rate_, + params_grads[0], params_grads[1]) ps_param = pslib.PSParameter() ps_param.server_param.CopyFrom(server.get_desc()) - #ps_param.worker_param.CopyFrom(worker.get_desc()) + ps_param.worker_param.CopyFrom(worker.get_desc()) + # Todo(guru4elephant): figure out how to support more sparse parameters + # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] ps_param_str = text_format.MessageToString(ps_param) - return [ps_param_str, worker_skipped_ops, text_format.MessageToString(worker.get_desc())] + return [ps_param_str, worker_skipped_ops] diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 8e079b1e8d9..12e2f7f197a 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -1,5 +1,27 @@ from mpi4py import MPI +class FileSystem(object): + def __init__(self, fs_type="afs", + 
uri="afs://tianqi.afs.baidu.com:9902", + user=None, + passwd=None, + hadoop_bin="", + afs_conf=None): + assert user not None + assert passwd not None + assert hadoop_bin not None + fs_client = pslib.FsClientParameter() + if fs_type == "afs": + fs_client.fs_type = pslib.FsApiType.AFS + else: + fs_client.fs_type = pslib.FsApiType.HDFS + fs_client.uri = uri + fs_client.user = user + fs_client.passwd = passwd + fs_client.buffer_size = 0 + fs_client.afs_conf = afs_conf if not afs_conf else "" + + class MPIHelper(object): def __init__(self): self.comm = MPI.COMM_WORLD @@ -18,3 +40,5 @@ class MPIHelper(object): def get_hostname(self): import socket return socket.gethostname() + + diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 7c9a76efb69..b96a15a32fd 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -12,7 +12,6 @@ class Worker(object): class DownpourServer(Server): def __init__(self): - #self.server_ = pslib.ServerParameter().downpour_server_param self.server_ = pslib.ServerParameter() def add_sparse_table(self, table_id, learning_rate, diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py index 355841aba8f..0ef34d6e189 100644 --- a/python/paddle/fluid/distributed/ps_pb2.py +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -20,7 +20,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='ps.proto', package='paddle', syntax='proto2', - serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\xe4\x01\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x02 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x03 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x04 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x05 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\x91\x01\n\x16ServerServiceParameter\x12\x14\n\x0cserver_class\x18\x01 \x01(\t\x12\x14\n\x0c\x63lient_class\x18\x02 \x01(\t\x12\x15\n\rservice_class\x18\x03 \x01(\t\x12\x19\n\x11start_server_port\x18\x04 \x01(\r\x12\x19\n\x11server_thread_num\x18\x05 
\x01(\r\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 
\x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') + serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\x91\x01\n\x16ServerServiceParameter\x12\x14\n\x0cserver_class\x18\x01 \x01(\t\x12\x14\n\x0c\x63lient_class\x18\x02 \x01(\t\x12\x15\n\rservice_class\x18\x03 \x01(\t\x12\x19\n\x11start_server_port\x18\x04 \x01(\r\x12\x19\n\x11server_thread_num\x18\x05 \x01(\r\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 
\x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 
\x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') ) _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -41,8 +41,8 @@ _TABLETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3140, - serialized_end=3192, + serialized_start=3198, + serialized_end=3250, ) _sym_db.RegisterEnumDescriptor(_TABLETYPE) @@ -108,8 +108,8 @@ _PSCMDID = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3195, - serialized_end=3512, + serialized_start=3253, + serialized_end=3570, ) _sym_db.RegisterEnumDescriptor(_PSCMDID) @@ -148,8 +148,8 @@ _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3108, - serialized_end=3138, + serialized_start=3166, + serialized_end=3196, ) _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) @@ -197,7 +197,14 @@ _PSPARAMETER = _descriptor.Descriptor( is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( - name='fs_client_param', full_name='paddle.PSParameter.fs_client_param', index=5, + name='trainer_param', full_name='paddle.PSParameter.trainer_param', index=5, + number=301, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fs_client_param', full_name='paddle.PSParameter.fs_client_param', index=6, number=501, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, @@ -216,7 +223,7 @@ _PSPARAMETER = _descriptor.Descriptor( oneofs=[ ], serialized_start=21, - serialized_end=249, + serialized_end=307, ) @@ -246,8 +253,8 @@ _WORKERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=251, - serialized_end=332, + serialized_start=309, + serialized_end=390, ) @@ -277,8 +284,8 @@ _SERVERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=334, - serialized_end=415, + serialized_start=392, + serialized_end=473, ) @@ -308,8 +315,8 @@ _DOWNPOURWORKERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=417, - serialized_end=496, + serialized_start=475, + serialized_end=554, ) @@ -322,28 +329,28 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( fields=[ _descriptor.FieldDescriptor( name='dense_table', full_name='paddle.DownpourTrainerParameter.dense_table', index=0, - number=2, type=11, cpp_type=10, label=3, + number=1, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, 
options=None), _descriptor.FieldDescriptor( name='sparse_table', full_name='paddle.DownpourTrainerParameter.sparse_table', index=1, - number=3, type=11, cpp_type=10, label=3, + number=2, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='pull_dense_per_batch', full_name='paddle.DownpourTrainerParameter.pull_dense_per_batch', index=2, - number=4, type=5, cpp_type=1, label=1, + number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='push_dense_per_batch', full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', index=3, - number=5, type=5, cpp_type=1, label=1, + number=4, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, @@ -360,8 +367,8 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=499, - serialized_end=687, + serialized_start=557, + serialized_end=745, ) @@ -412,8 +419,8 @@ _DENSETABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=689, - serialized_end=812, + serialized_start=747, + serialized_end=870, ) @@ -471,8 +478,8 @@ _SPARSETABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=814, - serialized_end=936, + serialized_start=872, + serialized_end=994, ) @@ -509,8 +516,8 @@ _DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=939, - serialized_end=1073, + serialized_start=997, + serialized_end=1131, ) @@ -568,8 +575,8 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1076, - serialized_end=1221, + serialized_start=1134, + serialized_end=1279, ) @@ -634,8 +641,8 @@ _TABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1224, - serialized_end=1415, + serialized_start=1282, + serialized_end=1473, ) @@ -714,8 +721,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1418, - serialized_end=1787, + serialized_start=1476, + serialized_end=1845, ) @@ -787,8 +794,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1790, - serialized_end=1996, + serialized_start=1848, + serialized_end=2054, ) @@ -832,8 +839,8 @@ _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1998, - serialized_end=2081, + serialized_start=2056, + serialized_end=2139, ) @@ -891,8 +898,8 @@ _PSREQUESTMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2083, - serialized_end=2184, + serialized_start=2141, + serialized_end=2242, ) @@ -943,8 +950,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2186, - serialized_end=2305, + serialized_start=2244, + serialized_end=2363, ) @@ -1002,8 +1009,8 @@ _DENSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2308, - serialized_end=2533, + serialized_start=2366, + serialized_end=2591, ) @@ -1061,8 +1068,8 @@ _ADAMSGDPARAMETER = 
_descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2536, - serialized_end=2670, + serialized_start=2594, + serialized_end=2728, ) @@ -1099,8 +1106,8 @@ _NAIVESGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2672, - serialized_end=2738, + serialized_start=2730, + serialized_end=2796, ) @@ -1130,8 +1137,8 @@ _SUMMARYSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2740, - serialized_end=2799, + serialized_start=2798, + serialized_end=2857, ) @@ -1161,8 +1168,8 @@ _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2801, - serialized_end=2847, + serialized_start=2859, + serialized_end=2905, ) @@ -1206,8 +1213,8 @@ _PSRESPONSEMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2849, - serialized_end=2922, + serialized_start=2907, + serialized_end=2980, ) @@ -1280,12 +1287,13 @@ _FSCLIENTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2925, - serialized_end=3138, + serialized_start=2983, + serialized_end=3196, ) _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER _PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER +_PSPARAMETER.fields_by_name['trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER _PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER _WORKERPARAMETER.fields_by_name['downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER _SERVERPARAMETER.fields_by_name['downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER -- GitLab From 77236e33fc0d8b72890b700abbe47c838989baaf Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 26 Nov 2018 05:41:51 +0000 Subject: [PATCH 0046/2367] init jitkernel --- paddle/fluid/operators/CMakeLists.txt | 1 + .../fluid/operators/jitkernels/CMakeLists.txt | 17 +++ paddle/fluid/operators/jitkernels/README.md | 1 + .../jitkernels/jitcode/CMakeLists.txt | 3 + .../operators/jitkernels/jitcode/jitcode.cc | 15 ++ .../operators/jitkernels/jitcode/jitcode.h | 54 +++++++ .../operators/jitkernels/jitcode_base.cc | 40 +++++ .../fluid/operators/jitkernels/jitcode_base.h | 73 +++++++++ .../fluid/operators/jitkernels/kernel_base.h | 47 ++++++ .../fluid/operators/jitkernels/kernel_key.h | 49 ++++++ paddle/fluid/operators/jitkernels/kernels.cc | 33 ++++ paddle/fluid/operators/jitkernels/kernels.h | 142 ++++++++++++++++++ .../operators/jitkernels/more/CMakeLists.txt | 7 + .../jitkernels/more/mkl/CMakeLists.txt | 3 + .../operators/jitkernels/more/mkl/mkl.cc | 44 ++++++ .../fluid/operators/jitkernels/more/mkl/mkl.h | 55 +++++++ paddle/fluid/operators/jitkernels/more/more.h | 15 ++ .../operators/jitkernels/refer/CMakeLists.txt | 3 + .../fluid/operators/jitkernels/refer/refer.cc | 20 +++ .../fluid/operators/jitkernels/refer/refer.h | 33 ++++ paddle/fluid/operators/jitkernels/registry.h | 134 +++++++++++++++++ paddle/fluid/operators/jitkernels/test.cc | 36 +++++ paddle/fluid/operators/math/CMakeLists.txt | 16 +- paddle/fluid/platform/cpu_info.h | 2 +- 24 files changed, 834 insertions(+), 9 deletions(-) create mode 100644 paddle/fluid/operators/jitkernels/CMakeLists.txt create mode 100644 paddle/fluid/operators/jitkernels/README.md create mode 100644 paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt create mode 100644 paddle/fluid/operators/jitkernels/jitcode/jitcode.cc create mode 100644 paddle/fluid/operators/jitkernels/jitcode/jitcode.h create 
mode 100644 paddle/fluid/operators/jitkernels/jitcode_base.cc create mode 100644 paddle/fluid/operators/jitkernels/jitcode_base.h create mode 100644 paddle/fluid/operators/jitkernels/kernel_base.h create mode 100644 paddle/fluid/operators/jitkernels/kernel_key.h create mode 100644 paddle/fluid/operators/jitkernels/kernels.cc create mode 100644 paddle/fluid/operators/jitkernels/kernels.h create mode 100644 paddle/fluid/operators/jitkernels/more/CMakeLists.txt create mode 100644 paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt create mode 100644 paddle/fluid/operators/jitkernels/more/mkl/mkl.cc create mode 100644 paddle/fluid/operators/jitkernels/more/mkl/mkl.h create mode 100644 paddle/fluid/operators/jitkernels/more/more.h create mode 100644 paddle/fluid/operators/jitkernels/refer/CMakeLists.txt create mode 100644 paddle/fluid/operators/jitkernels/refer/refer.cc create mode 100644 paddle/fluid/operators/jitkernels/refer/refer.h create mode 100644 paddle/fluid/operators/jitkernels/registry.h create mode 100644 paddle/fluid/operators/jitkernels/test.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8c8dc7026e1..3458df16062 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -16,6 +16,7 @@ add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) +add_subdirectory(jitkernels) if(WITH_DISTRIBUTE) add_subdirectory(distributed) diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jitkernels/CMakeLists.txt new file mode 100644 index 00000000000..f073210542a --- /dev/null +++ b/paddle/fluid/operators/jitkernels/CMakeLists.txt @@ -0,0 +1,17 @@ + +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) + +cc_library(jit_kernel_base SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) + +add_subdirectory(more) +add_subdirectory(refer) + +if(WITH_XBYAK) + add_subdirectory(jitcode) +endif() + +# Debug +message(STATUS "--------${JIT_KERNEL_DEPS}") + +cc_library(jit_kernel SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) +cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/jitkernels/README.md b/paddle/fluid/operators/jitkernels/README.md new file mode 100644 index 00000000000..a0990367ef8 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/README.md @@ -0,0 +1 @@ +TBD diff --git a/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt b/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt new file mode 100644 index 00000000000..1a5e457309e --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt @@ -0,0 +1,3 @@ + +cc_library(jit_kernel_jitcode SRCS jitcode.cc DEPS jit_kernel_base xbyak) +set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc b/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc new file mode 100644 index 00000000000..0dd2d049d2a --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h new file mode 100644 index 00000000000..c1004447664 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/operators/jitkernels/kernels.h" + +#define XBYAK_USE_MMAP_ALLOCATOR +#include "xbyak/xbyak.h" +#include "xbyak/xbyak_util.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace jitcode { + +// Application Binary Interface +constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), + abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), + abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX); + +template +class JitCode : public JitBase, public Xbyak::CodeGenerator { + public: + JitCode(Attr attr, size_t code_size, void* code_ptr = nullptr) + : Xbyak::CodeGenerator(code_size, code_ptr) { + this->genCode(); + } + + virtual const char* name() const = 0; + virtual void genCode() = 0; + + const unsigned char* getCodeInternal() override { + const Xbyak::uint8* code = CodeGenerator::getCode(); + return code; + } +}; + +} // namespace jitcode +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.cc b/paddle/fluid/operators/jitkernels/jitcode_base.cc new file mode 100644 index 00000000000..417c4d4b9e2 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode_base.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/operators/jitkernels/jitcode_base.h" + +DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); + +namespace paddle { +namespace operators { +namespace jitkernels { + +// refer do not need useme, it would be the last one. +void JitBase::dumpCode(const unsigned char* code) const { + if (code) { + static int counter = 0; + std::ostringstream filename; + filename << "paddle_jitcode_" << name() << "." << counter << ".bin"; + counter++; + std::ofstream fout(filename.str(), std::ios::out); + if (fout.is_open()) { + fout.write(reinterpret_cast(code), getSize()); + fout.close(); + } + } +} + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jitkernels/jitcode_base.h new file mode 100644 index 00000000000..0cd6d3c7416 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode_base.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/platform/macros.h" + +DECLARE_bool(dump_jitcode); + +namespace paddle { +namespace operators { +namespace jitkernels { + +// TODO(TJ): make these functions as virtual of a class + +// Every JitCode should estimate the code size itself +template +size_t CodeSize(Attr attr) { + return 4096; +} + +// Every JitCode should have a condition when to use this JitCode +template +bool UseJitCode(Attr attr) { + return false; +} + +// Every JitCode should have a method to get the key from attribution +template +size_t GetKey(Attr attr); + +template <> +size_t GetKey(int d) { + return d; +} + +class JitBase { + public: + JitBase() = default; + virtual ~JitBase() = default; + virtual const char* name() const = 0; + virtual const unsigned char* getCodeInternal() = 0; + + template + const FUNC getCode() { + const unsigned char* code = this->getCodeInternal(); + if (FLAGS_dump_jitcode) { + this->dumpCode(code); + } + return reinterpret_cast(code); + } + DISABLE_COPY_AND_ASSIGN(JitBase); + + protected: + void dumpCode(const unsigned char* code); +}; + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_base.h b/paddle/fluid/operators/jitkernels/kernel_base.h new file mode 100644 index 00000000000..bd95a921c57 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/kernel_base.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { +namespace jitkernels { + +typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; + +// Just for adding to kernel pool without template +class Kernel { + public: + Kernel() = default; + DISABLE_COPY_AND_ASSIGN(Kernel); +}; + +template // TODO(TJ): use tuple +class KernelImpl : public Kernel { + public: + using ELEMENT_TYPE = T; // TODO(TJ): remove me? + KernelImpl() = default; + virtual ~KernelImpl() = default; + + virtual Func GetFunc() { return func; } + virtual bool UseMe(Attr attr) const = 0; + + protected: + Func func{nullptr}; +}; + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_key.h b/paddle/fluid/operators/jitkernels/kernel_key.h new file mode 100644 index 00000000000..e06c2b58dae --- /dev/null +++ b/paddle/fluid/operators/jitkernels/kernel_key.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace jitkernels { + +struct KernelKey { + struct Hash { + size_t operator()(const KernelKey& key) const { + int place = key.place_.which(); // less than 2^8 + int type = static_cast(key.type_) << 8; // less than 2^(32-8) + std::hash hasher; + return hasher(place + type); + } + }; + + KernelType type_; + platform::Place place_; + + KernelKey(KernelType type, platform::Place place) + : type_(type), place_(place) {} + size_t hash_key() const { return Hash()(*this); } + + bool operator==(const KernelKey& o) const { + return platform::places_are_same_class(place_, o.place_) && + type_ == o.type_; + } + bool operator!=(const KernelKey& o) const { return !(*this == o); } +}; + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernels.cc b/paddle/fluid/operators/jitkernels/kernels.cc new file mode 100644 index 00000000000..76f49514ee1 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/kernels.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
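A note on KernelKey::Hash above: place_.which() stays below 2^8 and the KernelType is shifted left by 8 bits, so the addition can never carry into the type bits and behaves like a bitwise OR. A standalone illustration (not part of the patch):

#include <cassert>
#include <functional>

// Mirrors the packing in KernelKey::Hash: the low 8 bits select the
// place, the remaining bits select the kernel type, then std::hash
// mixes the packed value.
size_t PackedKernelHash(int place_index, int kernel_type) {
  assert(place_index < (1 << 8));  // the invariant the comment relies on
  int packed = place_index + (kernel_type << 8);  // same bits as OR here
  return std::hash<int>()(packed);
}
// e.g. place index 0 with KernelType vadd (== 1) packs to 256, and
// place index 1 with vadd packs to 257 -- one distinct key per pair.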
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jitkernels/kernels.h" +#include // for shared_ptr +#include +#include + +namespace paddle { +namespace operators { +namespace jitkernels { + +// refer do not need useme, it would be the last one. + +KernelPool& KernelPool::Instance() { + static KernelPool g_kernel_pool; + return g_kernel_pool; +} + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernels.h b/paddle/fluid/operators/jitkernels/kernels.h new file mode 100644 index 00000000000..2792b897d36 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/kernels.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include // for shared_ptr +#include +#include +#include + +#include "paddle/fluid/operators/jitkernels/jitcode_base.h" +#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jitkernels/kernel_key.h" + +#ifdef PADDLE_WITH_XBYAK +#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" +#endif + +namespace paddle { +namespace operators { +namespace jitkernels { + +template +class JitCodePool { + public: + static JitCodePool& Instance() { + static thread_local JitCodePool g_jit_codes; + return g_jit_codes; + } + + std::shared_ptr Get(size_t key) const { + if (codes_.find(key) == codes_.end()) { + return nullptr; + } + return codes_.at(key); + } + + void Insert(size_t key, const std::shared_ptr& value) { + codes_.insert({key, value}); + } + + private: + JitCodePool() = default; + std::unordered_map> codes_; + + DISABLE_COPY_AND_ASSIGN(JitCodePool); +}; + +// std::tuple +template +struct KernelAttr { + typedef T data_type; + typedef Func return_type; + typedef Attr attr_type; +}; + +class KernelPool { + public: + static KernelPool& Instance(); + + typedef std::unique_ptr KernelPtr; + typedef std::unordered_map, KernelKey::Hash> + KernelMap; + KernelMap& AllKernels() { return pool_; } + + void Insert(const KernelKey& key, KernelPtr value) { + if (pool_.find(key) == pool_.end()) { + pool_.emplace(key, std::vector()); + } + pool_.at(key).emplace_back(std::move(value)); + } + KernelPool() = default; + + private: + KernelMap pool_; + + DISABLE_COPY_AND_ASSIGN(KernelPool); +}; + +// TODO(TJ): create_jitcode; + +// TODO(TJ): make tuple? 
named KernelAttr +template +Func Get(Attr attr) { + size_t key = GetKey(attr); + auto jitcode = JitCodePool().Instance().Get(key); + if (jitcode) { + return jitcode->template getCode(); + } + +#ifdef PADDLE_WITH_XBYAK +// // jitcode::JitCode is under protection of PADDLE_WITH_XBYAK +// if (std::is_same::value) { +// if (UseJitCode(attr)) { +// std::shared_ptr p(std::make_shared>( +// attr, CodeSize(attr))); +// JitCodePool().Instance().Insert(key, p); +// return p->getCode(); +// } +// } +#endif + + // (KernelKey(type, place), vector) + auto& pool = KernelPool().Instance().AllKernels(); + KernelKey kkey(KT, PlaceType()); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto impls = iter->second; + for (auto impl : impls) { + auto i = std::dynamic_pointer_cast>(impl.get()); + if (i && i->UseMe(attr)) { + return i->GetFunc(); + } + } + } + + // The last implementation should be reference function on CPU + // Every kernel should have refer code. + + // because of test refer should have it's own pool + // PADDLE_ENFORCE_GT(list.size(), 1) << "Should have refer implemtation"; + // const auto& refer = KernelRefer().AllKernels(); + // return refer.Get(); + + return nullptr; +} + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/more/CMakeLists.txt b/paddle/fluid/operators/jitkernels/more/CMakeLists.txt new file mode 100644 index 00000000000..84f1811ced2 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/more/CMakeLists.txt @@ -0,0 +1,7 @@ + + +if(WITH_MKLML) + add_subdirectory(mkl) +endif() + +set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt new file mode 100644 index 00000000000..94d2487866b --- /dev/null +++ b/paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt @@ -0,0 +1,3 @@ + +cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) +set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.cc b/paddle/fluid/operators/jitkernels/more/mkl/mkl.cc new file mode 100644 index 00000000000..88a7d661940 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/more/mkl/mkl.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/operators/jitkernels/more/mkl/mkl.h" +#include "paddle/fluid/operators/jitkernels/registry.h" +#include "paddle/fluid/platform/dynload/mklml.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace more { +namespace mkl { + +template <> +void VMul(const float* x, const float* y, float* z, int n) { + platform::dynload::vsMul(n, x, y, z); +} + +template <> +void VMul(const double* x, const double* y, double* z, int n) { + platform::dynload::vdMul(n, x, y, z); +} + +} // namespace mkl +} // namespace more +} // namespace jitkernels +} // namespace operators +} // namespace paddle + +namespace mkl = paddle::operators::jitkernels::more::mkl; + +REGISTER_JITKERNEL_MORE(vmul, mkl, mkl::VMulKernel, + mkl::VMulKernel); diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h new file mode 100644 index 00000000000..7cb4334e503 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace more { +namespace mkl { + +template +void VMul(const T* x, const T* y, T* z, int n); + +// template +// struct VMulTypes{ +// typedef T date_type; +// typedef void (*func)(const T*, const T*, T*, int) func_type; +// typedef int attr_type; +// }; + +template +class VMulKernel + : public KernelImpl { + public: + VMulKernel() { this->func = VMul; } + bool UseMe(int d) const override { + if (std::is_same::value) { + return platform::jit::MayIUse(platform::jit::avx512f) && d > 512; + } else { + return true; + } + } +}; + +} // namespace mkl +} // namespace more +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/more/more.h b/paddle/fluid/operators/jitkernels/more/more.h new file mode 100644 index 00000000000..ab99fdc05f9 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/more/more.h @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once diff --git a/paddle/fluid/operators/jitkernels/refer/CMakeLists.txt b/paddle/fluid/operators/jitkernels/refer/CMakeLists.txt new file mode 100644 index 00000000000..8c116e42dc6 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/refer/CMakeLists.txt @@ -0,0 +1,3 @@ + +cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) +set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) diff --git a/paddle/fluid/operators/jitkernels/refer/refer.cc b/paddle/fluid/operators/jitkernels/refer/refer.cc new file mode 100644 index 00000000000..1f6d384fc2d --- /dev/null +++ b/paddle/fluid/operators/jitkernels/refer/refer.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jitkernels/refer/refer.h" +#include "paddle/fluid/operators/jitkernels/registry.h" + +namespace refer = paddle::operators::jitkernels::refer; + +// REGISTER_JITKERNEL_REFER(vmul, refer::VMul, refer::VMul); diff --git a/paddle/fluid/operators/jitkernels/refer/refer.h b/paddle/fluid/operators/jitkernels/refer/refer.h new file mode 100644 index 00000000000..be55c30b1ed --- /dev/null +++ b/paddle/fluid/operators/jitkernels/refer/refer.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace refer { + +template +void VMul(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +} // namespace refer +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/registry.h b/paddle/fluid/operators/jitkernels/registry.h new file mode 100644 index 00000000000..1d2d47a8047 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/registry.h @@ -0,0 +1,134 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jitkernels/kernels.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace jitkernels { + +// make_unique is supported from c++14 +template +inline std::unique_ptr make_unique(Args&&... args) { + static_assert(!std::is_array::value, "T must not be array"); + return std::unique_ptr(new T(std::forward(args)...)); +} + +template +struct JitKernelRegistrarFunctor; + +template +struct JitKernelRegistrarFunctor { + void operator()(KernelType kt) const {} +}; + +template +struct JitKernelRegistrarFunctor { + using KERNEL_IMPL_TYPE = + typename std::tuple_element>::type; + + void operator()(KernelType kt) const { + KernelKey kkey(kt, PlaceType()); + KernelPool().Instance().Insert( + kkey, std::move(make_unique())); + constexpr auto size = std::tuple_size>::value; + JitKernelRegistrarFunctor + func; + func(kt); + } +}; + +template +class JitKernelRegistrar { + public: + explicit JitKernelRegistrar(KernelType kt) { + JitKernelRegistrarFunctor func; + func(kt); + } +}; + +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +// kernel_type: should be in paddle::operators::jitkernels::KernelType +// place_type: should be one of CPUPlace and GPUPlace in paddle::platform +#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ + "REGISTER_KERNEL_MORE must be called in global namespace"); \ + static ::paddle::operators::jitkernels::JitKernelRegistrar< \ + ::paddle::platform::place_type, __VA_ARGS__> \ + __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##__( \ + ::paddle::operators::jitkernels::KernelType::kernel_type) +// TODO(TJ): Add Touch and use me + +#define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ + REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) + +#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ + REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) + +/* +REGISTER_JITKERNEL_JITCODE(vmul, JitKernelCode); + +// refer must be only one and at least one +REGISTER_JITKERNEL_REFER(vmul, VMul); // Refer need support dtype + +// you can register more implementations and the condition when use it +REGISTER_JITKERNEL_MORE(vmul, mkl::VMUL, UseMe, mkl::VMUL, + UseMe) + +#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +// Register a new pass that can be applied on the IR. 
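The registration sketch above, written out against the macros this header actually defines, would look roughly like the following hypothetical vadd kernel (the names and the d > 16 cutoff are illustrative; the KernelImpl template arguments follow the mkl::VMulKernel usage):

template <typename T>
void VAddPlain(const T* x, const T* y, T* z, int n) {
  for (int i = 0; i < n; ++i) {
    z[i] = x[i] + y[i];
  }
}

template <typename T>
class VAddKernel
    : public KernelImpl<T, void (*)(const T*, const T*, T*, int), int> {
 public:
  VAddKernel() { this->func = VAddPlain<T>; }
  bool UseMe(int d) const override { return d > 16; }  // arbitrary cutoff
};

// Then, in a .cc file at global namespace:
// REGISTER_JITKERNEL_MORE(vadd, plain, VAddKernel<float>, VAddKernel<double>);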
+#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar& \ + __pass_tmp_registrar_##pass_type##__ UNUSED = \ + __pass_registrar_##pass_type##__ + +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ UNUSED = \ + TouchPassRegistrar_##pass_type() +*/ + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/test.cc b/paddle/fluid/operators/jitkernels/test.cc new file mode 100644 index 00000000000..86c6669173c --- /dev/null +++ b/paddle/fluid/operators/jitkernels/test.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include // for memcpy +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/jit_kernel.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" +#include "paddle/fluid/platform/port.h" + +constexpr int repeat = 20000; + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +TEST(JitKernel, vmul) {} + +TEST(JitKernel, pool) {} diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 63363086adb..db91628d94e 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -73,11 +73,11 @@ endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) -set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc) -set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) -if(WITH_XBYAK) - list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) - list(APPEND JIT_KERNEL_DEPS xbyak) -endif() -cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) -cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) +# set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc) +# set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) +# if(WITH_XBYAK) +# list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) +# list(APPEND JIT_KERNEL_DEPS xbyak) +# endif() +# cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) +# cc_test(jit_kernel_test SRCS 
jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index fd31ef77b46..663a9fbf4e1 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -39,7 +39,7 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -namespace jit { +namespace jit { // remove this namespace typedef enum { isa_any, sse42, -- GitLab From 45bfa70cb8a8123cfa5c32ec7323d616f9192e3d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 3 Dec 2018 12:15:16 +0000 Subject: [PATCH 0047/2367] complete vmul jit kernel --- .../fluid/operators/jitkernels/CMakeLists.txt | 12 +- paddle/fluid/operators/jitkernels/README.md | 3 + .../operators/jitkernels/jitcode/jitcode.cc | 23 ++++ .../operators/jitkernels/jitcode/jitcode.h | 7 +- .../fluid/operators/jitkernels/jitcode_base.h | 9 +- .../fluid/operators/jitkernels/kernel_base.h | 13 +- paddle/fluid/operators/jitkernels/kernels.cc | 7 +- paddle/fluid/operators/jitkernels/kernels.h | 110 ++++++++------- .../fluid/operators/jitkernels/refer/refer.cc | 3 +- .../fluid/operators/jitkernels/refer/refer.h | 8 ++ paddle/fluid/operators/jitkernels/registry.h | 126 ++++++++++-------- paddle/fluid/operators/jitkernels/test.cc | 78 ++++++++++- 12 files changed, 273 insertions(+), 126 deletions(-) diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jitkernels/CMakeLists.txt index f073210542a..6392d82e16d 100644 --- a/paddle/fluid/operators/jitkernels/CMakeLists.txt +++ b/paddle/fluid/operators/jitkernels/CMakeLists.txt @@ -1,17 +1,19 @@ +# set(use_jit_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h) +# file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") +# file(APPEND ${pass_file} "\#pragma once\n") +# file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") + + set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) cc_library(jit_kernel_base SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) -add_subdirectory(more) add_subdirectory(refer) - +add_subdirectory(more) if(WITH_XBYAK) add_subdirectory(jitcode) endif() -# Debug -message(STATUS "--------${JIT_KERNEL_DEPS}") - cc_library(jit_kernel SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/jitkernels/README.md b/paddle/fluid/operators/jitkernels/README.md index a0990367ef8..3401e9be531 100644 --- a/paddle/fluid/operators/jitkernels/README.md +++ b/paddle/fluid/operators/jitkernels/README.md @@ -1 +1,4 @@ TBD + +# Use me +Add USE_JIT_KERNEL(yourname) to CMakefile. diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc b/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc index 0dd2d049d2a..8078ace7a84 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc @@ -13,3 +13,26 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" + +namespace paddle { +namespace operators { +namespace jitkernels { + +template <> +size_t GetKey(int d) { + return d; +} + +// template <> +// std::shared_ptr CreateJitCode(int attr) +// { +// if (UseJitCode(attr)) { +// return std::make_shared>(attr, +// CodeSize(attr))); +// } +// return nullptr; +// } + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h index c1004447664..7e0b6442edd 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h @@ -15,6 +15,7 @@ #pragma once #include +#include "paddle/fluid/operators/jitkernels/jitcode_base.h" #include "paddle/fluid/operators/jitkernels/kernels.h" #define XBYAK_USE_MMAP_ALLOCATOR @@ -31,10 +32,10 @@ constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX); -template -class JitCode : public JitBase, public Xbyak::CodeGenerator { +template +class VMulJitCode : public JitBase, public Xbyak::CodeGenerator { public: - JitCode(Attr attr, size_t code_size, void* code_ptr = nullptr) + VMulJitCode(Attr attr, size_t code_size, void* code_ptr = nullptr) : Xbyak::CodeGenerator(code_size, code_ptr) { this->genCode(); } diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jitkernels/jitcode_base.h index 0cd6d3c7416..a164746561e 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.h +++ b/paddle/fluid/operators/jitkernels/jitcode_base.h @@ -15,6 +15,7 @@ #pragma once #include +#include // for shared_ptr #include "paddle/fluid/operators/jitkernels/kernel_base.h" #include "paddle/fluid/platform/macros.h" @@ -42,11 +43,6 @@ bool UseJitCode(Attr attr) { template size_t GetKey(Attr attr); -template <> -size_t GetKey(int d) { - return d; -} - class JitBase { public: JitBase() = default; @@ -68,6 +64,9 @@ class JitBase { void dumpCode(const unsigned char* code); }; +template +std::shared_ptr CreateJitCode(Attr attr); + } // namespace jitkernels } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_base.h b/paddle/fluid/operators/jitkernels/kernel_base.h index bd95a921c57..eeaa0617cb8 100644 --- a/paddle/fluid/operators/jitkernels/kernel_base.h +++ b/paddle/fluid/operators/jitkernels/kernel_base.h @@ -25,6 +25,7 @@ typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; class Kernel { public: Kernel() = default; + virtual ~Kernel() = default; DISABLE_COPY_AND_ASSIGN(Kernel); }; @@ -32,16 +33,20 @@ template // TODO(TJ): use tuple class KernelImpl : public Kernel { public: using ELEMENT_TYPE = T; // TODO(TJ): remove me? 
- KernelImpl() = default; - virtual ~KernelImpl() = default; - - virtual Func GetFunc() { return func; } + virtual Func GetFunc() const { return func; } virtual bool UseMe(Attr attr) const = 0; protected: Func func{nullptr}; }; +template // TODO(TJ): use tuple +class ReferKernel : public KernelImpl { + public: + // Refer code can always be used + bool UseMe(Attr attr) const override { return true; } +}; + } // namespace jitkernels } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernels.cc b/paddle/fluid/operators/jitkernels/kernels.cc index 76f49514ee1..35095220e39 100644 --- a/paddle/fluid/operators/jitkernels/kernels.cc +++ b/paddle/fluid/operators/jitkernels/kernels.cc @@ -21,13 +21,16 @@ namespace paddle { namespace operators { namespace jitkernels { -// refer do not need useme, it would be the last one. - KernelPool& KernelPool::Instance() { static KernelPool g_kernel_pool; return g_kernel_pool; } +ReferKernelPool& ReferKernelPool::Instance() { + static ReferKernelPool g_refer_kernel_pool; + return g_refer_kernel_pool; +} + } // namespace jitkernels } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernels.h b/paddle/fluid/operators/jitkernels/kernels.h index 2792b897d36..866f72cce04 100644 --- a/paddle/fluid/operators/jitkernels/kernels.h +++ b/paddle/fluid/operators/jitkernels/kernels.h @@ -18,22 +18,21 @@ #include #include #include - #include "paddle/fluid/operators/jitkernels/jitcode_base.h" #include "paddle/fluid/operators/jitkernels/kernel_base.h" #include "paddle/fluid/operators/jitkernels/kernel_key.h" - -#ifdef PADDLE_WITH_XBYAK -#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" -#endif +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { namespace jitkernels { +// TODO(TJ): rename file to kernel_pool + template class JitCodePool { public: + JitCodePool() = default; static JitCodePool& Instance() { static thread_local JitCodePool g_jit_codes; return g_jit_codes; @@ -51,13 +50,11 @@ class JitCodePool { } private: - JitCodePool() = default; std::unordered_map> codes_; - DISABLE_COPY_AND_ASSIGN(JitCodePool); }; -// std::tuple +// TODO(TJ): std::tuple template struct KernelAttr { typedef T data_type; @@ -65,76 +62,99 @@ struct KernelAttr { typedef Attr attr_type; }; +typedef std::unique_ptr KernelPtr; +typedef std::unordered_map, KernelKey::Hash> + KernelMap; + class KernelPool { public: static KernelPool& Instance(); - - typedef std::unique_ptr KernelPtr; - typedef std::unordered_map, KernelKey::Hash> - KernelMap; + KernelPool() = default; KernelMap& AllKernels() { return pool_; } - void Insert(const KernelKey& key, KernelPtr value) { if (pool_.find(key) == pool_.end()) { pool_.emplace(key, std::vector()); } pool_.at(key).emplace_back(std::move(value)); } - KernelPool() = default; private: KernelMap pool_; - DISABLE_COPY_AND_ASSIGN(KernelPool); }; -// TODO(TJ): create_jitcode; +// Every kernel should have refer code and it should be used in unit tests, +// so refer kernels should have it's independent kernel pool +class ReferKernelPool { + public: + static ReferKernelPool& Instance(); + ReferKernelPool() = default; + KernelMap& AllKernels() { return pool_; } + void Insert(const KernelKey& key, KernelPtr value) { + if (pool_.find(key) == pool_.end()) { + pool_.emplace(key, std::vector()); + } + pool_.at(key).emplace_back(std::move(value)); + } + + private: + KernelMap pool_; + DISABLE_COPY_AND_ASSIGN(ReferKernelPool); +}; + +// Refer code do not 
related with attr, and always on CPUPlace +template +inline Func GetRefer() { + auto& ref_pool = ReferKernelPool().Instance().AllKernels(); + KernelKey kkey(KT, platform::CPUPlace()); + auto ref_iter = ref_pool.find(kkey); + PADDLE_ENFORCE(ref_iter != ref_pool.end(), + "Every Kernel should have reference function."); + auto& ref_impls = ref_iter->second; + for (auto& impl : ref_impls) { + auto i = dynamic_cast*>(impl.get()); + if (i) { + return i->GetFunc(); + } + } + return nullptr; +} // TODO(TJ): make tuple? named KernelAttr template Func Get(Attr attr) { - size_t key = GetKey(attr); - auto jitcode = JitCodePool().Instance().Get(key); - if (jitcode) { - return jitcode->template getCode(); + // size_t key = GetKey(attr); + // auto jitcode = JitCodePool().Instance().Get(key); + // if (jitcode) { + // return jitcode->template getCode(); + // } + + if (std::is_same::value && + std::is_same::value) { // TODO(TJ): float move to create + // auto p = CreateJitCode(attr); + // if (p) { + // JitCodePool().Instance().Insert(key, p); + // return p->template getCode(); + // } } -#ifdef PADDLE_WITH_XBYAK -// // jitcode::JitCode is under protection of PADDLE_WITH_XBYAK -// if (std::is_same::value) { -// if (UseJitCode(attr)) { -// std::shared_ptr p(std::make_shared>( -// attr, CodeSize(attr))); -// JitCodePool().Instance().Insert(key, p); -// return p->getCode(); -// } -// } -#endif - - // (KernelKey(type, place), vector) + // pool: (KernelKey(type, place), vector) auto& pool = KernelPool().Instance().AllKernels(); KernelKey kkey(KT, PlaceType()); auto iter = pool.find(kkey); if (iter != pool.end()) { - auto impls = iter->second; - for (auto impl : impls) { - auto i = std::dynamic_pointer_cast>(impl.get()); + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast*>(impl.get()); if (i && i->UseMe(attr)) { return i->GetFunc(); } } } - // The last implementation should be reference function on CPU - // Every kernel should have refer code. - - // because of test refer should have it's own pool - // PADDLE_ENFORCE_GT(list.size(), 1) << "Should have refer implemtation"; - // const auto& refer = KernelRefer().AllKernels(); - // return refer.Get(); - - return nullptr; + // The last implementation should be reference function on CPUPlace. + return GetRefer(); } } // namespace jitkernels diff --git a/paddle/fluid/operators/jitkernels/refer/refer.cc b/paddle/fluid/operators/jitkernels/refer/refer.cc index 1f6d384fc2d..dbccac896c5 100644 --- a/paddle/fluid/operators/jitkernels/refer/refer.cc +++ b/paddle/fluid/operators/jitkernels/refer/refer.cc @@ -17,4 +17,5 @@ namespace refer = paddle::operators::jitkernels::refer; -// REGISTER_JITKERNEL_REFER(vmul, refer::VMul, refer::VMul); +REGISTER_JITKERNEL_REFER(vmul, refer::VMulKernel, + refer::VMulKernel); diff --git a/paddle/fluid/operators/jitkernels/refer/refer.h b/paddle/fluid/operators/jitkernels/refer/refer.h index be55c30b1ed..163c6d73dce 100644 --- a/paddle/fluid/operators/jitkernels/refer/refer.h +++ b/paddle/fluid/operators/jitkernels/refer/refer.h @@ -13,6 +13,7 @@ * limitations under the License. 
*/ #pragma once +#include "paddle/fluid/operators/jitkernels/kernel_base.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -27,6 +28,13 @@ void VMul(const T* x, const T* y, T* z, int n) { } } +template +class VMulKernel + : public ReferKernel { + public: + VMulKernel() { this->func = VMul; } +}; + } // namespace refer } // namespace jitkernels } // namespace operators diff --git a/paddle/fluid/operators/jitkernels/registry.h b/paddle/fluid/operators/jitkernels/registry.h index 1d2d47a8047..62a0de36410 100644 --- a/paddle/fluid/operators/jitkernels/registry.h +++ b/paddle/fluid/operators/jitkernels/registry.h @@ -20,6 +20,7 @@ #include "paddle/fluid/operators/jitkernels/kernel_base.h" #include "paddle/fluid/operators/jitkernels/kernels.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/variant.h" // for UNUSED namespace paddle { namespace operators { @@ -32,37 +33,40 @@ inline std::unique_ptr make_unique(Args&&... args) { return std::unique_ptr(new T(std::forward(args)...)); } -template +template struct JitKernelRegistrarFunctor; -template -struct JitKernelRegistrarFunctor { +template +struct JitKernelRegistrarFunctor { void operator()(KernelType kt) const {} }; -template -struct JitKernelRegistrarFunctor { +template +struct JitKernelRegistrarFunctor { using KERNEL_IMPL_TYPE = typename std::tuple_element>::type; void operator()(KernelType kt) const { KernelKey kkey(kt, PlaceType()); - KernelPool().Instance().Insert( - kkey, std::move(make_unique())); + Pool().Instance().Insert(kkey, + std::move(make_unique())); constexpr auto size = std::tuple_size>::value; - JitKernelRegistrarFunctor + JitKernelRegistrarFunctor func; func(kt); } }; -template +template class JitKernelRegistrar { public: explicit JitKernelRegistrar(KernelType kt) { - JitKernelRegistrarFunctor func; + JitKernelRegistrarFunctor func; func(kt); } + void Touch() {} }; #define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ @@ -71,17 +75,40 @@ class JitKernelRegistrar { __test_global_namespace_##uniq_name##__>::value, \ msg) +// Refer always on CPUPlace +#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ + "REGISTER_KERNEL_REFER must be called in global namespace"); \ + static ::paddle::operators::jitkernels::JitKernelRegistrar< \ + ::paddle::operators::jitkernels::ReferKernelPool, \ + ::paddle::platform::CPUPlace, __VA_ARGS__> \ + __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ + ::paddle::operators::jitkernels::KernelType::kernel_type); \ + int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ + __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ + return 0; \ + } + // kernel_type: should be in paddle::operators::jitkernels::KernelType // place_type: should be one of CPUPlace and GPUPlace in paddle::platform -#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ - "REGISTER_KERNEL_MORE must be called in global namespace"); \ - static ::paddle::operators::jitkernels::JitKernelRegistrar< \ - ::paddle::platform::place_type, __VA_ARGS__> \ - __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##__( \ - ::paddle::operators::jitkernels::KernelType::kernel_type) -// TODO(TJ): Add Touch and use me +#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ + "REGISTER_KERNEL_MORE must be called in global namespace"); \ + extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ + UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static ::paddle::operators::jitkernels::JitKernelRegistrar< \ + ::paddle::operators::jitkernels::KernelPool, \ + ::paddle::platform::place_type, __VA_ARGS__> \ + __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ + ::paddle::operators::jitkernels::KernelType::kernel_type); \ + int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ + __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ + .Touch(); \ + return 0; \ + } #define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) @@ -89,45 +116,28 @@ class JitKernelRegistrar { #define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) -/* -REGISTER_JITKERNEL_JITCODE(vmul, JitKernelCode); - -// refer must be only one and at least one -REGISTER_JITKERNEL_REFER(vmul, VMul); // Refer need support dtype - -// you can register more implementations and the condition when use it -REGISTER_JITKERNEL_MORE(vmul, mkl::VMUL, UseMe, mkl::VMUL, - UseMe) - -#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg) \ - struct __test_global_namespace_##uniq_name##__ {}; \ - static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ - __test_global_namespace_##uniq_name##__>::value, \ - msg) - -// Register a new pass that can be applied on the IR. 
-#define REGISTER_PASS(pass_type, pass_class) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __reg_pass__##pass_type, \ - "REGISTER_PASS must be called in global namespace"); \ - static ::paddle::framework::ir::PassRegistrar \ - __pass_registrar_##pass_type##__(#pass_type); \ - int TouchPassRegistrar_##pass_type() { \ - __pass_registrar_##pass_type##__.Touch(); \ - return 0; \ - } \ - static ::paddle::framework::ir::PassRegistrar& \ - __pass_tmp_registrar_##pass_type##__ UNUSED = \ - __pass_registrar_##pass_type##__ - -#define USE_PASS(pass_type) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __use_pass_itself_##pass_type, \ - "USE_PASS must be called in global namespace"); \ - extern int TouchPassRegistrar_##pass_type(); \ - static int use_pass_itself_##pass_type##_ UNUSED = \ - TouchPassRegistrar_##pass_type() -*/ +// REGISTER_JITKERNEL_JITCODE(vmul, JitKernelCode); + +#define USE_JITKERNEL_REFER(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_refer_CPUPlace_, \ + "USE_JITKERNEL_REFER must be called in global namespace"); \ + extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ + TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() + +#define USE_KERNEL_MORE(kernel_type, impl_type, place_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_, \ + "USE_JITKERNEL_MORE must be called in global namespace"); \ + extern int \ + TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ + static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \ + UNUSED = \ + TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() + +#define USE_JITKERNEL_MORE(kernel_type, impl_type) \ + USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace) } // namespace jitkernels } // namespace operators diff --git a/paddle/fluid/operators/jitkernels/test.cc b/paddle/fluid/operators/jitkernels/test.cc index 86c6669173c..d11c7afe9ab 100644 --- a/paddle/fluid/operators/jitkernels/test.cc +++ b/paddle/fluid/operators/jitkernels/test.cc @@ -19,8 +19,11 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/math/jit_kernel.h" -#include "paddle/fluid/operators/math/jit_kernel_refer.h" +#include "paddle/fluid/operators/jitkernels/kernels.h" +// TODO(TJ): remove me +#include "paddle/fluid/operators/jitkernels/registry.h" + +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/port.h" constexpr int repeat = 20000; @@ -31,6 +34,75 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } -TEST(JitKernel, vmul) {} +template +void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), + const T upper = static_cast(20.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +template +void ExpectEQ(const T* target, const T* refer, int n) { + if (std::is_floating_point::value) { + for (int i = 0; i < n; ++i) { + EXPECT_NEAR(target[i], refer[i], 1e-3); + } + } else { + for (int i = 0; i < n; ++i) { + EXPECT_EQ(target[i], refer[i]); + } + } +} + +// TODO(TJ): remove me +USE_JITKERNEL_MORE(vmul, mkl); +USE_JITKERNEL_REFER(vmul); + +TEST(JitKernel, vmul) { + using T = float; + using PlaceType = paddle::platform::CPUPlace; + + namespace jit = 
paddle::operators::jitkernels; + // TODO(TJ): test more vector size + for (int d = 1; d < 30; ++d) { + auto ref = jit::GetRefer(); + auto tgt = jit::Get(d); + EXPECT_TRUE(ref != nullptr); + EXPECT_TRUE(tgt != nullptr); + + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + + tgt(x_data, y_data, ztgt_data, d); + ref(x_data, y_data, zref_data, d); + ExpectEQ(ztgt_data, zref_data, d); + + // test inplace x + std::copy(x.begin(), x.end(), zref.begin()); + std::copy(x.begin(), x.end(), ztgt.begin()); + tgt(ztgt_data, y_data, ztgt_data, d); + ref(zref_data, y_data, zref_data, d); + ExpectEQ(ztgt_data, zref_data, d); + + // test inplace y + std::copy(y.begin(), y.end(), zref.begin()); + std::copy(y.begin(), y.end(), ztgt.begin()); + tgt(x_data, ztgt_data, ztgt_data, d); + ref(x_data, zref_data, zref_data, d); + ExpectEQ(ztgt_data, zref_data, d); + } +} TEST(JitKernel, pool) {} -- GitLab From 9af76ade4c93c60faa7a92f0e720721c6f8c1cc5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 5 Dec 2018 17:58:42 +0800 Subject: [PATCH 0048/2367] fix unused var --- paddle/fluid/operators/reader/ctr_reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 65b300d152f..ca9a58615e0 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -258,7 +258,7 @@ static inline void parse_csv_line( int slot_idx = data_desc.sparse_slot_index_[i]; auto& slot_data = ret[slot_idx]; std::vector data_in_slot_str; - string_split(ret[slot_idx], ',', &data_in_slot_str); + string_split(slot_data, ',', &data_in_slot_str); std::vector data_in_slot; for (auto& data_str : data_in_slot_str) { (*sparse_datas)[i].push_back(std::stol(data_str)); -- GitLab From d3ca359e445884ffca4b147607607517aad4791b Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 5 Dec 2018 19:30:37 +0800 Subject: [PATCH 0049/2367] config init & adapt to interface --- paddle/fluid/framework/async_executor.cc | 55 +++++++++++++++++-- paddle/fluid/framework/async_executor.h | 3 +- .../fluid/framework/executor_thread_worker.cc | 44 ++++++++------- .../fluid/framework/executor_thread_worker.h | 15 +++-- 4 files changed, 85 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 94ed8c2fca4..292b05c5884 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -67,21 +67,63 @@ void PrepareReaders(std::vector>& readers, // NOLINT void AsyncExecutor::ConfigPslib(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index) { _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); - _pslib_ptr->init_and_config(dist_desc, host_sign_list, node_num, index);//TODO + _pslib_ptr->init_and_config(dist_desc, host_sign_list, node_num, index);//TODO done } void AsyncExecutor::StartServer() { + InitParamConfig(); _pslib_ptr->run_server(); } +void AsyncExecutor::InitParamConfig() { + _param_config.fea_dim = _pslib_ptr->get_param()->trainer_param().sparse_table(0).feature_dim(); //TODO + _param_config.slot_dim = _param_config.fea_dim - 2; //TODO + _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().pull_dense_per_batch()); + 
-- GitLab
From d3ca359e445884ffca4b147607607517aad4791b Mon Sep 17 00:00:00 2001
From: heqiaozhi
Date: Wed, 5 Dec 2018 19:30:37 +0800
Subject: [PATCH 0049/2367] config init & adapt to interface
---
 paddle/fluid/framework/async_executor.cc | 55 +++++++++++++++++--
 paddle/fluid/framework/async_executor.h | 3 +-
 .../fluid/framework/executor_thread_worker.cc | 44 ++++++++-------
 .../fluid/framework/executor_thread_worker.h | 15 +++--
 4 files changed, 85 insertions(+), 32 deletions(-)
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
index 94ed8c2fca4..292b05c5884 100644
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -67,21 +67,63 @@ void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers, // NOLINT
 void AsyncExecutor::ConfigPslib(const std::string& dist_desc, std::vector<uint64_t>& host_sign_list, int node_num, int index) {
 _pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(new paddle::distributed::PSlib());
 _pslib_ptr->init_and_config(dist_desc, host_sign_list, node_num, index); //TODO: done
 }
 void AsyncExecutor::StartServer() {
+ InitParamConfig();
 _pslib_ptr->run_server();
 }
+void AsyncExecutor::InitParamConfig() {
+ _param_config.fea_dim = _pslib_ptr->get_param()->trainer_param().sparse_table(0).feature_dim(); //TODO
+ _param_config.slot_dim = _param_config.fea_dim - 2; //TODO
+ _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().pull_dense_per_batch());
+ _param_config.tmp_push_sparse_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_dense_per_batch());
+ //sparse
+ for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) {
+ auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t);
+ std::vector<std::string> tmp_sparse_variable_name;
+ for (int i = 0; i < table.slot_value_size(); ++i) {
+ tmp_sparse_variable_name.push_back(table.slot_value(i));
+ _param_config.slot_alias_to_table[table.slot_value(i)] = table.table_id();
+ }
+ std::vector<std::string> tmp_sparse_gradient_variable_name;
+ for (auto i = 0u; i < table.slot_gradient_size(); ++i) {
+ tmp_sparse_gradient_variable_name.push_back(
+ table.slot_gradient(i));
+ }
+ _param_config.slot_input_vec[table.table_id()] = std::move(tmp_sparse_variable_name);
+ _param_config.gradient_var[table.table_id()] = std::move(tmp_sparse_gradient_variable_name);
+ _param_config.sparse_table_id.push_back(table.table_id());
+ }
+ //dense
+ for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) {
+ auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t);
+ std::vector<std::string> tmp_dense_variable_name;
+ for (int i = 0; i < table.dense_variable_name_size(); ++i) {
+ tmp_dense_variable_name.push_back(table.dense_variable_name(i));
+ }
+ std::vector<std::string> tmp_dense_gradient_variable_name;
+ for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) {
+ tmp_dense_gradient_variable_name.push_back(
+ table.dense_gradient_variable_name(i));
+ }
+ _param_config.dense_variable_name[table.table_id()] = std::move(tmp_dense_variable_name);
+ _param_config.dense_gradient_variable_name[table.table_id()] = std::move(tmp_dense_gradient_variable_name);
+ _param_config.dense_table_id.push_back(table.table_id());
+ _param_config.dense_table_size.push_back(table.fea_dim()); //TODO
+ }
+}
+
 void AsyncExecutor::InitModel() { //TODO only rank = 0 do this
- std::vector all_dense_table_id; //TODO
- all_dense_table_id.push_back(0);
- for (auto table_id: all_dense_table_id) {
+ //std::vector all_dense_table_id; //TODO
+ //all_dense_table_id.push_back(0); //done
+ for (auto table_id: _param_config.dense_table_id) {
 std::vector<paddle::ps::Region> regions;
- std::vector variables; //TODO
- for (auto& t : variables) {
+ //std::vector variables; //TODO
+ for (auto& t : _param_config.dense_variable_name[table_id]) {
 Variable* var = root_scope_->FindVar(t);
 CHECK(var != nullptr) << "var[" << t << "] not found";
 LoDTensor* tensor = var->GetMutable<LoDTensor>();
@@ -131,6 +173,7 @@ void AsyncExecutor::PrepareDenseThread() {
 param.training_thread_num = actual_thread_num;
 param.root_scope = root_scope_;
 //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO
+ param.dense_params = &_param_config.dense_variable_name;
 _pull_dense_thread = std::shared_ptr<DensePullThread>(new DensePullThread(param));
diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h
index 67f4e5deeee..21e4a66fcef 100644
--- a/paddle/fluid/framework/async_executor.h
+++ b/paddle/fluid/framework/async_executor.h
@@ -68,7 +68,7 @@ class AsyncExecutor {
 void StartServer();
 void InitModel();
 void SaveModel(const std::string& path);
-
+ void InitParamConfig();
 private:
 void CreateThreads(ExecutorThreadWorker* worker,
 const ProgramDesc& main_program,
@@ -86,6 +86,7 @@ class AsyncExecutor {
 AsyncWorkerParamConfig _param_config;
 private:
 int actual_thread_num;
+
 };
diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index
19d8818be74..f7c05e400d7 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -382,33 +382,38 @@ void AsyncExecutorThreadWorker::BindingSlotVariableMemory() { } */ } -void AsyncExecutorThreadWorker::SetParamConfig(AsyncWorkerParamConfig* pc) { - _param_config = pc; + +void AsyncExecutorThreadWorker::SetParamConfig(AsyncWorkerParamConfig* param_config) { + _param_config = param_config; } void AsyncExecutorThreadWorker::PrepareParams() { - int table_id = 0; //TODO - PullSparse(table_id); - for (auto& t : _pull_sparse_status) { - t.wait(); - auto status = t.get(); - if (status != 0) { - LOG(ERROR) << "pull sparse failed, status[" << status << "]"; - exit(-1); + //int table_id = 0; //TODO + for (auto table_id: _param_config->sparse_table_id) { + PullSparse(table_id); + for (auto& t : _pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "pull sparse failed, status[" << status << "]"; + exit(-1); + } } } _pull_sparse_status.resize(0); - FillSparse(table_id); + for (auto table_id: _param_config->sparse_table_id) { + FillSparse(table_id); + } } void AsyncExecutorThreadWorker::UpdateParams() { - //for (auto i = 0u; i < GlobalConfig::instance().dense_table_id.size(); ++i) {//TODO - for (int i = 0; i < 1; ++i) { + for (auto i: _param_config->sparse_table_id) {//TODO + //for (int i = 0; i < 1; ++i) { PushSparse(i); } //for (auto i = 0u; i < GlobalConfig::instance().dense_table_id.size(); ++i) {//TODO - for (int i = 1; i < 2; ++i) { + for (auto i: _param_config->dense_table_id) { PushDense(i); } int32_t tmp_push_dense_wait_times = _param_config->tmp_push_dense_wait_times; //TODO @@ -437,14 +442,13 @@ void AsyncExecutorThreadWorker::UpdateParams() { } //for (auto dense_table_id : GlobalConfig::instance().dense_table_id) {//TODO - int dense_table_id = 1; + for (auto dense_table_id: _param_config->dense_table_id) { _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + } //} } void AsyncExecutorThreadWorker::PushDense(int table_id) { - //auto table_id = GlobalConfig::instance().dense_table_id[table_id_index]; TODO - std::vector regions; //auto& variables = GlobalConfig::instance().dense_gradient_variable_name[table_id]; std::vector variables; @@ -529,7 +533,7 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { int64_t* ids = tensor->data(); int len = tensor->numel(); - Variable* var_emb = thread_scope_->FindVar(_param_config->slot_input_vec[slot_idx - 1]); + Variable* var_emb = thread_scope_->FindVar(_param_config->slot_input_vec[table_id][slot_idx - 1]); LoDTensor* tensor_emb = var_emb->GetMutable(); float* ptr = tensor_emb->data(); @@ -575,10 +579,10 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - if (_slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { continue; } - Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[slot_idx - 1]); + Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); LoDTensor* g_tensor = g_var->GetMutable(); //int count = g_tensor->numel(); float* g = g_tensor->data(); diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 63f383cd479..4e3255a590c 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ 
b/paddle/fluid/framework/executor_thread_worker.h @@ -40,8 +40,14 @@ struct AsyncWorkerParamConfig { int32_t tmp_push_dense_wait_times; int32_t tmp_push_sparse_wait_times; - std::vector slot_input_vec; //6048slot 6050slot //name - std::vector gradient_var; //6048slot_embed + std::map> dense_variable_name; + std::map> dense_gradient_variable_name; + std::vector dense_table_id; + std::vector dense_table_size; // fea_dim for each dense table + std::vector sparse_table_id; + std::map> slot_input_vec; //6048slot 6050slot //name + std::map> gradient_var; //6048slot_embed + std::unordered_map slot_alias_to_table; //TODO done }; struct DensePullThreadParam { @@ -148,7 +154,7 @@ class ExecutorThreadWorker { virtual void SetPSlibPtr(std::shared_ptr pslib_ptr); virtual void SetPullDenseThread(std::shared_ptr dpt) {}; virtual void BindingSlotVariableMemory() {}; - virtual void SetParamConfig(AsyncWorkerParamConfig* pc) {}; + virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}; private: void CreateThreadScope(const framework::ProgramDesc& program); void CreateThreadOperators(const framework::ProgramDesc& program); @@ -184,7 +190,7 @@ public: void SetPSlibPtr(std::shared_ptr pslib_ptr); void SetPullDenseThread(std::shared_ptr dpt); void BindingSlotVariableMemory(); - void SetParamConfig(AsyncWorkerParamConfig* pc); + void SetParamConfig(AsyncWorkerParamConfig* param_config); void TrainFiles(); void TrainOneNetwork(); void PrepareParams(); @@ -209,7 +215,6 @@ private: std::map>> _feature_value; std::map>> _feature_push_value; - std::unordered_map _slot_alias_to_table; //TODO std::shared_ptr _pslib_ptr; -- GitLab From 191948c933a15c41315228ffbdc70eb59fdb8f55 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 5 Dec 2018 07:15:08 +0000 Subject: [PATCH 0050/2367] enable jitcode --- .../fluid/operators/jitkernels/CMakeLists.txt | 2 +- .../jitkernels/jitcode/CMakeLists.txt | 4 +- .../operators/jitkernels/jitcode/blas.cc | 118 ++++++++++++++++++ .../fluid/operators/jitkernels/jitcode/blas.h | 88 +++++++++++++ .../operators/jitkernels/jitcode/jitcode.h | 95 ++++++++++++-- .../operators/jitkernels/jitcode_base.cc | 5 +- .../fluid/operators/jitkernels/jitcode_base.h | 19 +-- paddle/fluid/operators/jitkernels/kernels.h | 59 ++++----- paddle/fluid/platform/cpu_info.h | 2 +- 9 files changed, 342 insertions(+), 50 deletions(-) create mode 100644 paddle/fluid/operators/jitkernels/jitcode/blas.cc create mode 100644 paddle/fluid/operators/jitkernels/jitcode/blas.h diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jitkernels/CMakeLists.txt index 6392d82e16d..e82e6c3026f 100644 --- a/paddle/fluid/operators/jitkernels/CMakeLists.txt +++ b/paddle/fluid/operators/jitkernels/CMakeLists.txt @@ -7,7 +7,7 @@ set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) -cc_library(jit_kernel_base SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) +cc_library(jit_kernel_base SRCS kernels.cc jitcode_base.cc DEPS ${JIT_KERNEL_DEPS}) add_subdirectory(refer) add_subdirectory(more) diff --git a/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt b/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt index 1a5e457309e..c678ea33b8e 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt +++ b/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt @@ -1,3 +1,5 @@ -cc_library(jit_kernel_jitcode SRCS jitcode.cc DEPS jit_kernel_base xbyak) +file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") + +cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS 
jit_kernel_base xbyak) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) diff --git a/paddle/fluid/operators/jitkernels/jitcode/blas.cc b/paddle/fluid/operators/jitkernels/jitcode/blas.cc new file mode 100644 index 00000000000..2691bee0fdf --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode/blas.cc @@ -0,0 +1,118 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ +#include "paddle/fluid/operators/jitkernels/jitcode/blas.h" +#include "paddle/fluid/operators/jitkernels/registry.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace jitcode { + +void VXXJitCode::genCode() { + // do not need push stack, and do not need save avx512reg if do not use avx512 + int offset = 0; + if (with_relu_) { + vxorps(ymm_zero, ymm_zero, ymm_zero); + } + if (scalar_index_ == 1) { + vbroadcastss(ymm_src1, ptr[param1]); + } else if (scalar_index_ == 2) { + vbroadcastss(ymm_src2, ptr[param2]); + } + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + if (scalar_index_ != 1) { + vmovups(ymm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(ymm_src2, ptr[param2 + offset]); + } + if (type_ == operand_type::mul) { + vmulps(ymm_dst, ymm_src1, ymm_src2); + } else if (type_ == operand_type::add) { + vaddps(ymm_dst, ymm_src1, ymm_src2); + } + if (with_relu_) { + vmaxps(ymm_dst, ymm_zero, ymm_dst); + } + vmovups(ptr[param3 + offset], ymm_dst); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + int rest = num_ % YMM_FLOAT_BLOCK; + while (rest > 0) { + int block = XMM_FLOAT_BLOCK; + if (rest >= 4) { + block = 4; + if (scalar_index_ != 1) { + vmovups(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(xmm_src2, ptr[param2 + offset]); + } + } else if (rest >= 2) { + block = 2; + if (scalar_index_ != 1) { + vmovq(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovq(xmm_src2, ptr[param2 + offset]); + } + } else { + block = 1; + if (scalar_index_ != 1) { + vmovss(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovss(xmm_src2, ptr[param2 + offset]); + } + } + switch (type_) { + case operand_type::mul: + vmulps(xmm_dst, xmm_src1, xmm_src2); + break; + case operand_type::add: + vaddps(xmm_dst, xmm_src1, xmm_src2); + break; + default: + break; + } + if (with_relu_) { + vmaxps(xmm_dst, xmm_zero, xmm_dst); + } + if (rest >= 4) { + vmovups(ptr[param3 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param3 + offset], xmm_dst); + } else { + vmovss(ptr[param3 + offset], xmm_dst); + } + offset += sizeof(float) * block; + rest -= block; + } + ret(); +} + +} // namespace jitcode + +template <> +std::unique_ptr CreateJitCode(int attr) { + if (UseJitCode(attr)) { + return make_unique( + attr, CodeSize(attr)); + } + return nullptr; +} + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/blas.h 
b/paddle/fluid/operators/jitkernels/jitcode/blas.h
new file mode 100644
index 00000000000..a1aca97723e
--- /dev/null
+++ b/paddle/fluid/operators/jitkernels/jitcode/blas.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h"
+
+namespace paddle {
+namespace operators {
+namespace jitkernels {
+namespace jitcode {
+
+// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
+class VXXJitCode : public JitCode {
+ public:
+ explicit VXXJitCode(int d, operand_type type, int scalar_index,
+ bool with_relu, size_t code_size = 256 * 1024,
+ void* code_ptr = nullptr)
+ : JitCode(code_size, code_ptr),
+ num_(d),
+ type_(type),
+ scalar_index_(scalar_index),
+ with_relu_(with_relu) {
+ // Build the kernel name once and keep it in a member; returning
+ // c_str() of a function-local std::string would hand out a dangling
+ // pointer as soon as name() returned.
+ name_ = "VXXJitCode";
+ name_ += (scalar_index_ == 1) ? "_Scalar" : "_Vec";
+ if (type_ == operand_type::mul) {
+ name_ += "_Mul";
+ } else if (type_ == operand_type::add) {
+ name_ += "_Add";
+ }
+ name_ += (scalar_index_ == 2) ? "_Scalar" : "_Vec";
+ name_ += (with_relu_ ? "_Relu" : "");
+ }
+ const char* name() const override { return name_.c_str(); }
+ // static bool init(int d, int scalar_index = 0);
+ void genCode() override;
+
+ private:
+ std::string name_;
+ int num_;
+ operand_type type_;
+ int scalar_index_;
+ bool with_relu_;
+ reg64_t param1{abi_param1};
+ reg64_t param2{abi_param2};
+ reg64_t param3{abi_param3};
+
+ xmm_t xmm_src1 = xmm_t(0);
+ xmm_t xmm_src2 = xmm_t(1);
+ xmm_t xmm_dst = xmm_t(2);
+ xmm_t xmm_zero = xmm_t(3);
+
+ ymm_t ymm_src1 = ymm_t(0);
+ ymm_t ymm_src2 = ymm_t(1);
+ ymm_t ymm_dst = ymm_t(2);
+ ymm_t ymm_zero = ymm_t(3);
+};
+
+class VMulJitCode : public VXXJitCode {
+ public:
+ explicit VMulJitCode(int d, size_t code_size, void* code_ptr = nullptr)
+ : VXXJitCode(d, operand_type::mul, 0, false, code_size, code_ptr) {}
+};
+
+} // namespace jitcode
+} // namespace jitkernels
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h
index 7e0b6442edd..a3582e5284c 100644
--- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h
+++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h
@@ -16,7 +16,7 @@
 #include
 #include "paddle/fluid/operators/jitkernels/jitcode_base.h"
-#include "paddle/fluid/operators/jitkernels/kernels.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #define XBYAK_USE_MMAP_ALLOCATOR
 #include "xbyak/xbyak.h"
@@ -30,23 +30,102 @@ namespace jitcode {
 // Application Binary Interface
 constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
 abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
- abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX);
+ abi_param4(Xbyak::Operand::RCX);
-template
-class VMulJitCode : public JitBase, public Xbyak::CodeGenerator {
+constexpr Xbyak::Operand::Code
g_abi_regs[] = { + Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12, + Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15}; + +constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]); + +using reg64_t = const Xbyak::Reg64; +using reg32_t = const Xbyak::Reg32; +using xmm_t = const Xbyak::Xmm; +using ymm_t = const Xbyak::Ymm; +using zmm_t = const Xbyak::Zmm; +using Label = Xbyak::Label; + +typedef enum { + mul = 0, + add, + sub, + relu, + exp, + sigmoid, + tanh, + identity +} operand_type; + +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +#define DECLARE_JIT_CODE(codename) \ + const char* name() const override { return #codename; } + +class JitCode : public JitBase, public Xbyak::CodeGenerator { public: - VMulJitCode(Attr attr, size_t code_size, void* code_ptr = nullptr) + explicit JitCode(size_t code_size, void* code_ptr = nullptr) : Xbyak::CodeGenerator(code_size, code_ptr) { this->genCode(); } - virtual const char* name() const = 0; - virtual void genCode() = 0; - + size_t getSize() const override { return CodeGenerator::getSize(); } const unsigned char* getCodeInternal() override { const Xbyak::uint8* code = CodeGenerator::getCode(); return code; } + + virtual const char* name() const = 0; + virtual void genCode() = 0; + + protected: + Xbyak::Reg64 param1{abi_param1}; + const int EVEX_max_8b_offt = 0x200; + const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp; + + virtual void preCode() { + for (int i = 0; i < num_g_abi_regs; ++i) { + push(Xbyak::Reg64(g_abi_regs[i])); + } + if (platform::jit::MayIUse(platform::jit::avx512f)) { + mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); + } + } + virtual void postCode() { + for (int i = 0; i < num_g_abi_regs; ++i) { + pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i])); + } + ret(); + } + void L(const char* label) { Xbyak::CodeGenerator::L(label); } + void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } + // Enhanced vector extension + Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt, + bool bcast = false) { + int scale = 0; + // Learn from https://github.com/intel/mkl-dnn + if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) { + offt = offt - 2 * EVEX_max_8b_offt; + scale = 1; + } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) { + offt = offt - 4 * EVEX_max_8b_offt; + scale = 2; + } + auto re = Xbyak::RegExp() + base + offt; + if (scale) { + re = re + reg_EVEX_max_8b_offt * scale; + } + if (bcast) { + return zword_b[re]; + } else { + return zword[re]; + } + } }; } // namespace jitcode diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.cc b/paddle/fluid/operators/jitkernels/jitcode_base.cc index 417c4d4b9e2..1da2af51f41 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.cc +++ b/paddle/fluid/operators/jitkernels/jitcode_base.cc @@ -13,6 +13,9 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jitkernels/jitcode_base.h" +#include +#include +#include DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); @@ -29,7 +32,7 @@ void JitBase::dumpCode(const unsigned char* code) const { counter++; std::ofstream fout(filename.str(), std::ios::out); if (fout.is_open()) { - fout.write(reinterpret_cast(code), getSize()); + fout.write(reinterpret_cast(code), this->getSize()); fout.close(); } } diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jitkernels/jitcode_base.h index a164746561e..ffec62163a7 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.h +++ b/paddle/fluid/operators/jitkernels/jitcode_base.h @@ -28,7 +28,7 @@ namespace jitkernels { // TODO(TJ): make these functions as virtual of a class // Every JitCode should estimate the code size itself -template +template size_t CodeSize(Attr attr) { return 4096; } @@ -43,13 +43,11 @@ bool UseJitCode(Attr attr) { template size_t GetKey(Attr attr); -class JitBase { +class JitBase : public Kernel { public: - JitBase() = default; - virtual ~JitBase() = default; virtual const char* name() const = 0; virtual const unsigned char* getCodeInternal() = 0; - + virtual size_t getSize() const = 0; template const FUNC getCode() { const unsigned char* code = this->getCodeInternal(); @@ -58,14 +56,17 @@ class JitBase { } return reinterpret_cast(code); } - DISABLE_COPY_AND_ASSIGN(JitBase); protected: - void dumpCode(const unsigned char* code); + void dumpCode(const unsigned char* code) const; }; -template -std::shared_ptr CreateJitCode(Attr attr); +template +std::unique_ptr CreateJitCode(Attr attr); //{ +// if (UseJitCode) { +// return make_unique(attr, CodeSize()); +// } +// } } // namespace jitkernels } // namespace operators diff --git a/paddle/fluid/operators/jitkernels/kernels.h b/paddle/fluid/operators/jitkernels/kernels.h index 866f72cce04..f398093dfe2 100644 --- a/paddle/fluid/operators/jitkernels/kernels.h +++ b/paddle/fluid/operators/jitkernels/kernels.h @@ -31,6 +31,9 @@ namespace jitkernels { template class JitCodePool { + typedef std::unique_ptr JitBasePtr; + typedef std::unordered_map JitBaseMap; + public: JitCodePool() = default; static JitCodePool& Instance() { @@ -38,29 +41,26 @@ class JitCodePool { return g_jit_codes; } - std::shared_ptr Get(size_t key) const { - if (codes_.find(key) == codes_.end()) { - return nullptr; - } - return codes_.at(key); - } + const JitBaseMap& AllKernels() { return codes_; } + + bool Has(size_t key) const { return codes_.find(key) != codes_.end(); } - void Insert(size_t key, const std::shared_ptr& value) { - codes_.insert({key, value}); + void Insert(size_t key, JitBasePtr value) { + codes_.emplace(key, std::move(value)); } private: - std::unordered_map> codes_; + JitBaseMap codes_; DISABLE_COPY_AND_ASSIGN(JitCodePool); }; // TODO(TJ): std::tuple -template -struct KernelAttr { - typedef T data_type; - typedef Func return_type; - typedef Attr attr_type; -}; +// template +// struct KernelAttr { +// typedef T data_type; +// typedef Func return_type; +// typedef Attr attr_type; +// }; typedef std::unique_ptr KernelPtr; typedef std::unordered_map, KernelKey::Hash> @@ -123,20 +123,21 @@ inline Func GetRefer() { // TODO(TJ): make tuple? 
named KernelAttr template -Func Get(Attr attr) { - // size_t key = GetKey(attr); - // auto jitcode = JitCodePool().Instance().Get(key); - // if (jitcode) { - // return jitcode->template getCode(); - // } - - if (std::is_same::value && - std::is_same::value) { // TODO(TJ): float move to create - // auto p = CreateJitCode(attr); - // if (p) { - // JitCodePool().Instance().Insert(key, p); - // return p->template getCode(); - // } +const Func Get(Attr attr) { + size_t key = GetKey(attr); + auto& codes = JitCodePool().Instance(); + if (codes.Has(key)) { + return codes.AllKernels().at(key)->template getCode(); + } + + if (std::is_same::value) { // TODO(TJ): float + // move to create + auto p = CreateJitCode(attr); + if (p) { + auto f = p->template getCode(); + codes.Insert(key, std::move(p)); + return f; + } } // pool: (KernelKey(type, place), vector) diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 663a9fbf4e1..fd31ef77b46 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -39,7 +39,7 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -namespace jit { // remove this namespace +namespace jit { typedef enum { isa_any, sse42, -- GitLab From b523787f9ffb0a97b1dda1c445f04e7490b7c4f5 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 5 Dec 2018 12:07:03 +0000 Subject: [PATCH 0051/2367] remove jit namespace test=develop --- paddle/fluid/operators/attention_lstm_op.cc | 16 +- .../fused/fused_embedding_fc_lstm_op.cc | 6 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 6 +- paddle/fluid/operators/math/cpu_vec.h | 148 +++++++++--------- paddle/fluid/operators/math/cpu_vec_test.cc | 54 ++++--- paddle/fluid/operators/math/jit_code.cc | 2 +- paddle/fluid/operators/math/jit_code.h | 2 +- paddle/fluid/operators/math/jit_gen.cc | 2 +- paddle/fluid/operators/math/jit_kernel.cc | 2 - .../fluid/operators/math/jit_kernel_blas.cc | 3 +- .../operators/math/jit_kernel_crf_decode.cc | 24 ++- paddle/fluid/operators/math/jit_kernel_exp.cc | 1 - .../operators/math/jit_kernel_layer_norm.cc | 22 ++- .../fluid/operators/math/jit_kernel_macro.h | 37 +++-- .../fluid/operators/math/jit_kernel_test.cc | 2 +- paddle/fluid/platform/cpu_info.cc | 2 - paddle/fluid/platform/cpu_info.h | 3 - paddle/fluid/platform/init.cc | 14 +- 18 files changed, 167 insertions(+), 179 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 9b943440a86..75fc59125f2 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM. template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { if (bias) { - math::vec_add_bias(n, *bias, x, y); - math::vec_relu(n, y, y); + math::vec_add_bias(n, *bias, x, y); + math::vec_relu(n, y, y); } else { - math::vec_relu(n, x, y); + math::vec_relu(n, x, y); } } @@ -245,8 +245,8 @@ inline void vec_softmax(const int n, const T* x, T* y) { for (int i = 1; i < n; ++i) { scalar = scalar < x[i] ? 
x[i] : scalar; } - math::vec_add_bias(n, -scalar, x, y); // sub - math::vec_exp(n, y, y); // exp + math::vec_add_bias(n, -scalar, x, y); // sub + math::vec_exp(n, y, y); // exp // sum scalar = T(0); for (int i = 0; i < n; ++i) { @@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel { auto& act_gate_str = ctx.Attr("gate_activation"); auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); } else { - math::VecActivations act_functor; + math::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 6d463538d23..1eb6523a2df 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto& act_gate_str = ctx.Attr("gate_activation"); \ auto& act_cell_str = ctx.Attr("cell_activation"); \ auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::jit::MayIUse(platform::jit::avx)) { \ - math::VecActivations act_functor; \ + if (platform::MayIUse(platform::avx)) { \ + math::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ } else { \ - math::VecActivations act_functor; \ + math::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 288b56fc248..17ed9771d07 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { std::function fc_act; auto& fc_act_str = ctx.Attr("fc_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; fc_act = act_functor(fc_act_str); } else { - math::VecActivations act_functor; + math::VecActivations act_functor; fc_act = act_functor(fc_act_str); } diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 7d81aee5969..e1e4d168db3 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -77,7 +77,7 @@ inline void vec_scal(const int n, const double a, double* x) { #endif // MKL scal only support inplace, choose this if src and dst are not equal -template +template inline void vec_scal(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = a * x[i]; @@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { #ifdef 
__AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); return; } const int rest = n % block; @@ -114,24 +114,24 @@ inline void vec_scal(const int n, const float a, y[i] = a * x[i]; } #else - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); #endif } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { - vec_scal(n, a, x, y); +inline void vec_scal(const int n, const float a, + const float* x, float* y) { + vec_scal(n, a, x, y); } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); } -template +template inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = a - x[i]; @@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_bias_sub(const int n, const float a, - const float* x, float* y) { +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); return; } const int rest = n % block; @@ -168,27 +168,25 @@ inline void vec_bias_sub(const int n, const float a, y[i] = a - x[i]; } #else - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); #endif } template <> -inline void vec_bias_sub(const int n, const float a, - const float* x, float* y) { - vec_bias_sub(n, a, x, y); +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { + vec_bias_sub(n, a, x, y); } template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); } // out = x*y + (1-x)*z -template +template inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { for (int i = 0; i < n; ++i) { out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; @@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { } template <> -inline void vec_cross(const int n, const float* x, - const float* y, const float* z, - float* out) { +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); return; } const int rest = n % block; @@ -228,25 +226,26 @@ inline void vec_cross(const int n, const float* x, out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; } #else - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); #endif } template <> -inline void vec_cross(const int n, const float* x, - const float* y, - const float* z, float* out) { - vec_cross(n, x, y, z, out); +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { + vec_cross(n, x, y, z, out); } template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { // TODO(TJ): enable me - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); } -template +template inline void vec_add_bias(const int n, const T a, 
const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] + a; @@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_add_bias(const int n, const float a, - const float* x, float* y) { +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); return; } const int rest = n % block; @@ -283,32 +282,30 @@ inline void vec_add_bias(const int n, const float a, y[i] = x[i] + a; } #else - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); #endif } template <> -inline void vec_add_bias(const int n, const float a, - const float* x, float* y) { - vec_add_bias(n, a, x, y); +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { + vec_add_bias(n, a, x, y); } template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); } -template +template inline void vec_identity(const int n, const T* x, T* y) { // do nothing return; } -template +template inline void vec_sigmoid(const int n, const T* x, T* y) { const T min = SIGMOID_THRESHOLD_MIN; const T max = SIGMOID_THRESHOLD_MAX; @@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) { } template <> -inline void vec_sigmoid(const int n, const float* x, - float* y) { +inline void vec_sigmoid(const int n, const float* x, + float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); return; } const int rest = n % block; @@ -377,25 +374,24 @@ inline void vec_sigmoid(const int n, const float* x, y[i] = 1.f / (1.f + y[i]); } #else - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); #endif } template <> -inline void vec_sigmoid(const int n, const float* x, - float* y) { - vec_sigmoid(n, x, y); +inline void vec_sigmoid(const int n, const float* x, + float* y) { + vec_sigmoid(n, x, y); } template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { +inline void vec_sigmoid(const int n, const float* x, + float* y) { // TODO(TJ): enable me - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); } -template +template inline void vec_tanh(const int n, const T* x, T* y) { vec_scal(n, static_cast(2), x, y); vec_sigmoid(n, y, y); @@ -404,7 +400,7 @@ inline void vec_tanh(const int n, const T* x, T* y) { } // TODO(TJ): make relu clip -template +template inline void vec_relu(const int n, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] > 0 ? 
x[i] : 0; @@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) { } template <> -inline void vec_relu(const int n, const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block * 4) { - vec_relu(n, x, y); + vec_relu(n, x, y); return; } @@ -441,26 +437,26 @@ inline void vec_relu(const int n, const float* x, #undef MOVE_ONE_STEP #else - vec_relu(n, x, y); + vec_relu(n, x, y); #endif } template <> -inline void vec_relu(const int n, const float* x, - float* y) { - vec_relu(n, x, y); +inline void vec_relu(const int n, const float* x, + float* y) { + vec_relu(n, x, y); } template <> -inline void vec_relu(const int n, const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { // TODO(TJ): enable me - vec_relu(n, x, y); + vec_relu(n, x, y); } // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary -template +template class VecActivations { public: std::function operator()( diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index c37fa291a25..28eb9cadc9d 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function tgt, } TEST(CpuVecTest, sigmoid) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + TestAndBench(sz, vec_sigmoid, ref_sigmoid); } TestAndBench(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, tanh) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, + ref_tanh); } TestAndBench(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, relu) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, + ref_relu); } TestAndBench(30, vec_relu, ref_relu); } @@ -162,38 +166,40 @@ void TestInplace(const int n, std::function tgt, } TEST(CpuVecTest, inplace_sigmoid) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + TestInplace(sz, vec_sigmoid, + 
ref_sigmoid); + TestInplace(sz, vec_sigmoid, ref_sigmoid); } TestInplace(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, inplace_tanh) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); } TestInplace(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, inplace_relu) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); } TestInplace(30, vec_relu, ref_relu); } diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 52cbdf685de..78d0c3e8808 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -22,7 +22,7 @@ namespace math { namespace jitkernel { namespace gen { -using namespace platform::jit; // NOLINT +using namespace platform; // NOLINT bool VXXJitCode::init(int d, int scalar_index) { // It's not necessary to use avx512 since it would slow down the frequency diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index a9214621295..e2b47614355 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -179,7 +179,7 @@ class VActJitCode : public JitCode { template void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { - using namespace platform::jit; // NOLINT + using namespace platform; // NOLINT // check all idx can not equal JMM jmm_src = JMM(src_idx); JMM jmm_fx = JMM(fx_idx); diff --git a/paddle/fluid/operators/math/jit_gen.cc b/paddle/fluid/operators/math/jit_gen.cc index 6af39518ed9..5c6672928e8 100644 --- a/paddle/fluid/operators/math/jit_gen.cc +++ b/paddle/fluid/operators/math/jit_gen.cc @@ -36,7 +36,7 @@ void JitCode::preCode() { for (int i = 0; i < num_g_abi_regs; ++i) { push(Xbyak::Reg64(g_abi_regs[i])); } - if (platform::jit::MayIUse(platform::jit::avx512f)) { + if (platform::MayIUse(platform::avx512f)) { mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); } } diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc index 68b708b3453..118696ba479 100644 --- a/paddle/fluid/operators/math/jit_kernel.cc +++ b/paddle/fluid/operators/math/jit_kernel.cc @@ -21,8 +21,6 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - KernelPool& KernelPool::Instance() { static thread_local KernelPool g_jit_kernels; return g_jit_kernels; diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index a0f93fd8e7e..8cf588efba5 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -30,7 +30,6 @@ namespace paddle { namespace operators { namespace math { namespace 
jitkernel { -namespace jit = platform::jit; #ifdef PADDLE_WITH_MKLML template @@ -125,7 +124,7 @@ bool VMulKernelImpl::useJIT(int d) { #ifdef PADDLE_WITH_MKLML template <> bool VMulKernelImpl::useMKL(int d) { - return jit::MayIUse(jit::avx512f) && d > 512; + return platform::MayIUse(platform::avx512f) && d > 512; } template <> diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index 4d26b819482..eeb305a88be 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -25,10 +25,8 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - /* CRF Decode JitKernel */ -template +template class CRFDecodeKernelImpl : public CRFDecodeKernel { public: explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel() { @@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { #define INTRIAVX_FLOAT(block) \ template <> \ - CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ @@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ } \ template <> \ - void CRFDecodeKernelImpl::Compute( \ + void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ INIT_ALPHA(YMM_FLOAT_BLOCK) \ @@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { #define INTRIAVX512_FLOAT(block) \ template <> \ - CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ @@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \ } \ template <> \ - void CRFDecodeKernelImpl::Compute( \ + void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ INIT_ALPHA(ZMM_FLOAT_BLOCK) \ @@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16); INTRIAVX_FLOAT(kGT16); #endif #ifdef __AVX2__ -INTRIAVX2_FLOAT(jit::avx2, kEQ8); -INTRIAVX2_FLOAT(jit::avx2, kGT8LT16); -INTRIAVX2_FLOAT(jit::avx2, kEQ16); -INTRIAVX2_FLOAT(jit::avx2, kGT16); +INTRIAVX2_FLOAT(platform::avx2, kEQ8); +INTRIAVX2_FLOAT(platform::avx2, kGT8LT16); +INTRIAVX2_FLOAT(platform::avx2, kEQ16); +INTRIAVX2_FLOAT(platform::avx2, kGT16); #endif #ifdef __AVX512F__ -INTRIAVX2_FLOAT(jit::avx512f, kEQ8); -INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16); +INTRIAVX2_FLOAT(platform::avx512f, kEQ8); +INTRIAVX2_FLOAT(platform::avx512f, kGT8LT16); INTRIAVX512_FLOAT(kEQ16); INTRIAVX512_FLOAT(kGT16); #endif diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index 686f3dd9836..7945cfb253a 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -29,7 +29,6 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; #ifdef PADDLE_WITH_MKLML // try to use MKL to speedup diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc index 49904e6e8c7..fead13ebadc 100644 --- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc +++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc @@ -22,10 +22,8 @@ namespace operators { namespace math { namespace 
jitkernel { -namespace jit = platform::jit; - /* Layer Norm JitKernel */ -template +template class LayerNormKernelImpl : public LayerNormKernel { public: explicit LayerNormKernelImpl(int right) : LayerNormKernel() { @@ -90,7 +88,7 @@ class LayerNormKernelImpl : public LayerNormKernel { this->end_ = this->num_ - this->rest_; \ } \ template <> \ - void LayerNormKernelImpl::Compute( \ + void LayerNormKernelImpl::Compute( \ float* x, float* out, float* mean, float* var, const float* scale, \ const float* bias, int height, const float epsilon) const { \ __m256 sum; \ @@ -219,16 +217,16 @@ class LayerNormKernelImpl : public LayerNormKernel { } #ifdef __AVX__ -INTRIAVX_FLOAT(jit::avx, kEQ8); -INTRIAVX_FLOAT(jit::avx, kGT8LT16); -INTRIAVX_FLOAT(jit::avx, kEQ16); -INTRIAVX_FLOAT(jit::avx, kGT16); +INTRIAVX_FLOAT(platform::avx, kEQ8); +INTRIAVX_FLOAT(platform::avx, kGT8LT16); +INTRIAVX_FLOAT(platform::avx, kEQ16); +INTRIAVX_FLOAT(platform::avx, kGT16); #endif #ifdef __AVX2__ -INTRIAVX_FLOAT(jit::avx2, kEQ8); -INTRIAVX_FLOAT(jit::avx2, kGT8LT16); -INTRIAVX_FLOAT(jit::avx2, kEQ16); -INTRIAVX_FLOAT(jit::avx2, kGT16); +INTRIAVX_FLOAT(platform::avx2, kEQ8); +INTRIAVX_FLOAT(platform::avx2, kGT8LT16); +INTRIAVX_FLOAT(platform::avx2, kEQ16); +INTRIAVX_FLOAT(platform::avx2, kGT16); #endif #undef INTRIAVX_FLOAT diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index 5a3efd979f8..4dba3b56810 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -92,7 +92,6 @@ namespace jitkernel { JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \ JITKERNEL_IMPL) -namespace jit = platform::jit; // TODO(TJ): below defines are deprecated, would be remove recently #define SEARCH_BLOCK(macro_, ker, dtype, isa) \ if (d < YMM_FLOAT_BLOCK) { \ @@ -107,15 +106,15 @@ namespace jit = platform::jit; macro_(ker, dtype, isa, kGT16); \ } -#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ - if (jit::MayIUse(jit::avx512f)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \ - } else if (jit::MayIUse(jit::avx2)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \ - } else if (jit::MayIUse(jit::avx)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \ - } else { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \ +#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ + if (platform::MayIUse(platform::avx512f)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \ + } else if (platform::MayIUse(platform::avx2)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx2); \ + } else if (platform::MayIUse(platform::avx)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx); \ + } else { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \ } #define JITKERNEL_KEY(ker_key, dtype_key) \ @@ -156,10 +155,10 @@ namespace jit = platform::jit; marco_declare, macro_key, macro_impl) #define FOR_EACH_ISA(macro_, block) \ - macro_(jit::avx512f, block); \ - macro_(jit::avx2, block); \ - macro_(jit::avx, block); \ - macro_(jit::isa_any, block) + macro_(platform::avx512f, block); \ + macro_(platform::avx2, block); \ + macro_(platform::avx, block); \ + macro_(platform::isa_any, block) #define FOR_EACH_BLOCK(macro_, isa) \ macro_(isa, kLT8); \ @@ -168,11 +167,11 @@ namespace jit = platform::jit; macro_(isa, kEQ16); \ macro_(isa, kGT16) -#define FOR_EACH_ISA_BLOCK(macro_) \ - FOR_EACH_BLOCK(macro_, jit::avx512f); \ - FOR_EACH_BLOCK(macro_, jit::avx2); \ - FOR_EACH_BLOCK(macro_, jit::avx); \ - FOR_EACH_BLOCK(macro_, jit::isa_any) +#define 
FOR_EACH_ISA_BLOCK(macro_) \ + FOR_EACH_BLOCK(macro_, platform::avx512f); \ + FOR_EACH_BLOCK(macro_, platform::avx2); \ + FOR_EACH_BLOCK(macro_, platform::avx); \ + FOR_EACH_BLOCK(macro_, platform::isa_any) } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index ed86a47e159..19f7bd89094 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -705,7 +705,7 @@ TEST(JitKernel, pool) { jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); // empty call it to avoid unknown flag 'use_pinned_memory' on Mac - paddle::platform::jit::MayIUse(paddle::platform::jit::avx); + paddle::platform::MayIUse(paddle::platform::avx); const auto& plstm1 = jit::KernelPool::Instance() .template Get, const jit::lstm_attr_t&>(attr); diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index d466f28d1ea..f9a32bfa4c1 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } -namespace jit { #ifdef PADDLE_WITH_XBYAK static Xbyak::util::Cpu cpu; bool MayIUse(const cpu_isa_t cpu_isa) { @@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) { } #endif -} // namespace jit } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index fd31ef77b46..55dba545ff1 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -namespace jit { typedef enum { isa_any, sse42, @@ -55,7 +54,5 @@ typedef enum { // May I use some instruction bool MayIUse(const cpu_isa_t cpu_isa); -} // namespace jit - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 51b46450e41..0d10d82d74a 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__) - if (platform::jit::MayIUse(platform::jit::avx)) { + if (platform::MayIUse(platform::avx)) { #ifndef __AVX__ LOG(WARNING) << "AVX is available, Please re-compile on local machine"; #endif @@ -131,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector devices) { " version or compile from source code." 
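// A note on the guards that follow (illustrative, mirroring the real blocks
// below): each pairs a compile-time check with a run-time check, e.g.
//
//   #ifdef __AVX2__                            // the binary was built with AVX2
//   if (!platform::MayIUse(platform::avx2)) {  // but this CPU cannot run it
//     AVX_GUIDE(AVX2, AVX);                    // warn and suggest a rebuild
//   }
//   #endif
//
// so a binary compiled for a newer ISA fails loudly with a rebuild hint
// instead of dying later on an illegal instruction.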
#ifdef __AVX512F__ - if (!platform::jit::MayIUse(platform::jit::avx512f)) { - if (platform::jit::MayIUse(platform::jit::avx2)) { + if (!platform::MayIUse(platform::avx512f)) { + if (platform::MayIUse(platform::avx2)) { AVX_GUIDE(AVX512, AVX2); - } else if (platform::jit::MayIUse(platform::jit::avx)) { + } else if (platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX512, AVX); } else { AVX_GUIDE(AVX512, NonAVX); @@ -143,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #ifdef __AVX2__ - if (!platform::jit::MayIUse(platform::jit::avx2)) { - if (platform::jit::MayIUse(platform::jit::avx)) { + if (!platform::MayIUse(platform::avx2)) { + if (platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX2, AVX); } else { AVX_GUIDE(AVX2, NonAVX); @@ -153,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #ifdef __AVX__ - if (!platform::jit::MayIUse(platform::jit::avx)) { + if (!platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX, NonAVX); } #endif -- GitLab From a1eb21e704b570eaf5f5bf571f85ffcc2bc613b0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 5 Dec 2018 12:24:19 +0000 Subject: [PATCH 0052/2367] refine names --- paddle/fluid/operators/jitkernels/CMakeLists.txt | 6 ++++-- paddle/fluid/operators/jitkernels/jitcode/jitcode.h | 2 +- .../operators/jitkernels/{kernels.cc => kernel_pool.cc} | 2 +- .../fluid/operators/jitkernels/{kernels.h => kernel_pool.h} | 0 paddle/fluid/operators/jitkernels/more/mkl/mkl.h | 2 +- paddle/fluid/operators/jitkernels/registry.h | 2 +- paddle/fluid/operators/jitkernels/test.cc | 2 +- 7 files changed, 9 insertions(+), 7 deletions(-) rename paddle/fluid/operators/jitkernels/{kernels.cc => kernel_pool.cc} (94%) rename paddle/fluid/operators/jitkernels/{kernels.h => kernel_pool.h} (100%) diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jitkernels/CMakeLists.txt index e82e6c3026f..f6bb3e0712f 100644 --- a/paddle/fluid/operators/jitkernels/CMakeLists.txt +++ b/paddle/fluid/operators/jitkernels/CMakeLists.txt @@ -7,7 +7,9 @@ set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) -cc_library(jit_kernel_base SRCS kernels.cc jitcode_base.cc DEPS ${JIT_KERNEL_DEPS}) +file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") +list(REMOVE_ITEM jit_kernel_cc_srcs jit_test.cc) +cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) add_subdirectory(refer) add_subdirectory(more) @@ -15,5 +17,5 @@ if(WITH_XBYAK) add_subdirectory(jitcode) endif() -cc_library(jit_kernel SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) +cc_library(jit_kernel SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h index a3582e5284c..03c2100ca05 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h @@ -92,7 +92,7 @@ class JitCode : public JitBase, public Xbyak::CodeGenerator { for (int i = 0; i < num_g_abi_regs; ++i) { push(Xbyak::Reg64(g_abi_regs[i])); } - if (platform::jit::MayIUse(platform::jit::avx512f)) { + if (platform::MayIUse(platform::avx512f)) { mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); } } diff --git a/paddle/fluid/operators/jitkernels/kernels.cc b/paddle/fluid/operators/jitkernels/kernel_pool.cc similarity index 94% rename from paddle/fluid/operators/jitkernels/kernels.cc rename to paddle/fluid/operators/jitkernels/kernel_pool.cc 
index 35095220e39..9bb0ba349bc 100644 --- a/paddle/fluid/operators/jitkernels/kernels.cc +++ b/paddle/fluid/operators/jitkernels/kernel_pool.cc @@ -12,7 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/kernels.h" +#include "paddle/fluid/operators/jitkernels/kernel_pool.h" #include // for shared_ptr #include #include diff --git a/paddle/fluid/operators/jitkernels/kernels.h b/paddle/fluid/operators/jitkernels/kernel_pool.h similarity index 100% rename from paddle/fluid/operators/jitkernels/kernels.h rename to paddle/fluid/operators/jitkernels/kernel_pool.h diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h index 7cb4334e503..75ed34ef48e 100644 --- a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h +++ b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h @@ -41,7 +41,7 @@ class VMulKernel VMulKernel() { this->func = VMul; } bool UseMe(int d) const override { if (std::is_same::value) { - return platform::jit::MayIUse(platform::jit::avx512f) && d > 512; + return platform::MayIUse(platform::avx512f) && d > 512; } else { return true; } diff --git a/paddle/fluid/operators/jitkernels/registry.h b/paddle/fluid/operators/jitkernels/registry.h index 62a0de36410..cd414bb096c 100644 --- a/paddle/fluid/operators/jitkernels/registry.h +++ b/paddle/fluid/operators/jitkernels/registry.h @@ -18,7 +18,7 @@ #include #include #include "paddle/fluid/operators/jitkernels/kernel_base.h" -#include "paddle/fluid/operators/jitkernels/kernels.h" +#include "paddle/fluid/operators/jitkernels/kernel_pool.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/variant.h" // for UNUSED diff --git a/paddle/fluid/operators/jitkernels/test.cc b/paddle/fluid/operators/jitkernels/test.cc index d11c7afe9ab..eb0d30eecdb 100644 --- a/paddle/fluid/operators/jitkernels/test.cc +++ b/paddle/fluid/operators/jitkernels/test.cc @@ -19,7 +19,7 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/jitkernels/kernels.h" +#include "paddle/fluid/operators/jitkernels/kernel_pool.h" // TODO(TJ): remove me #include "paddle/fluid/operators/jitkernels/registry.h" -- GitLab From 722b0a805f99015a65b19721b5ad0ca9420a3ba6 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 5 Dec 2018 15:44:41 +0000 Subject: [PATCH 0053/2367] fix bug of trt pool test=develop --- paddle/fluid/inference/tensorrt/convert/pool2d_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 343fd3f7c5a..768318fb063 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -109,8 +109,8 @@ class Pool2dOpConverter : public OpConverter { } if (pool_type == "max") { - nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]); - nvinfer1::DimsHW post_pad(paddings[0], paddings[1]); + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); if (ceil_mode) { // If ceil mode is true, we will pad the appropriate size to the input. 
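      // Output-size arithmetic behind the two rounding modes, for reference:
      //   floor mode: out = (in + 2 * pad - ksize) / stride + 1
      //   ceil mode:  out = (in + 2 * pad - ksize + stride - 1) / stride + 1
      // DealCeilMode below is assumed to enlarge post_pad just enough that
      // TensorRT's floor-based pooling reproduces the ceil-mode output shape.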
DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, -- GitLab From 05208e1f2bd7fb77b5427353edc8c3f28ef9a23b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 6 Dec 2018 13:00:28 +0800 Subject: [PATCH 0054/2367] optimize code test=develop --- paddle/fluid/operators/reader/ctr_reader.cc | 3 ++- paddle/fluid/operators/reader/read_op.cc | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index ca9a58615e0..e8edbf6602c 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -261,7 +261,8 @@ static inline void parse_csv_line( string_split(slot_data, ',', &data_in_slot_str); std::vector data_in_slot; for (auto& data_str : data_in_slot_str) { - (*sparse_datas)[i].push_back(std::stol(data_str)); + auto id = std::stol(data_str); + (*sparse_datas)[i].push_back(id); } } } diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 97faade0428..8fe638ac2fd 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -36,7 +36,6 @@ class ReadInferShape : public framework::InferShapeBase { ctx->SetOutputsDim("Out", reader_dims); auto in_desc = boost::get(ctx->GetInputVarPtrs("Reader")[0]); - std::cout << in_desc->Proto()->SerializeAsString() << std::endl; auto in_lod_levels = in_desc->GetLoDLevels(); auto out_var_ptrs = ctx->GetOutputVarPtrs("Out"); PADDLE_ENFORCE_EQ(in_lod_levels.size(), out_var_ptrs.size(), -- GitLab From 570d89ec84296dd46725be4f854808e0f1fb5f1c Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Thu, 6 Dec 2018 16:52:59 +0800 Subject: [PATCH 0055/2367] add bpr_loss operator , test=develop --- paddle/fluid/operators/bpr_loss_op.cc | 149 ++++++++++++++++++ paddle/fluid/operators/bpr_loss_op.h | 142 +++++++++++++++++ python/paddle/fluid/layers/nn.py | 13 ++ .../fluid/tests/unittests/test_bpr_loss_op.py | 53 +++++++ 4 files changed, 357 insertions(+) create mode 100644 paddle/fluid/operators/bpr_loss_op.cc create mode 100644 paddle/fluid/operators/bpr_loss_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_bpr_loss_op.py diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc new file mode 100644 index 00000000000..3e6445dbc26 --- /dev/null +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/operators/bpr_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class BprLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label_Pos"),
+                   "Input(Label_Pos) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_Pos_dims = ctx->GetInputDim("Label_Pos");
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(
+        rank, label_Pos_dims.size(),
+        "Input(X) and Input(Label_Pos) shall have the same rank.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(label_Pos_dims, 0, rank - 1),
+                      "Input(X) and Input(Label_Pos) shall have the same shape "
+                      "except the last dimension.");
+
+    auto y_dims = x_dims;
+    y_dims[rank - 1] = 1;
+    ctx->SetOutputDim("Y", y_dims);
+    ctx->ShareLoD("X", /*->*/ "Y");
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of Seq-bpr
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input("X")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class BprLossGradientOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label_Pos"),
+                   "Input(Label_Pos) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_pos_dims = ctx->GetInputDim("Label_Pos");
+    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
+                      "Input(Y@Grad) and Input(X) should have the same rank.");
+    PADDLE_ENFORCE_EQ(
+        label_pos_dims.size(), rank,
+        "Input(Label_Pos) and Input(X) should have the same rank.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(label_pos_dims, 0, rank - 1),
+                      "The Input(X) and Input(Label_Pos) should have the same "
+                      "shape except the last dimension.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(dy_dims, 0, rank - 1),
+                      "The Input(X) and Input(Y@Grad) should have the same "
+                      "shape except the last dimension.");
+    PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
+                      "The last dimension of Input(Y@Grad) should be 1.");
+    PADDLE_ENFORCE_EQ(label_pos_dims[rank - 1], 1,
+                      "The last dimension of Input(Label_Pos) should be 1.");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of bpr_loss
+  // is determined by its input "X".
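+  // Note: only CPU kernels are registered for bpr_loss at the bottom of
+  // this file, so the kernel type is pinned to CPUPlace here instead of
+  // being taken from ctx.device_context().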
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input("X")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class BprLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor), a tensor whose last dimension "
+             "size is equal to the number of classes. This input is a "
+             "real number.");
+    AddInput(
+        "Label_Pos",
+        "(Tensor), the tensor which represents the ground truth. It has the "
+        "same shape as 'X' except the last dimension. The last dimension "
+        "size is 1.");
+    AddOutput("Y",
+              "(Tensor, default Tensor), a tensor whose shape is the same "
+              "as 'X' except that the last dimension size is 1. It "
+              "represents the sequence bpr loss.");
+    AddComment(R"DOC(
+BprLoss Operator.
+
+This operator computes a pairwise ranking loss. Label_pos is the desired item.
+The loss at a given point in one session is defined as:
+$Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$
+
+Learn more details by reading the paper .
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPUCtx = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker);
+REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
+REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel,
+                       ops::BprLossOpKernel);
+REGISTER_OP_CPU_KERNEL(bpr_loss_grad,
+                       ops::BprLossGradientOpKernel,
+                       ops::BprLossGradientOpKernel);
diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h
new file mode 100644
index 00000000000..4103686de77
--- /dev/null
+++ b/paddle/fluid/operators/bpr_loss_op.h
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; + return x; + } +}; + +template +class BprLossOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* labels_Pos = ctx.Input("Label_Pos"); + auto* y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + int rank = x->dims().size(); + + Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); + Tensor labels_Pos_2d = framework::ReshapeToMatrix(*labels_Pos, rank - 1); + Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); + + const framework::Tensor* prob = &x_2d; + const framework::Tensor* labels_pos = &labels_Pos_2d; + framework::Tensor* out = &y_2d; + + const int step_size = prob->dims()[0]; + const int class_num = prob->dims()[1]; + const T* prob_data = prob->data(); + T* loss_data = out->data(); + + const int64_t* label_pos_data = labels_pos->data(); + for (int i = 0; i < step_size; ++i) { + int lbl_pos = label_pos_data[i]; + PADDLE_ENFORCE_GE(lbl_pos, 0); + PADDLE_ENFORCE_LT(lbl_pos, class_num); + int index_pos = i * class_num + lbl_pos; + T sum = static_cast(0); + for (int j = 0; j < class_num; j++) { + if (j == lbl_pos) continue; + int index_neg = i * class_num + j; + sum += TolerableValue()(-std::log( + 1.0f + TolerableValue()( + std::exp(prob_data[index_neg] - prob_data[index_pos])))); + } + loss_data[i] = -sum / (class_num - 1); + } + } +}; + +template +class XeGradFunctor { + public: + XeGradFunctor(T* dx, + const T* dy, // NOLINT + const T* x, // NOLINT + const int64_t* label_pos, // NOLINT + size_t num_classes) + : dx_(dx), + dy_(dy), + x_(x), + label_pos_(label_pos), + num_classes_(num_classes) {} + + HOSTDEVICE void operator()(size_t sample_id) { + for (size_t x_offset = sample_id * num_classes_; + x_offset < (sample_id + 1) * num_classes_; ++x_offset) { + dx_[x_offset] = static_cast(0); + } + auto p_index = sample_id * num_classes_ + label_pos_[sample_id]; + for (size_t ni = 0; ni < num_classes_; ni++) { + if (label_pos_[sample_id] == ni) continue; + auto n_index = sample_id * num_classes_ + ni; + auto grad_ = + -dy_[sample_id] / + ((num_classes_ - 1) * + (1.0f + TolerableValue()(std::exp(x_[p_index] - x_[n_index])))); + dx_[p_index] += grad_; + dx_[n_index] -= grad_; + } + } + + private: + T* dx_; + const T* dy_; + const T* x_; + const int64_t* label_pos_; + size_t num_classes_; +}; + +template +class BprLossGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* label_pos = ctx.Input("Label_Pos"); + auto* dx = ctx.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(ctx.GetPlace()); + + int rank = x->dims().size(); + int64_t class_num = x->dims()[rank - 1]; + XeGradFunctor functor(dx_data, dy->data(), x->data(), + label_pos->data(), + static_cast(class_num)); + platform::ForRange for_range( + ctx.template device_context(), + static_cast(dy->numel())); + 
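+  // Derivation for XeGradFunctor above: with
+  //   y = -1/(C-1) * sum_{j != p} log(sigmoid(x_p - x_j)),
+  // each negative class j contributes
+  //   dy/dx_p = -1/(C-1) / (1 + exp(x_p - x_j)),
+  // with the opposite sign for dy/dx_j, all scaled by the upstream
+  // gradient dy_[sample_id]; this is exactly the grad_ accumulated above.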
for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4df74edfceb..6d05ca8461b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -41,6 +41,7 @@ __all__ = [ 'crf_decoding', 'cos_sim', 'cross_entropy', + 'bpr_loss', 'square_error_cost', 'chunk_eval', 'sequence_conv', @@ -1175,6 +1176,18 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100): return out +def bpr_loss(input, label_pos): + + helper = LayerHelper('bpr_loss', **locals()) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='bpr_loss', + inputs={'X': [input], + 'Label_Pos': [label_pos]}, + outputs={'Y': [out]}) + return out + + def square_error_cost(input, label): """ **Square error cost layer** diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py new file mode 100644 index 00000000000..7e18913a03b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py @@ -0,0 +1,53 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest, randomize_probability + + +class TestBprLossOp1(OpTest): + """Test BprLoss with discrete one-hot labels. 
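+
+    The expected loss below mirrors the operator's forward definition:
+        Y[i] = 1/(C-1) * sum_{j != label} log(1 + exp(X[i][j] - X[i][label]))
+    which equals -1/(C-1) * sum_{j != label} log(sigmoid(X[i][label] - X[i][j])).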
+ """ + + def setUp(self): + self.op_type = "bpr_loss" + batch_size = 3 + class_num = 5 + X = randomize_probability(batch_size, class_num, dtype='float64') + label_pos = np.random.randint( + 0, class_num, (batch_size, 1), dtype="int64") + bpr_loss_result = [] + for i in range(batch_size): + sum = 0.0 + for j in range(class_num): + if j == label_pos[i][0]: + continue + sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label_pos[i][0]]))) + bpr_loss_result.append(-sum / (class_num - 1)) + bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64") + self.inputs = {"X": X, "Label_Pos": label_pos} + self.outputs = {"Y": bpr_loss} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", numeric_grad_delta=0.001) + + +if __name__ == "__main__": + unittest.main() -- GitLab From 4cb0100c8ea714e4ce7f8c0cd3c9ebc50aff9e35 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 6 Dec 2018 16:59:53 +0800 Subject: [PATCH 0056/2367] add prefetch in nce --- paddle/fluid/operators/nce_op.cc | 18 +++++ paddle/fluid/operators/nce_op.h | 67 ++++++++++++++++--- .../fluid/transpiler/distribute_transpiler.py | 2 +- 3 files changed, 78 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 9f97f7821dd..06ff825fde3 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -155,6 +155,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("is_sparse", "(boolean, default false) Sparse update.") .SetDefault(false); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); + AddAttr>("custom_neg_classes", "This attribute only be used in unitest. Classes " "in this list wiil be used as negative classes " diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index f2ca6ec247f..8f82f77f501 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -144,15 +146,64 @@ class NCEKernel : public framework::OpKernel { } // forward mul auto input_mat = EigenMatrix::From(*(context.Input("Input"))); - auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - Eigen::Tensor result = - (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * - weight_mat.chip(sample_labels_data[i], 0)) - .sum(); - sample_out_data[i] += result(0); - sample_out_data[i] = (1. / (1. 
+ exp(-sample_out_data[i]))); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + + std::vector labels; + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + labels.push_back(sample_labels_data[i]); + } + std::set st(labels.begin(), labels.end()); + labels.assign(st.begin(), st.end()); + + auto &local_scope = context.scope().NewScope(); + auto height_sections = context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + framework::Variable *ids = local_scope.Var("Ids"); + framework::Variable *weight = local_scope.Var("Weight"); + +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch("Ids", "Weight", table_names, epmap, + height_sections, context); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); + + auto weight_mat = EigenMatrix::From(*(weight->Get())); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + std::vector::iterator it = + std::find(labels.begin(), labels.end(), sample_labels_data[i]); + int idx = std::distance(labels.begin(), it); + + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(idx, 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } +#endif + } else { + auto weight_mat = + EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } } + // forward cost for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { out_data[i] = 0; diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 1d867d91943..817af602bd5 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -239,7 +239,7 @@ class DistributeTranspiler(object): def _get_all_remote_sparse_update_op(self, main_program): sparse_update_ops = [] - sparse_update_op_types = ["lookup_table"] + sparse_update_op_types = ["lookup_table", "nce"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( 'remote_prefetch') is True and not op.attr( -- GitLab From 627a6b8bacc5f4898c1c3c9018fd8e70ef95d8dc Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 6 Dec 2018 17:14:59 +0800 Subject: [PATCH 0057/2367] add prefetch in nce --- paddle/fluid/operators/nce_op.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 8f82f77f501..7397d9f4735 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -26,6 +26,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/sampler.h" #include "unsupported/Eigen/CXX11/Tensor" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -166,8 +170,8 @@ class NCEKernel : public framework::OpKernel { auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - framework::Variable *ids = local_scope.Var("Ids"); - framework::Variable *weight = local_scope.Var("Weight"); + local_scope.Var("Ids"); + local_scope.Var("Weight"); #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch("Ids", "Weight", table_names, epmap, -- GitLab From 7fa2e821e470411b75ba0f53a3759fa007391745 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 6 Dec 2018 17:53:05 +0800 Subject: [PATCH 0058/2367] add local scope in nce --- paddle/fluid/operators/nce_op.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 7397d9f4735..afb14c30713 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -194,6 +194,8 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] += result(0); sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } + + context.scope().DeleteScope(&local_scope); #endif } else { auto weight_mat = -- GitLab From 93551a3440d719f161b6b39309c90cdd19218d75 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Thu, 6 Dec 2018 17:54:19 +0800 Subject: [PATCH 0059/2367] update API.spec --- paddle/fluid/API.spec | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 2722ea078eb..e273a852a95 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -32,13 +32,6 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) 
paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -66,6 +59,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) +paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label_pos'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) @@ -76,7 +70,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -182,7 
+176,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -194,9 +188,6 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -301,7 +292,6 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) 
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -422,17 +412,3 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable -paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) -paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) -paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) -paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) -paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) -paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) -paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) -paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) -paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')) -paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)) -paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)) -- GitLab From b51df398749af98a40d7be49b2a7d1cc7bc3f128 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Thu, 6 Dec 2018 17:58:22 +0800 Subject: [PATCH 0060/2367] update , test=develop --- python/paddle/fluid/tests/unittests/test_bpr_loss_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py index 7e18913a03b..2af6461aedb 100644 --- a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py @@ -25,7 +25,7 @@ class TestBprLossOp1(OpTest): def setUp(self): self.op_type = "bpr_loss" - batch_size = 3 + batch_size = 4 class_num = 5 X = randomize_probability(batch_size, class_num, dtype='float64') label_pos = np.random.randint( -- GitLab From f60e55c04681e7ed900687a42b5ce95f8ba3a6b5 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 6 Dec 2018 19:02:45 +0800 Subject: [PATCH 0061/2367] add ctr_reader to api spec test=develop --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff --git 
a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 26113ee7e90..9d0fad75ba3 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -333,6 +333,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_indexs', 'sparse_slot_indexs', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-- GitLab
From ce674b685f89a66ef0b9163b76b456a9580e5f41 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Thu, 6 Dec 2018 05:24:50 +0000
Subject: [PATCH 0062/2367] add readme doc and complete TODOs
---
 paddle/fluid/operators/jitkernels/README.md | 46 ++++++++++++++++++-
 .../fluid/operators/jitkernels/jitcode_base.h | 6 +--
 .../fluid/operators/jitkernels/kernel_base.h | 13 ++++--
 .../fluid/operators/jitkernels/kernel_pool.h | 14 +-----
 .../fluid/operators/jitkernels/more/mkl/mkl.h | 11 +----
 .../fluid/operators/jitkernels/refer/refer.h | 4 +-
 paddle/fluid/operators/jitkernels/registry.h | 2 +-
 paddle/fluid/operators/jitkernels/test.cc | 8 ++--
 8 files changed, 65 insertions(+), 39 deletions(-)
diff --git a/paddle/fluid/operators/jitkernels/README.md b/paddle/fluid/operators/jitkernels/README.md
index 3401e9be531..fd6428b43ec 100644
--- a/paddle/fluid/operators/jitkernels/README.md
+++ b/paddle/fluid/operators/jitkernels/README.md
@@ -1,4 +1,46 @@
-TBD
+# JIT Kernel
+
+Generates the needed kernel functions by combining function templates with JIT code generation.
+A kernel here is a compute unit at a finer granularity than an Operator's kernel, with more emphasis on performance across different hardware.
+Currently only high-performance computation on CPU is supported.
+
+## Directory structure
+
+```txt
+PaddlePaddle/Paddle/paddle/fluid/
+├── ...
+├── operator/
+│   ├── .../
+└── jit/
+    ├── ...
+    ├── jitcode/
+    │   └── ...
+    ├── more/
+    │   ├── ...
+    │   ├── mkl/
+    │   │   └── ...
+    │   └── openblas/
+    │       └── ...
+    └── refer/
+        └── ...
+```
+
+The base classes live in the root directory, which contains jitcode, more and refer. Each subdirectory holds one kind of implementation; every kernel must have a reference implementation, all the others are optional.
+- jitcode: code generated with JIT, which depends on xbyak. It cares about performance.
+- refer: the reference implementation; every kernel must have a reference implementation on CPU, which cares mainly about the algorithm logic.
+- more: further implementations can go here, including mkl, mkldnn, openblas, or combinations of kernels that already exist.
-# Use me
+## Dynamic lookup
+
+A get method is provided that fetches a kernel by its kernel type. Each implementation declares its own usable range, and the needed kernel function is chosen dynamically from that range and the current conditions.
+
+## Testing
+
+- Logic tests
+  All implementations must be compared against the refer code and meet the precision requirements.
+- Performance tests
+
+# How to add a new kernel
+TBD
+## Use me
 Add USE_JIT_KERNEL(yourname) to CMakefile.
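The dynamic lookup the README describes is exercised by jitkernels/test.cc later in this patch. A minimal sketch of the call pattern, pieced together from that test (assuming float data, the vmul kernel, PlaceType = paddle::platform::CPUPlace, and equal-length std::vector<float> buffers x, y, z; VMulTypes is the trait added in kernel_base.h below):

    namespace jit = paddle::operators::jitkernels;
    int d = 8;  // vector length; for vmul this is the dispatch attribute
    // Reference implementation: always available, used as ground truth.
    auto ref = jit::GetRefer<jit::vmul, jit::VMulTypes<float>::func_type,
                             jit::VMulTypes<float>::attr_type>();
    // Best implementation (jitcode/mkl/refer) for this place and attribute.
    auto tgt = jit::Get<jit::vmul, jit::VMulTypes<float>::func_type,
                        jit::VMulTypes<float>::attr_type, PlaceType>(d);
    tgt(x.data(), y.data(), z.data(), d);  // same signature as ref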
diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jitkernels/jitcode_base.h index ffec62163a7..de8aaf229fe 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.h +++ b/paddle/fluid/operators/jitkernels/jitcode_base.h @@ -62,11 +62,7 @@ class JitBase : public Kernel { }; template -std::unique_ptr CreateJitCode(Attr attr); //{ -// if (UseJitCode) { -// return make_unique(attr, CodeSize()); -// } -// } +std::unique_ptr CreateJitCode(Attr attr); } // namespace jitkernels } // namespace operators diff --git a/paddle/fluid/operators/jitkernels/kernel_base.h b/paddle/fluid/operators/jitkernels/kernel_base.h index eeaa0617cb8..6fbb0f9f7ea 100644 --- a/paddle/fluid/operators/jitkernels/kernel_base.h +++ b/paddle/fluid/operators/jitkernels/kernel_base.h @@ -21,6 +21,13 @@ namespace jitkernels { typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; +template +struct VMulTypes { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, const T*, T*, int); +}; + // Just for adding to kernel pool without template class Kernel { public: @@ -29,10 +36,10 @@ class Kernel { DISABLE_COPY_AND_ASSIGN(Kernel); }; -template // TODO(TJ): use tuple +template class KernelImpl : public Kernel { public: - using ELEMENT_TYPE = T; // TODO(TJ): remove me? + using ELEMENT_TYPE = T; virtual Func GetFunc() const { return func; } virtual bool UseMe(Attr attr) const = 0; @@ -40,7 +47,7 @@ class KernelImpl : public Kernel { Func func{nullptr}; }; -template // TODO(TJ): use tuple +template class ReferKernel : public KernelImpl { public: // Refer code can always be used diff --git a/paddle/fluid/operators/jitkernels/kernel_pool.h b/paddle/fluid/operators/jitkernels/kernel_pool.h index f398093dfe2..901a891cb38 100644 --- a/paddle/fluid/operators/jitkernels/kernel_pool.h +++ b/paddle/fluid/operators/jitkernels/kernel_pool.h @@ -27,8 +27,6 @@ namespace paddle { namespace operators { namespace jitkernels { -// TODO(TJ): rename file to kernel_pool - template class JitCodePool { typedef std::unique_ptr JitBasePtr; @@ -54,14 +52,6 @@ class JitCodePool { DISABLE_COPY_AND_ASSIGN(JitCodePool); }; -// TODO(TJ): std::tuple -// template -// struct KernelAttr { -// typedef T data_type; -// typedef Func return_type; -// typedef Attr attr_type; -// }; - typedef std::unique_ptr KernelPtr; typedef std::unordered_map, KernelKey::Hash> KernelMap; @@ -120,7 +110,6 @@ inline Func GetRefer() { return nullptr; } -// TODO(TJ): make tuple? 
named KernelAttr template const Func Get(Attr attr) { @@ -130,8 +119,7 @@ const Func Get(Attr attr) { return codes.AllKernels().at(key)->template getCode(); } - if (std::is_same::value) { // TODO(TJ): float - // move to create + if (std::is_same::value) { auto p = CreateJitCode(attr); if (p) { auto f = p->template getCode(); diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h index 75ed34ef48e..9cf032db43f 100644 --- a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h +++ b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h @@ -27,16 +27,9 @@ namespace mkl { template void VMul(const T* x, const T* y, T* z, int n); -// template -// struct VMulTypes{ -// typedef T date_type; -// typedef void (*func)(const T*, const T*, T*, int) func_type; -// typedef int attr_type; -// }; - template -class VMulKernel - : public KernelImpl { +class VMulKernel : public KernelImpl::func_type, + typename VMulTypes::attr_type> { public: VMulKernel() { this->func = VMul; } bool UseMe(int d) const override { diff --git a/paddle/fluid/operators/jitkernels/refer/refer.h b/paddle/fluid/operators/jitkernels/refer/refer.h index 163c6d73dce..796f58d4017 100644 --- a/paddle/fluid/operators/jitkernels/refer/refer.h +++ b/paddle/fluid/operators/jitkernels/refer/refer.h @@ -29,8 +29,8 @@ void VMul(const T* x, const T* y, T* z, int n) { } template -class VMulKernel - : public ReferKernel { +class VMulKernel : public ReferKernel::func_type, + typename VMulTypes::attr_type> { public: VMulKernel() { this->func = VMul; } }; diff --git a/paddle/fluid/operators/jitkernels/registry.h b/paddle/fluid/operators/jitkernels/registry.h index cd414bb096c..6d817461bec 100644 --- a/paddle/fluid/operators/jitkernels/registry.h +++ b/paddle/fluid/operators/jitkernels/registry.h @@ -26,7 +26,7 @@ namespace paddle { namespace operators { namespace jitkernels { -// make_unique is supported from c++14 +// make_unique is supported since c++14 template inline std::unique_ptr make_unique(Args&&... 
args) { static_assert(!std::is_array::value, "T must not be array"); diff --git a/paddle/fluid/operators/jitkernels/test.cc b/paddle/fluid/operators/jitkernels/test.cc index eb0d30eecdb..d27b5d1cbae 100644 --- a/paddle/fluid/operators/jitkernels/test.cc +++ b/paddle/fluid/operators/jitkernels/test.cc @@ -69,10 +69,10 @@ TEST(JitKernel, vmul) { namespace jit = paddle::operators::jitkernels; // TODO(TJ): test more vector size for (int d = 1; d < 30; ++d) { - auto ref = jit::GetRefer(); - auto tgt = jit::Get(d); + auto ref = jit::GetRefer::func_type, + jit::VMulTypes::attr_type>(); + auto tgt = jit::Get::func_type, + jit::VMulTypes::attr_type, PlaceType>(d); EXPECT_TRUE(ref != nullptr); EXPECT_TRUE(tgt != nullptr); -- GitLab From c9de6f1b05aa428d5e6ad9c16db5c2ca8c12cdc7 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 6 Dec 2018 21:16:10 +0800 Subject: [PATCH 0063/2367] init parallel graph mode --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 + .../framework/details/all_reduce_op_handle.cc | 28 +++- .../fluid/framework/details/build_strategy.cc | 1 + .../details/computation_op_handle.cc | 12 +- .../framework/details/computation_op_handle.h | 1 + .../framework/details/execution_strategy.h | 2 +- .../details/multi_devices_graph_pass.cc | 8 +- .../fluid/framework/details/op_handle_base.cc | 3 +- .../fluid/framework/details/op_handle_base.h | 1 - .../details/parallel_ssa_graph_executor.cc | 66 ++++++++++ .../details/parallel_ssa_graph_executor.h | 51 +++++++ .../scope_buffered_ssa_graph_executor.cc | 41 +++--- .../scope_buffered_ssa_graph_executor.h | 5 +- .../details/threaded_ssa_graph_executor.h | 1 + paddle/fluid/framework/details/var_handle.cc | 2 +- paddle/fluid/framework/parallel_executor.cc | 124 +++++++++++++----- paddle/fluid/framework/parallel_executor.h | 2 + paddle/fluid/framework/scope.cc | 5 +- paddle/fluid/framework/threadpool.cc | 16 ++- paddle/fluid/framework/threadpool.h | 4 +- paddle/fluid/framework/threadpool_test.cc | 44 +++++++ .../fluid/operators/reader/blocking_queue.h | 3 + .../fluid/operators/reader/buffered_reader.cc | 5 + .../reader/create_double_buffer_reader_op.cc | 14 +- .../operators/reader/create_py_reader_op.cc | 2 + .../fluid/operators/reader/open_files_op.cc | 2 + paddle/fluid/platform/nccl_helper.h | 7 +- paddle/fluid/platform/profiler.cc | 12 +- paddle/fluid/pybind/pybind.cc | 24 ++-- 30 files changed, 399 insertions(+), 91 deletions(-) create mode 100644 paddle/fluid/framework/details/parallel_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/parallel_ssa_graph_executor.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c701a2ad630..b419c8c292a 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -177,7 +177,7 @@ else() endif() cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor variable_helper) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 93288936fea..65247533228 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -54,6 +54,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT cc_library(threaded_ssa_graph_executor 
SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
 simple_threadpool device_context)
+cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor)
+
 cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
 device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index e8bf53e160e..ae17ea8a154 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -46,20 +46,27 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
+  int64_t start_ts = GetTS();
+  int64_t func_ts = GetTS();
+  VLOG(5) << "all_reduce_op_handle::RunImpl start";
   platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
 
   // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
   // this is a distributed or inter-process call, find a better way.
 #ifdef PADDLE_WITH_CUDA
   if (NoDummyInputSize() == 1 &&
-      local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
+      local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) {
 #else
   if (NoDummyInputSize() == 1) {
 #endif
     return;  // No need to all reduce when GPU count = 1;
   } else {
     // Wait input done
+    start_ts = GetTS();
     WaitInputVarGenerated();
+    VLOG(5) << "all_reduce_op_handle wait input var spent: "
+            << GetTS() - start_ts << " (ns).";
+    start_ts = GetTS();
     auto in_var_handles = DynamicCast(this->Inputs());
     auto out_var_handles = DynamicCast(this->Outputs());
     PADDLE_ENFORCE_EQ(
@@ -100,6 +107,8 @@ void AllReduceOpHandle::RunImpl() {
       }
       int dev_id = boost::get(p).device;
+      VLOG(5) << "call allreduce: " << in_var_handles[i]->name_
+              << " on dev: " << dev_id;
       auto &nccl_ctx = nccl_ctxs_->at(dev_id);
       auto stream = nccl_ctx.stream();
       auto comm = nccl_ctx.comm_;
@@ -110,11 +119,20 @@ void AllReduceOpHandle::RunImpl() {
       });
     }
     this->RunAndRecordEvent([&] {
-      platform::NCCLGroupGuard guard;
-      for (auto &call : all_reduce_calls) {
-        call();
+      // TODO(Yancey1989): need allreduce operator to avoid this flag
+      if (nccl_ctxs_->need_group_call_) {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : all_reduce_calls) {
+          call();
+        }
+      } else {
+        // only used in executor_type == ParallelGraph, one thread one GPU
+        // TODO(Yancey1989): use allreduce operator to avoid this trick.
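+        // Design note (sketch of the intent): in ParallelGraph mode each
+        // place runs its own graph in its own thread, so this handle owns
+        // exactly one device and issues a single ncclAllReduce; the NCCL
+        // communicator clique still synchronizes across devices, so no
+        // group guard is needed here.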
+ PADDLE_ENFORCE(all_reduce_calls.size() == 1UL); + all_reduce_calls[0](); } }); + #else PADDLE_THROW("Not compiled with CUDA"); #endif @@ -144,6 +162,8 @@ void AllReduceOpHandle::RunImpl() { } } } + VLOG(5) << "all_reduce_op_handle Impl spent: " << GetTS() - func_ts + << " (ns)."; } std::string AllReduceOpHandle::Name() const { return "all_reduce"; } diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 1e1b945f63c..04c1061536b 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -118,6 +118,7 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { + VLOG(5) << "run pass: " << pass->Type(); if (pass->Type() == "multi_devices_pass") { pass->Erase("places"); pass->SetNotOwned>("places", &places); diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c600..35ba99a8793 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -33,10 +33,18 @@ void ComputationOpHandle::RunImpl() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; - if (is_lock_and_record_event_free_) { + if (Name().compare("conv2d") || Name().compare("conv2d_grad")) { + int64_t start_ts = GetTS(); + auto varname = DynamicCast(this->Outputs())[0]->name_; run_func(); + VLOG(5) << Name() << "_op_handle: " << varname + << " spent: " << GetTS() - start_ts << " (ns)."; } else { - this->RunAndRecordEvent(run_func); + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); + } } } diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 662a91d6b4d..5346b56dd63 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 15c496130c2..d3d5b6bf541 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -20,7 +20,7 @@ namespace framework { namespace details { struct ExecutionStrategy { - enum ExecutorType { kDefault = 0, kExperimental = 1 }; + enum ExecutorType { kDefault = 0, kExperimental = 1, kParallelGraph = 2 }; size_t num_threads_{0}; bool use_cuda_{true}; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index cbae5321d9a..1bd238357a7 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -300,7 +300,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - int num_trainers = Get(kNumTrainers); + // int num_trainers = Get(kNumTrainers); for (auto &node : nodes) { if (node->IsVar() && node->Var()) { @@ -329,6 +329,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( 
std::unordered_map sharded_var_device; for (ir::Node *node : sorted_ops) { + VLOG(5) << "op name: " << node->Op()->Type(); if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { @@ -365,9 +366,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // is true only for the op that scale the final scalar loss. // It also assumes backward op will always follow the forward op in // the block. + VLOG(5) << "this is loss scale op!"; is_forwarding = false; } else { int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); + VLOG(5) << "on device id: " << op_dev_id; if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { @@ -386,7 +389,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + // if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + if (!is_forwarding && nccl_ctxs_->contexts_.size() > 1) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. if (static_cast(boost::get(node->Op()->GetAttr( diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4822627ac3b..d68d1ce71d0 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,6 +41,7 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA + int64_t start_ts = 0; if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; @@ -52,7 +53,6 @@ void OpHandleBase::Run(bool use_cuda) { #else PADDLE_ENFORCE(!use_cuda); #endif - RunImpl(); } @@ -125,6 +125,7 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event + VLOG(5) << "events not empty"; std::function method = callback; for (auto &p : dev_ctxes_) { method = [method, p, this]() { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index ba12ca3c61c..88c78e0678c 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -26,7 +26,6 @@ namespace framework { namespace details { constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; - // Wraps ir::Node and provide helper utilities. // It's responsible for populating necessary fields of ir::Node. class OpHandleBase { diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc new file mode 100644 index 00000000000..72beb74aa48 --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, + std::vector> graphs) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + places_(std::move(places)), + graphs_(std::move(graphs)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + for (size_t i = 0; i < places.size(); ++i) { + std::vector scopes = {local_scopes_[i]}; + std::vector places = {places_[i]}; + executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + strategy_, scopes, places, std::move(graphs_[i]))); + } +} + +FeedFetchList ParallelSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::vector> run_futures; + FeedFetchList fetch_data; + + for (size_t i = 0; i < places_.size(); ++i) { + auto call = [this, i] { + // FIXME(Yancey1989): need to fix fetch data failed. + std::vector empty; + executors_[i]->Run(empty); + }; + if (pool_) { + run_futures.emplace_back(pool_->enqueue(std::move(call))); + } else { + call(); + } + } + if (pool_) { + for (auto &f : run_futures) { + f.wait(); + } + } + return fetch_data; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h new file mode 100644 index 00000000000..c0ba1577f7f --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
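+// [Editor's note] ParallelSSAGraphExecutor::Run() in the .cc above returns
+// an empty FeedFetchList because fetching is still broken (see the FIXME).
+// A minimal sketch of how per-device results could be merged once every
+// sub-executor returns its own FeedFetchList. `per_device_results` is a
+// hypothetical name, and MergeLoDTensor is assumed to behave as it does in
+// the threaded executor's fetch path:
+//
+//   FeedFetchList fetch_data(fetch_tensors.size());
+//   for (size_t j = 0; j < fetch_tensors.size(); ++j) {
+//     std::vector<const LoDTensor *> pieces;
+//     for (auto &res : per_device_results) {
+//       pieces.push_back(&res.at(j));  // j-th fetched tensor of one device
+//     }
+//     fetch_data.at(j).MergeLoDTensor(pieces, platform::CPUPlace());
+//   }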
+ +#pragma once + +#include +#include + +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +class ParallelSSAGraphExecutor : public SSAGraphExecutor { + public: + ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::vector> graphs); + ~ParallelSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::vector places_; + std::vector> graphs_; + std::unique_ptr<::ThreadPool> pool_; + + std::vector> executors_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 499246a9856..abc6b9f559e 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -27,39 +27,40 @@ namespace framework { namespace details { ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector var_infos, std::vector places, + std::vector> var_infos_list, + std::vector places, std::unique_ptr &&underlying_executor) : strategy_(std::move(strategy)), underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), - var_infos_(std::move(var_infos)), + var_infos_list_(std::move(var_infos_list)), places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { // Create local scopes. - for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { - auto &scope = *it; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &scope = local_scopes_[i]; Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; - - for (auto &info : var_infos_) { - if (scope->FindVar(info.name_) != nullptr) { - continue; - } - - if (info.persistable_) { // Persistable - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - InitializeVariable(local_scope.Var(info.name_), info.type_); + for (auto &var_infos : var_infos_list_) { + for (auto &info : var_infos) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); + } } } } } std::vector fetch_data; - std::exception_ptr eptr; + std::exception_ptr eptr = nullptr; try { fetch_data = underlying_executor_->Run(fetch_tensors); } catch (...) { @@ -71,9 +72,13 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( #ifdef PADDLE_WITH_CUDA const std::string gc_name = "garbage_collector"; - DeviceGarbageCollectorMap *gc = - Graph().Has(gc_name) ? &(Graph().Get(gc_name)) - : nullptr; + DeviceGarbageCollectorMap *gc = nullptr; + // FIXME(Yancey1989): need to fix gc failed on parallel graph mode + if (strategy_.type_ != ExecutionStrategy::kParallelGraph) { + gc = Graph().Has(gc_name) + ? 
&(Graph().Get(gc_name))
+             : nullptr;
+  }
#endif

  if (!fetch_tensors.empty() ||
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index 5e87e0bf50b..51230d4a42a 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -38,7 +38,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
 public:
  ScopeBufferedSSAGraphExecutor(
      ExecutionStrategy strategy, std::vector local_scopes,
-      std::vector var_infos, std::vector places,
+      std::vector> var_info_list,
+      std::vector places,
      std::unique_ptr&& underlying_executor);

  const ir::Graph& Graph() const override {
@@ -53,7 +54,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
  ExecutionStrategy strategy_;
  std::unique_ptr underlying_executor_;
  std::vector local_scopes_;
-  std::vector var_infos_;
+  std::vector> var_infos_list_;
  std::vector places_;
};
}  // namespace details
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 24da56c09e3..b45afbc0461 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -24,6 +24,7 @@
#include
#include "ThreadPool.h"  // ThreadPool in third party
#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/fetch_op_handle.h"
diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc
index 30da029ca2a..7de6025a28a 100644
--- a/paddle/fluid/framework/details/var_handle.cc
+++ b/paddle/fluid/framework/details/var_handle.cc
@@ -20,7 +20,7 @@ namespace details {

VarHandleBase::~VarHandleBase() {}

-VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); }
+VarHandle::~VarHandle() { VLOG(5) << "deleting var handle " << DebugString(); }

std::string VarHandle::DebugString() const {
  std::stringstream ss;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b98408ee772..ff3d76fb01c 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -26,6 +26,7 @@ limitations under the License.
*/ #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -53,6 +54,7 @@ class ParallelExecutorPrivate { std::vector local_scopes_; Scope *global_scope_; // not owned std::unique_ptr executor_; + std::vector> executors_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; @@ -84,6 +86,9 @@ ParallelExecutor::ParallelExecutor( PADDLE_ENFORCE(places.size() > 1, "If you set build_strategy.reduce with 'Reduce'," "the number of places must be greater than 1."); + PADDLE_ENFORCE(exec_strategy.type_ != ExecutionStrategy::kParallelGraph, + "You should set build_strategy.reduce with 'AllReduce' for " + "ParallelGraph executor type"); } // Step 1. Bcast the params to devs. @@ -106,31 +111,55 @@ ParallelExecutor::ParallelExecutor( // Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - ncclUniqueId *nccl_id = nullptr; + std::unique_ptr nccl_id = nullptr; + bool need_group_call = true; if (nccl_id_var != nullptr) { - nccl_id = nccl_id_var->GetMutable(); + nccl_id.reset(nccl_id_var->GetMutable()); + } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + nccl_id.reset(new ncclUniqueId()); + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id.get())); + *member_->global_scope_->Var(NCCL_ID_VARNAME) + ->GetMutable() = *nccl_id.get(); + need_group_call = false; + } else { + // init nccl_id in NCCLContextMap } + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id)); + member_->places_, nccl_id.get(), num_trainers, trainer_id, + need_group_call)); #else PADDLE_THROW("Not compiled with CUDA"); #endif } - if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToDevices(bcast_vars); } -// Startup Program has been run. All local scopes has correct parameters. + // Startup Program has been run. All local scopes has correct parameters. -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp + // Step 2. Convert main_program to SSA form and dependency graph. 
+  // ncclOp
+  std::vector> graphs;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  std::unique_ptr graph = build_strategy.Apply(
-      main_program, member_->places_, loss_var_name, params,
-      member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
+  if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      std::unique_ptr graph = build_strategy.Apply(
+          main_program, {member_->places_[i]}, loss_var_name, params,
+          {member_->local_scopes_[i]}, member_->use_cuda_,
+          member_->nccl_ctxs_.get());
+      graphs.push_back(std::move(graph));
+    }
+  } else {
+    std::unique_ptr graph = build_strategy.Apply(
+        main_program, member_->places_, loss_var_name, params,
+        member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
+    graphs.push_back(std::move(graph));
+  }

  auto max_memory_size = GetEagerDeletionThreshold();
-  if (max_memory_size >= 0) {
+  // FIXME(Yancey1989): need to fix on parallel graph mode
+  if (max_memory_size >= 0 &&
+      exec_strategy.type_ != ExecutionStrategy::kParallelGraph) {
    for (auto &place : member_->places_) {
      if (!platform::is_gpu_place(place)) continue;
      auto gpu_place = boost::get(place);
@@ -143,40 +172,48 @@ ParallelExecutor::ParallelExecutor(
      }
    }
    if (!gcs_.empty()) {
-      auto ref_cnt_pass =
-          ir::PassRegistry::Instance().Get("reference_count_pass");
-      ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_);
-      ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_);
-      ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
-      graph = ref_cnt_pass->Apply(std::move(graph));
-      graph->SetNotOwned("garbage_collector", &gcs_);
+      for (size_t i = 0; i < graphs.size(); ++i) {
+        auto ref_cnt_pass =
+            ir::PassRegistry::Instance().Get("reference_count_pass");
+        ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_);
+        ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_);
+        ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
+        graphs[i] = ref_cnt_pass->Apply(std::move(graphs[i]));
+        graphs[i]->SetNotOwned("garbage_collector", &gcs_);
+      }
    }
  }
#else
  std::unique_ptr graph =
      build_strategy.Apply(main_program, member_->places_, loss_var_name,
                           params, member_->local_scopes_, member_->use_cuda_);
+  graphs.push_back(std::move(graph));
#endif

  // Step 3. Create vars in each scope. Passes may also create new vars.
  // skip control vars and empty vars
-  std::vector var_infos;
-  for (auto &node : graph->Nodes()) {
-    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
-      var_infos.emplace_back();
-      var_infos.back().name_ = node->Var()->Name();
-      var_infos.back().type_ = node->Var()->GetType();
-      var_infos.back().persistable_ = node->Var()->Persistable();
+  std::vector> var_infos_list;
+  for (size_t i = 0; i < graphs.size(); ++i) {
+    std::vector var_infos;
+    for (auto &node : graphs[i]->Nodes()) {
+      if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+        var_infos.emplace_back();
+        var_infos.back().name_ = node->Var()->Name();
+        var_infos.back().type_ = node->Var()->GetType();
+        var_infos.back().persistable_ = node->Var()->Persistable();
+      }
    }
+    var_infos_list.emplace_back(std::move(var_infos));
  }
+
+  // If the loss_var_name is given, the number of graphs should be exactly one.
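// [Editor's note] The var-info harvesting loop above reappears verbatim in
// the commented-out block further down. A hedged sketch of a helper that
// could factor it out; `CollectVarInfos` is illustrative and not part of
// this patch:
//
//   static std::vector<details::VariableInfo> CollectVarInfos(
//       const ir::Graph &graph) {
//     std::vector<details::VariableInfo> var_infos;
//     for (auto &node : graph.Nodes()) {
//       // Keep only real variables: skip control-dependency vars and
//       // nodes that carry no VarDesc.
//       if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
//         var_infos.emplace_back();
//         var_infos.back().name_ = node->Var()->Name();
//         var_infos.back().type_ = node->Var()->GetType();
//         var_infos.back().persistable_ = node->Var()->Persistable();
//       }
//     }
//     return var_infos;
//   }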
if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graph); + size_t graph_num = ir::GraphNum(*graphs[0]); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graph) + << ir::GraphNum(*graphs[0]) << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -185,15 +222,42 @@ ParallelExecutor::ParallelExecutor( } if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + /** + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::vector var_infos; + for (auto &node : graphs[i]->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } + } + + std::vector places = {member_->places_[i]}; + std::vector scopes = {member_->local_scopes_[i]}; + std::unique_ptr p(new + details::ThreadedSSAGraphExecutor( + exec_strategy, scopes, places, std::move(graphs[i]))); + + member_->executors_.push_back(std::move(p)); + + member_->executors_[i].reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, scopes, std::move(var_infos), places, + std::move(member_->executors_[i]))); + }**/ member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); + } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + member_->executor_.reset(new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, places, graphs)); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, std::move(var_infos), + exec_strategy, member_->local_scopes_, std::move(var_infos_list), member_->places_, std::move(member_->executor_))); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ef09b98b2aa..319701f1eb8 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -20,6 +20,8 @@ limitations under the License. 
*/

#include
#include

+#include "ThreadPool.h"
+
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/executor.h"
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 0d261dd7ccc..873f68e42e4 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -58,7 +58,10 @@ int64_t GetEagerDeletionThreshold() {
                         (static_cast(1) << 30));
}

-Scope::~Scope() { DropKids(); }
+Scope::~Scope() {
+  VLOG(5) << "~Scope()";
+  DropKids();
+}

Scope& Scope::NewScope() const {
  SCOPE_LOCK_GUARD
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index fcec955360f..7dc7430c55b 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -48,9 +48,18 @@ void ThreadPool::Init() {

ThreadPool::ThreadPool(int num_threads) : running_(true) {
  threads_.resize(num_threads);
-  for (auto& thread : threads_) {
+  for (int i = 0; i < num_threads; ++i) {
+    // for (auto& thread : threads_) {
    // TODO(Yancey1989): bind each worker thread to a specific CPU core
-    thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
+    threads_[i].reset(
+        new std::thread(std::bind(&ThreadPool::TaskLoop, this, i)));
+    /**
+    sched_param sch;
+    int policy;
+    pthread_getschedparam(threads_[i]->native_handle(), &policy, &sch);
+    if (pthread_setschedparam(threads_[i]->native_handle(), SCHED_FIFO, &sch)) {
+      VLOG(1) << "Failed to setschedparam: " << errno;
+    }**/
  }
}

@@ -68,7 +77,7 @@ ThreadPool::~ThreadPool() {
  }
}

-void ThreadPool::TaskLoop() {
+void ThreadPool::TaskLoop(int i) {
  while (true) {
    Task task;

@@ -89,7 +98,6 @@ void ThreadPool::TaskLoop() {
      task = std::move(tasks_.front());
      tasks_.pop();
    }
-    // run the task
    task();
  }
diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
index 7a51d18fbbf..bd8c3cdee8c 100644
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -14,6 +14,7 @@ limitations under the License. */

#pragma once

+#include
#include  // NOLINT
#include
#include  // NOLINT
@@ -27,7 +28,6 @@ limitations under the License. */

namespace paddle {
namespace framework {
-
struct ExceptionHandler {
  mutable std::future> future_;
  explicit ExceptionHandler(
@@ -99,7 +99,7 @@ class ThreadPool {

  // The constructor starts threads to run TaskLoop, which retrieves
  // and runs tasks from the queue.
-  void TaskLoop();
+  void TaskLoop(int i);

  // Init is called by GetInstance.
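// [Editor's note] The TODO in threadpool.cc above asks for CPU pinning of
// the i-th worker. A Linux-only sketch using pthread affinity; this is not
// part of the patch, and pthread_setaffinity_np is a glibc extension:
//
//   #include <pthread.h>
//   #include <sched.h>
//
//   static void PinThreadToCore(std::thread *t, int core_id) {
//     cpu_set_t cpuset;
//     CPU_ZERO(&cpuset);
//     CPU_SET(core_id, &cpuset);
//     int rc = pthread_setaffinity_np(t->native_handle(),
//                                     sizeof(cpu_set_t), &cpuset);
//     if (rc != 0) {
//       VLOG(1) << "Failed to set thread affinity: " << rc;
//     }
//   }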
  static void Init();
diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc
index 884d61e2342..1257a76e3e6 100644
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
@@ -59,3 +59,47 @@ TEST(ThreadPool, ConcurrentRun) {
  }
  EXPECT_EQ(sum, ((n + 1) * n) / 2);
}
+
+static int64_t GetTS() {
+  struct timeval tp;
+  gettimeofday(&tp, NULL);
+  return tp.tv_sec * 1000000 + tp.tv_usec;
+}
+
+void multi_call(std::function call) {
+  for (int i = 0; i < 500; ++i) {
+    call();
+  }
+}
+
+TEST(ThreadPool, PERFORMANCE) {
+  auto sum = [] {
+    int a = 0;
+    for (int i = 0; i < 1000; ++i) {
+      a += i;
+    }
+  };
+  // framework::ThreadPool *pool = new framework::ThreadPool(2);
+  int64_t start = GetTS();
+  for (int i = 0; i < 1000; ++i) {
+    // int64_t s = GetTS();
+    framework::Async(std::move(sum));
+    // pool->Run(std::move(sum));
+    // VLOG(5) << "push to pool spent : " << GetTS() - s << " (us).";
+  }
+  VLOG(5) << "pool spent: " << GetTS() - start << " (us).";
+  start = GetTS();
+  for (int i = 0; i < 1000; ++i) {
+    sum();
+  }
+  VLOG(5) << "sequence call spent: " << GetTS() - start << " (us).";
+  std::vector threads;
+  start = GetTS();
+  for (int i = 0; i < 2; ++i) {
+    std::thread t(multi_call, std::ref(sum));
+    threads.push_back(std::move(t));
+  }
+  for (auto& thread : threads) {
+    thread.join();
+  }
+  VLOG(5) << "two threads spent: " << GetTS() - start << " (us).";
+}
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
index 51b980acb5a..10de11bfa5c 100644
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -67,9 +67,12 @@ class BlockingQueue {
  }

  bool Receive(T* elem) {
+    VLOG(1) << "blocking queue::Receive ...";
    std::unique_lock lock(mutex_);
    receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; });
+    VLOG(1) << "queue_.empty()=" << queue_.empty();
    if (!queue_.empty()) {
+      if (elem == nullptr) VLOG(1) << "elem is nullptr";
      PADDLE_ENFORCE_NOT_NULL(elem);
      *elem = queue_.front();
      if (LIKELY(!speed_test_mode_)) {
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index 26ff221dfa0..2d66000f1f8 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -58,7 +58,9 @@ void BufferedReader::ReadAsync(size_t i) {
      TensorVec &gpu = gpu_buffer_[i];
      gpu.resize(cpu.size());
      for (size_t i = 0; i < cpu.size(); ++i) {
+        VLOG(1) << "launch tensor copy from cpu to gpu, idx: " << i;
        framework::TensorCopySync(cpu[i], place_, &gpu[i]);
+        VLOG(1) << "done " << i;
        gpu[i].set_lod(cpu[i].lod());
      }
    }
@@ -80,11 +82,13 @@ void BufferedReader::StartImpl() {
}

void BufferedReader::ReadNextImpl(std::vector *out) {
+  VLOG(1) << "ReadNextImpl start on place: " << place_;
  if (position_.empty()) {
    out->clear();
    return;
  }
  size_t i = position_.front().get();
+  VLOG(1) << "position front: " << i;
  position_.pop();

  if (i == -1UL) {
@@ -101,6 +105,7 @@ void BufferedReader::ReadNextImpl(std::vector *out) {
    ReadAsync(prev_pos_);
  }
  prev_pos_ = i;
+  VLOG(1) << "success ReadNextImpl";
}

}  // namespace reader
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index ed719f91d09..924c92e0bfd 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -25,9
+25,15 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); + VLOG(1) << "find var in scope: " << &scope; + auto* out_var = scope.FindVar(Output("Out")); + VLOG(1) << "var " << Output("Out") << " -> " << out_var; + auto* out = out_var->GetMutable(); + + // auto* out = scope.Var(Output("Out")) + // ->template GetMutable(); if (out->Get() != nullptr) { + VLOG(1) << Output("Out") << " is not nullptr."; return; } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) @@ -46,9 +52,11 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { sin >> num; place = platform::CUDAPlace(static_cast(num)); } - + VLOG(1) << "create buffered reader on " << place; out->Reset(framework::MakeDecoratedReader(underlying_reader, place, 2)); + VLOG(1) << "Reset Buffered Reader in var: " + << scope.FindVar(Input("UnderlyingReader")); } }; diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 901a92ab5b5..093b0e56b3d 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -28,8 +28,10 @@ class PyReader : public framework::FileReader { } void ReadNext(std::vector* out) override { + VLOG(1) << "come in PyReader::ReadNext function, out: " << out; bool success; *out = queue_->Pop(&success); + VLOG(1) << "call PyReader::ReadNext " << success; if (!success) out->clear(); } diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 38223e06997..ae37a187259 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -115,10 +115,12 @@ class PreemptiveReaderContainer : public IReaderContainer { } void ReadNext(std::vector* out) override { + VLOG(1) << "flag"; if (!pending_.empty()) { auto future_it = complete_queue_.Pop(); FutureItem item = future_it->get(); if (item.exception_) { + VLOG(1) << "item has exception!!!"; for (auto it = futures_.begin(); it != futures_.end(); ++it) { if (it != future_it) { it->wait(); // Wait all other threads complete. diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 7c539d25f6d..53de53f43df 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -82,12 +82,15 @@ struct NCCLContext { struct NCCLContextMap { std::unordered_map contexts_; std::vector order_; + bool need_group_call_; explicit NCCLContextMap(const std::vector &places, ncclUniqueId *nccl_id = nullptr, - size_t num_trainers = 1, size_t trainer_id = 0) { + size_t num_trainers = 1, size_t trainer_id = 0, + bool need_group_call = true) { PADDLE_ENFORCE(!places.empty()); order_.reserve(places.size()); + need_group_call_ = need_group_call; for (auto &p : places) { int dev_id = boost::get(p).device; order_.emplace_back(dev_id); @@ -102,7 +105,7 @@ struct NCCLContextMap { } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. 
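// [Editor's note] Two initialization paths meet below: with a shared
// ncclUniqueId each trainer initializes its ranks inside a group call,
// while a single local trainer can build every communicator at once. A
// standalone sketch of the single-process pattern (device ids are
// illustrative):
//
//   std::vector<int> dev_ids = {0, 1};
//   std::vector<ncclComm_t> comms(dev_ids.size());
//   PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
//       comms.data(), static_cast<int>(dev_ids.size()), dev_ids.data()));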
- if (num_trainers == 1) { + if (num_trainers == 1 && nccl_id != nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 998242fb4a0..040a68f6726 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/port.h" - #include #include #include @@ -25,9 +22,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA + #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/platform/device_tracer.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); @@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) : is_enabled_(false), start_ns_(PosixInNsec()) { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; + std::lock_guard l(profiler_mu); + is_enabled_ = true; dev_ctx_ = dev_ctx; name_ = name; @@ -184,8 +185,9 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) } RecordEvent::~RecordEvent() { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; + VLOG(5) << "call ~RecordEvent"; + std::lock_guard l(profiler_mu); DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fc7991d2974..c313ed2a8b9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -720,6 +720,11 @@ All parameter, weight, gradient are variables in Paddle. )DOC"); + py::enum_(exec_strategy, "ExecutorType") + .value("Default", ExecutionStrategy::ExecutorType::kDefault) + .value("Experimental", ExecutionStrategy::ExecutorType::kExperimental) + .value("ParallelGraph", ExecutionStrategy::ExecutorType::kParallelGraph); + exec_strategy.def(py::init()) .def_property( "num_threads", @@ -777,17 +782,14 @@ All parameter, weight, gradient are variables in Paddle. [](const ExecutionStrategy &self) { return self.dry_run_; }, [](ExecutionStrategy &self, bool dry_run) { self.dry_run_ = dry_run; - }); - - exec_strategy.def_property( - "use_experimental_executor", - [](const ExecutionStrategy &self) { - return self.type_ == ExecutionStrategy::kExperimental; - }, - [](ExecutionStrategy &self, bool experimental) { - self.type_ = experimental ? 
ExecutionStrategy::kExperimental
-                                 : ExecutionStrategy::kDefault;
-        });
+      })
+      .def_property(
+          "executor_type",
+          [](const ExecutionStrategy &self) { return self.type_; },
+          [](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) {
+            self.type_ = type;
+          },
+          R"DOC()DOC");

  py::class_ build_strategy(pe, "BuildStrategy", R"DOC(
BuildStrategy allows the user to more precisely control how to
-- 
GitLab


From 42d1b3f7861becfe5f8179cf8c0902b006b5500c Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Thu, 6 Dec 2018 23:22:38 +0800
Subject: [PATCH 0064/2367] Fix numpy bug on Ubuntu16 and Ubuntu18 which will
 cause a segmentation fault test=develop

---
 python/paddle/dataset/image.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
index 19fc229e6fa..645d54a2601 100644
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -32,11 +32,27 @@ the image layout as follows.

 from __future__ import print_function

+import six
 import numpy as np
-try:
-    import cv2
-except ImportError:
-    cv2 = None
+# NOTE(minqiyang): this is an ugly fix for the numpy bug reported here
+# https://github.com/numpy/numpy/issues/12497
+if six.PY3:
+    import subprocess
+    import sys
+    import_cv2_proc = subprocess.Popen([sys.executable, "-c", "import cv2"],
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE)
+    out, err = import_cv2_proc.communicate()
+    retcode = import_cv2_proc.poll()
+    if retcode != 0:
+        cv2 = None
+    else:
+        import cv2
+else:
+    try:
+        import cv2
+    except ImportError:
+        cv2 = None
 import os
 import tarfile
 import six.moves.cPickle as pickle
-- 
GitLab


From 387bac46b5e4d95e2888773975d1b6c3a906a588 Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Fri, 7 Dec 2018 03:09:43 +0000
Subject: [PATCH 0065/2367] refine code test=develop

---
 .../framework/details/eager_deletion_pass.cc  | 10 +-
 .../fluid/framework/details/op_graph_view.cc  |  2 +
 .../framework/details/reference_count_pass.cc | 14 +-
 .../details/reference_count_pass_helper.h     | 10 +-
 .../scope_buffered_ssa_graph_executor.cc      |  8 +-
 .../scope_buffered_ssa_graph_executor.h       |  2 +-
 paddle/fluid/framework/executor.cc            | 14 +-
 paddle/fluid/framework/executor.h             |  6 +-
 paddle/fluid/framework/parallel_executor.cc   | 153 ++++++++++--------
 .../fluid/operators/controlflow/while_op.cc   | 10 +-
 10 files changed, 122 insertions(+), 107 deletions(-)

diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc
index 3a1b37e5339..85991c71e65 100644
--- a/paddle/fluid/framework/details/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/details/eager_deletion_pass.cc
@@ -31,10 +31,11 @@ std::unique_ptr EagerDeletionPass::ApplyImpl(
  const auto &vars = graph->Get(kGraphVars);

  auto &ref_cnts =
-      Get>(kCurReferenceCount);
+      Get>(kRuntimeReferenceCount);
  const auto &last_live_ops =
      Get>(kLastLiveOpsOfVars);
-  auto &gcs = Get(kGarbageCollector);
+  auto &gcs = Get(kGarbageCollector);
+  const auto &places = Get>(kAllPlaces);

  ref_cnts = std::vector(vars.size());

@@ -58,7 +59,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl(
        graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation);
    auto *eager_deletion_op = new EagerDeletionOpHandle(
        eager_deletion_node, op->GetScope(), op->GetPlace(),
-        std::move(var_names), gcs[op->GetScopeIdx()].get(),
+        std::move(var_names), gcs.at(places[op->GetScopeIdx()]).get(),
        &(ref_cnts[op->GetScopeIdx()]));

    auto it = std::find_if(
@@ -90,6 +91,7 @@ std::unique_ptr
EagerDeletionPass::ApplyImpl( REGISTER_PASS(eager_deletion_pass, paddle::framework::details::EagerDeletionPass) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount) .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kAllPlaces) .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index 4838c4198ff..b6b5ad42c46 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -23,6 +23,8 @@ namespace details { OpGraphView::OpGraphView(const std::vector &ops) { Build(ops); } void OpGraphView::Build(const std::vector &ops) { + preceding_ops_.clear(); + pending_ops_.clear(); for (auto &op : ops) { preceding_ops_[op]; pending_ops_[op]; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 0c096e09800..f2c9dfb5248 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,22 +29,22 @@ namespace paddle { namespace framework { namespace details { -class OpConnectionDetector { +class OpRelationDetector { public: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; - explicit OpConnectionDetector(const std::vector &all_ops) + explicit OpRelationDetector(const std::vector &all_ops) : graph_(all_ops) {} template - OpSet MaxNoDepOps(const OpSet &op_set) { - if (op_set.size() <= 1) return op_set; + OpSet MaxNoDepOps(const OpSet &op_set) const { using KeyType = typename OpSet::key_type; static_assert( std::is_base_of::type>::value, - "Key type of OpSet must be or derived of OpHandleBase"); + "Key type of OpSet must be OpHandleBase, or derived of OpHandleBase"); + if (op_set.size() <= 1) return op_set; std::vector ops(op_set.begin(), op_set.end()); OpSet ret; auto rels = GetRelations(ops); @@ -59,7 +59,7 @@ class OpConnectionDetector { private: std::vector> GetRelations( - const std::vector ops) { + const std::vector ops) const { std::unordered_map op_to_idx; for (size_t i = 0; i < ops.size(); ++i) { PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); @@ -144,7 +144,7 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( last_live_ops_of_vars = std::vector(vars.size()); ref_cnts = std::vector(vars.size()); - OpConnectionDetector detector(ir::FilterByNodeWrapper(*graph)); + OpRelationDetector detector(ir::FilterByNodeWrapper(*graph)); for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index 77846f7bdfc..eb534f97015 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -33,12 +34,13 @@ using ReferenceCountMap = std::unordered_map; using AtomicReferenceCountMap = std::unordered_map>; -using GarbageCollectorList = - std::vector>>; +using GarbageCollectorMap = + std::map>>; -const char kGlobalReferenceCount[] = "reference_count"; -const char kCurReferenceCount[] = "current_reference_count"; +const char kGlobalReferenceCount[] = "global_reference_count"; +const char 
kRuntimeReferenceCount[] = "runtime_reference_count"; const char kGarbageCollector[] = "garbage_collector"; +const char kAllPlaces[] = "all_places"; using LastLiveOpsOfVars = std::unordered_map>; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index da5e277f276..b8775fc3291 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -32,15 +32,15 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( var_infos_(std::move(var_infos)), places_(std::move(places)) { if (Graph().Has(details::kGarbageCollector)) { - gc_ = &(Graph().Get(details::kGarbageCollector)); + gc_ = &(Graph().Get(details::kGarbageCollector)); } } void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { if (gc_) { - for (auto &gc : *gc_) { - gc->Wait(); - gc->Reset(); + for (auto &gc_pair : *gc_) { + gc_pair.second->Wait(); + gc_pair.second->Reset(); } } } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 4d52183a205..6086a219e04 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -60,7 +60,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { std::vector var_infos_; std::vector places_; - GarbageCollectorList* gc_{nullptr}; + GarbageCollectorMap* gc_{nullptr}; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index f443c2d8cf6..04425a59830 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -56,13 +56,7 @@ static std::unordered_map GetNonPersistableReferenceCounts( type != proto::VarType::LOD_TENSOR_ARRAY) { continue; } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } + ++ref_cnts[name]; } } }; @@ -79,8 +73,8 @@ ExecutorPrepareContext::ExecutorPrepareContext( const std::vector& skip_ref_cnt_vars) : prog_(prog), block_id_(block_id) { if (GetEagerDeletionThreshold() >= 0) { - ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), - skip_ref_cnt_vars); + global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), + skip_ref_cnt_vars); } } @@ -443,7 +437,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), - &(ctx->cur_ref_cnts_)); + &(ctx->runtime_ref_cnts_)); } } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 412ebd19045..5a040ac6415 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -34,14 +34,14 @@ struct ExecutorPrepareContext { ~ExecutorPrepareContext(); - void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } + void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; } const framework::ProgramDesc& prog_; size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map global_ref_cnts_; + std::unordered_map runtime_ref_cnts_; }; class Executor { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3d466e44a19..dfd031f1195 100644 --- 
a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -51,11 +51,22 @@ class ParallelExecutorPrivate { } } - void ResetRuntimeReferenceCount() { - for (size_t i = 0; i < rt_ref_cnts_.size(); ++i) { - for (auto &pair : rt_ref_cnts_[i]) { - rt_cur_ref_cnts_[i][pair.first] = pair.second; + std::unique_ptr PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size); + + inline bool HasGarbageCollectors() const { return !gcs_.empty(); } + + void ResetRuntimeReferenceCount(const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + for (size_t i = 0; i < runtime_ref_cnts_.size(); ++i) { + for (auto &pair : global_ref_cnts_[i]) { + runtime_ref_cnts_[i][pair.first] = pair.second; + } + + for (auto &fetch_name : fetch_tensors) { + runtime_ref_cnts_[i].erase(fetch_name); } + runtime_ref_cnts_[i].erase(fetched_var_name); } } @@ -71,14 +82,75 @@ class ParallelExecutorPrivate { bool use_cuda_; bool use_all_reduce_; - // rt_ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, rt_cur_ref_cnts_ is reset to ref_cnts_ - std::vector rt_ref_cnts_; - std::vector rt_cur_ref_cnts_; - details::GarbageCollectorList gcs_; + // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and + // then keeps unchanged + // Before each iteration, runtime_ref_cnts_ is reset to global_ref_cnts_ + std::vector global_ref_cnts_; + std::vector runtime_ref_cnts_; + details::GarbageCollectorMap gcs_; }; +std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size) { + for (size_t i = 0; i < places_.size(); ++i) { + auto &place = places_[i]; + if (gcs_.count(place) > 0) { + continue; + } +#ifdef PADDLE_WITH_CUDA + GarbageCollector *gc = nullptr; + if (platform::is_gpu_place(place)) { + if (IsFastEagerDeletionModeEnabled()) { + gc = new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size); + } else { + gc = new StreamGarbageCollector( + boost::get(place), max_memory_size); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else if (platform::is_cpu_place(place)) { +#endif + gc = new CPUGarbageCollector( + boost::get(place), max_memory_size); + VLOG(10) << "Created GarbageCollector at " << place; +#ifdef PADDLE_WITH_CUDA + } +#endif + + if (gc) { + gcs_[place] = std::unique_ptr>(gc); + } + } + + if (gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &global_ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kRuntimeReferenceCount, + &runtime_ref_cnts_); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + + graph->SetNotOwned(details::kGarbageCollector, &gcs_); + } + + return graph; +} + std::vector &ParallelExecutor::GetLocalScopes() { return member_->local_scopes_; } @@ -153,54 +225,8 @@ 
ParallelExecutor::ParallelExecutor( auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - size_t place_num = member_->places_.size(); - for (size_t i = 0; i < place_num; ++i) { - auto &place = member_->places_[i]; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(place)) { - if (IsFastEagerDeletionModeEnabled()) { - member_->gcs_.emplace_back(new UnsafeFastGPUGarbageCollector( - boost::get(place), max_memory_size)); - } else { - member_->gcs_.emplace_back(new StreamGarbageCollector( - boost::get(place), max_memory_size)); - } - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; - } else if (platform::is_cpu_place(place)) { -#endif - member_->gcs_.emplace_back(new CPUGarbageCollector( - boost::get(place), max_memory_size)); - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#ifdef PADDLE_WITH_CUDA - } -#endif - } - } - - if (!member_->gcs_.empty()) { - std::vector last_live_ops_of_vars; - - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, - &(member_->rt_ref_cnts_)); - ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - graph = ref_cnt_pass->Apply(std::move(graph)); - VLOG(10) << "ReferenceCountPass Applied"; - - auto eager_deletion_pass = - ir::PassRegistry::Instance().Get("eager_deletion_pass"); - eager_deletion_pass->SetNotOwned(details::kCurReferenceCount, - &(member_->rt_cur_ref_cnts_)); - eager_deletion_pass->SetNotOwned(details::kGarbageCollector, - &(member_->gcs_)); - eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - graph = eager_deletion_pass->Apply(std::move(graph)); - VLOG(10) << "EagerDeletionPass Applied"; - - graph->SetNotOwned(details::kGarbageCollector, &(member_->gcs_)); + graph = member_->PrepareGCAndRefCnts(std::move(graph), + static_cast(max_memory_size)); } // Step 3. Create vars in each scope. Passes may also create new vars. 
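// [Editor's note] PrepareGCAndRefCnts() above only runs when
// GetEagerDeletionThreshold() reports a non-negative value. A usage sketch;
// the flag name matches the one used elsewhere in the codebase but should
// be treated as an assumption here:
//
//   FLAGS_eager_delete_tensor_gb = 0.0;  // collect once refcount hits zero
//   // Any positive value lets garbage accumulate up to that many GB
//   // before a collection is triggered.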
@@ -316,15 +342,8 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); - if (!member_->gcs_.empty()) { - member_->ResetRuntimeReferenceCount(); - size_t n = member_->rt_ref_cnts_.size(); - for (size_t i = 0; i < n; ++i) { - for (auto &fetch_name : fetch_tensors) { - member_->rt_cur_ref_cnts_[i].erase(fetch_name); - } - member_->rt_cur_ref_cnts_[i].erase(fetched_var_name); - } + if (member_->HasGarbageCollectors()) { + member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name); } auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index da7cad82d8d..06920a47ee0 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -74,9 +74,7 @@ class WhileOp : public framework::OperatorBase { bool is_test = Attr("is_test"); auto &skip_vars = Attr>(kSkipEagerDeletionVars); - if (framework::GetEagerDeletionThreshold() >= 0) { - VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); - } + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { @@ -144,9 +142,7 @@ class WhileGradOp : public framework::OperatorBase { auto *program = block->Program(); auto &skip_vars = Attr>(kSkipEagerDeletionVars); - if (framework::GetEagerDeletionThreshold() >= 0) { - VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); - } + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = @@ -369,7 +365,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. 
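// [Editor's note] kSkipEagerDeletionVars, logged in the while ops above,
// lists tensors the backward pass still reads, so they must survive eager
// deletion. A minimal sketch of how an op consumes the attribute (names as
// in the diff above):
//
//   auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
//   auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
//   // Vars named in skip_vars are excluded from reference counting.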
while_grad->SetAttr("original_output_grad", output_grads_list); - /* The followi_ng codes are used in eager deletion mode */ + /* The following codes are used in eager deletion mode */ std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { std::unordered_set fwd_skip_vars; -- GitLab From 644baa2e45b64f5a52e237ca1981cb30a5043e0c Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:30:17 +0000 Subject: [PATCH 0066/2367] fix code bug in CPU compilation test=develop --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfd031f1195..fd2bcb8848c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,8 +97,8 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } -#ifdef PADDLE_WITH_CUDA GarbageCollector *gc = nullptr; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { gc = new UnsafeFastGPUGarbageCollector( -- GitLab From 8095fb5e686d3e32f1838dfe7fbf4d0108ef1f25 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:30:17 +0000 Subject: [PATCH 0067/2367] fix code bug in CPU compilation test=develop --- paddle/fluid/framework/parallel_executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfd031f1195..e51b1f1f73e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,8 +97,8 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } -#ifdef PADDLE_WITH_CUDA GarbageCollector *gc = nullptr; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { gc = new UnsafeFastGPUGarbageCollector( @@ -122,7 +122,7 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( } } - if (gcs_.empty()) { + if (!gcs_.empty()) { std::vector last_live_ops_of_vars; auto ref_cnt_pass = -- GitLab From b653ed05163e9f6d47208d5f46bee18ec57a2645 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 7 Dec 2018 13:53:31 +0800 Subject: [PATCH 0068/2367] add prefetch and remvoe selectedrows of bias --- paddle/fluid/operators/nce_op.cc | 8 +-- paddle/fluid/operators/nce_op.h | 47 ++++----------- python/paddle/fluid/layers/nn.py | 9 ++- .../tests/unittests/test_dist_transpiler.py | 59 +++++++++++++++++-- .../fluid/transpiler/distribute_transpiler.py | 3 +- 5 files changed, 75 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 06ff825fde3..0a0be24a540 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -243,24 +243,20 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front(); - auto bias_grad = op_desc.Output(framework::GradVarName("Bias")).front(); auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { - VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to SelectedRows"; 
block->Var(weight_grad) ->SetType(framework::proto::VarType::SELECTED_ROWS); - block->Var(bias_grad)->SetType(framework::proto::VarType::SELECTED_ROWS); } else { - VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to LoDTensor"; block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); - block->Var(bias_grad)->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); - block->Var(bias_grad)->SetDataType(block->Var("Input")->GetDataType()); } }; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index afb14c30713..6567b6534a4 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -297,18 +297,19 @@ class NCEGradKernel : public framework::OpKernel { sample_grad_data[i] *= d_out_data[sample_idx]; } + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T *d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + bool is_sparse = context.Attr("is_sparse"); if (!is_sparse) { - // get d_bias - auto d_bias = context.Output(framework::GradVarName("Bias")); - if (d_bias != nullptr) { - T *d_bias_data = d_bias->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; - } - } // get d_w auto d_w = context.Output(framework::GradVarName("Weight")); if (d_w != nullptr) { @@ -330,34 +331,6 @@ class NCEGradKernel : public framework::OpKernel { std::set st(labels.begin(), labels.end()); labels.assign(st.begin(), st.end()); - auto *bias_var = context.InputVar("Bias"); - DDim bias_dim; - if (bias_var->IsType()) { - bias_dim = context.Input("Bias")->dims(); - } else if (bias_var->IsType()) { - auto *table_t = context.Input("Bias"); - bias_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter Bias of a NCE_OP " - "must be either LoDTensor or SelectedRows"); - } - - auto d_bias = - context.Output(framework::GradVarName("Bias")); - d_bias->set_rows(labels); - d_bias->set_height(bias_dim[0]); - - d_bias->mutable_value()->Resize( - {static_cast(labels.size()), bias_dim[1]}); - T *d_bias_data = - d_bias->mutable_value()->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + labels.size(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[d_bias->Index(sample_labels_data[i])] += - sample_grad_data[i]; - } - auto *table_var = context.InputVar("Weight"); DDim table_dim; if (table_var->IsType()) { diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 28b8ae895af..9401ffc2b12 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -24,7 +24,7 @@ from ..initializer import Normal, Constant from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ -from .tensor import concat +from .tensor import concat, assign from . import utils from .. 
import unique_name from functools import reduce @@ -4770,12 +4770,17 @@ def nce(input, else: num_neg_samples = int(num_neg_samples) + remote_prefetch = False + if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'): + remote_prefetch = True + attrs = { 'num_total_classes': int(num_total_classes), 'num_neg_samples': num_neg_samples, 'seed': seed, 'sampler': sampler, - 'is_sparse': is_sparse + 'is_sparse': is_sparse, + 'remote_prefetch': remote_prefetch } helper.append_op( diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 194387bc987..48bac52654a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -14,14 +14,15 @@ from __future__ import print_function +import traceback import math +import collections +import six import unittest +import numpy as np + import paddle.fluid as fluid -from paddle.fluid.transpiler.distribute_transpiler import delete_ops -import traceback -import collections -import six class TranspilerTest(unittest.TestCase): @@ -823,5 +824,55 @@ class TestRemoteLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) +# test for remote prefetch +class TestRemoteNce(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 20 + sampler = "uniform" + nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') + + input = fluid.layers.data(name="input", shape=[10], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='nce_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 1], + dtype='float32', + name='nce_b', + initializer=fluid.initializer.ConstantInitializer()) + + cost = fluid.layers.nce(input=input, + label=label, + num_total_classes=num_total_classes, + sampler=sampler, + custom_dist=nid_freq_arr.tolist(), + sample_weight=None, + param_attr='nce_w', + bias_attr='nce_b', + seed=1, + num_neg_samples=5, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + for op in trainer.blocks[0].ops: + if op.type == "recv": + pass + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 817af602bd5..9c526a0d8e0 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -242,8 +242,7 @@ class DistributeTranspiler(object): sparse_update_op_types = ["lookup_table", "nce"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( - 'remote_prefetch') is True and not op.attr( - 'is_distributed'): + 'remote_prefetch') is True: sparse_update_ops.append(op) return sparse_update_ops -- GitLab From 575ae7c6c3133df589cfe6c1a9d9e45e6bfc99c5 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Fri, 7 Dec 
2018 14:30:10 +0800 Subject: [PATCH 0069/2367] refine pslib inferface & fix some bugs --- CMakeLists.txt | 4 +- paddle/fluid/framework/async_executor.cc | 29 +++++-- paddle/fluid/framework/async_executor.h | 6 +- .../fluid/framework/executor_thread_worker.cc | 31 ++++---- paddle/fluid/pybind/async_executor_py.cc | 4 +- python/paddle/fluid/async_executor.py | 11 ++- python/paddle/fluid/distributed/downpour.py | 2 +- python/paddle/fluid/distributed/ps_pb2.py | 78 +++++++++---------- 8 files changed, 100 insertions(+), 65 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6fd8dd1dfaf..5b5bf6c5b6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,7 +217,7 @@ include(cupti) include(external/gzstream) endif (NOT WIN32) include(external/libmct) -#include(external/pslib_brpc) +include(external/pslib_brpc) include(external/pslib) if(WITH_DISTRIBUTE) @@ -280,7 +280,7 @@ set(EXTERNAL_LIBS zlib ${PYTHON_LIBRARIES} pslib - #pslib_brpc + pslib_brpc libmct ) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 292b05c5884..7685883dd5e 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -65,18 +65,35 @@ void PrepareReaders(std::vector>& readers, // NOLINT readers[0]->SetFileList(filelist); } -void AsyncExecutor::ConfigPslib(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index) { +void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); - _pslib_ptr->init_and_config(dist_desc, host_sign_list, node_num, index);//TODO done + _pslib_ptr->init_server(dist_desc, index);//TODO done + + InitParamConfig(); } -void AsyncExecutor::StartServer() { +void AsyncExecutor::InitWorker(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index) { + _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); + _pslib_ptr->init_worker(dist_desc, host_sign_list.data(), node_num, index);//TODO done + InitParamConfig(); - _pslib_ptr->run_server(); +} + +uint64_t AsyncExecutor::StartServer() { + return _pslib_ptr->run_server(); +} + +void AsyncExecutor::GatherServers(std::vector& host_sign_list, int node_num) { + _pslib_ptr->gather_servers(host_sign_list.data(), node_num); } void AsyncExecutor::InitParamConfig() { - _param_config.fea_dim = _pslib_ptr->get_param()->trainer_param().sparse_table(0).feature_dim(); //TODO + for (int i = 0; i < _pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param_size(); ++i) { + if (_pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param(i).table_class().find("SparseTable") != -1) { + _param_config.fea_dim = _pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param(i).accessor().fea_dim(); //TODO + break; + } + } _param_config.slot_dim = _param_config.fea_dim - 2; //TODO _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().pull_dense_per_batch()); _param_config.tmp_push_sparse_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); @@ -176,6 +193,7 @@ void AsyncExecutor::PrepareDenseThread() { param.dense_params = &_param_config.dense_variable_name; _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); + _pull_dense_thread->start(); } @@ -238,6 +256,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, fetch_var_names, root_scope_, thidx, debug); } + // start 
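Taken together, this refactor splits the old ConfigPslib entry point into an explicit server path (InitServer/StartServer) and worker path (InitWorker/GatherServers). A hedged sketch of the intended launch sequence through the Python wrapper introduced later in this patch; running both roles in one process and the out-of-band exchange of host signatures are assumptions made for illustration:

    import paddle.fluid as fluid

    server = fluid.AsyncExecutor()
    server.init_server('ps.prototxt', 0)   # parse pslib config, node index 0
    sign = server.start_server()           # returns this server's host signature
    host_sign_list = [sign]                # real jobs gather this from all servers
    server.gather_servers(host_sign_list, len(host_sign_list))

    worker = fluid.AsyncExecutor()
    worker.init_worker('ps.prototxt', host_sign_list,
                       len(host_sign_list), 0)  # nodes_cnt, index
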
executing ops in multiple threads for (int thidx = 0; thidx < actual_thread_num; ++thidx) { threads.push_back( diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 21e4a66fcef..90d6b46b2f9 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -63,9 +63,11 @@ class AsyncExecutor { const std::vector& fetch_names, const bool debug = false); //void ConfigPslib(const char* dist_desc, uint64_t* host_sign_list, int node_num, int index); - void ConfigPslib(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index); + void InitServer(const std::string& dist_desc, int index); + void InitWorker(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index); //void ConfigWorker() {} - void StartServer(); + uint64_t StartServer(); + void GatherServers(std::vector& host_sign_list, int node_num); void InitModel(); void SaveModel(const std::string& path); void InitParamConfig(); diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index f7c05e400d7..e0ee9c11c90 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -345,9 +345,12 @@ void AsyncExecutorThreadWorker::TrainOneNetwork() { if (op->Type().find("sgd") != std::string::npos) { continue; } + if (op->Type().find("lookup_table") != std::string::npos || + op->Type().find("lookup_table_grad") != std::string::npos) { + continue; + } op->Run(*thread_scope_, place_); } - UpdateParams(); } @@ -416,8 +419,8 @@ void AsyncExecutorThreadWorker::UpdateParams() { for (auto i: _param_config->dense_table_id) { PushDense(i); } - int32_t tmp_push_dense_wait_times = _param_config->tmp_push_dense_wait_times; //TODO - int32_t tmp_push_sparse_wait_times = _param_config->tmp_push_sparse_wait_times; //TODO + int32_t tmp_push_dense_wait_times = -1;//_param_config->tmp_push_dense_wait_times; //TODO + int32_t tmp_push_sparse_wait_times = -1;//_param_config->tmp_push_sparse_wait_times; //TODO static uint32_t push_dense_wait_times = static_cast(tmp_push_dense_wait_times); static uint32_t push_sparse_wait_times = static_cast(tmp_push_sparse_wait_times); @@ -430,7 +433,6 @@ void AsyncExecutorThreadWorker::UpdateParams() { if (tmp_push_dense_wait_times == -1) { _push_dense_status.resize(0); } - if (_push_sparse_status.size() >= push_sparse_wait_times) { for (auto& t : _push_sparse_status) { t.wait(); @@ -440,7 +442,6 @@ void AsyncExecutorThreadWorker::UpdateParams() { if (tmp_push_sparse_wait_times == -1) { _push_sparse_status.resize(0); } - //for (auto dense_table_id : GlobalConfig::instance().dense_table_id) {//TODO for (auto dense_table_id: _param_config->dense_table_id) { _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); @@ -451,8 +452,8 @@ void AsyncExecutorThreadWorker::UpdateParams() { void AsyncExecutorThreadWorker::PushDense(int table_id) { std::vector regions; //auto& variables = GlobalConfig::instance().dense_gradient_variable_name[table_id]; - std::vector variables; - for (auto& t : variables) { + //std::vector variables; + for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { Variable* var = thread_scope_->FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; LoDTensor* tensor = var->GetMutable(); @@ -469,7 +470,6 @@ void AsyncExecutorThreadWorker::PushDense(int table_id) { void AsyncExecutorThreadWorker::PullSparse(int table_id) { - auto& features = 
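The worker-side hunks here center on two things: PullSparse gathers every feed slot's int64 feature signs (slot 0 is the label and is skipped) before issuing the asynchronous pull, and FillSparse, just below, now re-allocates the embedding output as a zeroed [len, slot_dim] buffer carrying the id tensor's LoD instead of writing through a stale data() pointer. A numpy sketch of that fill; keying pulled values by feature sign and leaving missing rows at zero are simplifying assumptions, not the kernel's exact indexing:

    import numpy as np

    def fill_sparse(ids, pulled, slot_dim):
        # ids: int64 feature signs for one slot, shape [len]
        # pulled: assumed dict of feature sign -> pulled float vector
        out = np.zeros((len(ids), slot_dim), dtype=np.float32)  # the memset
        for row, sign in enumerate(ids):
            if sign in pulled:            # unknown rows stay zero
                out[row, :] = pulled[sign][:slot_dim]
        return out
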
_features[table_id]; auto& feature_value = _feature_value[table_id]; auto fea_dim = _param_config->fea_dim; //TODO @@ -477,7 +477,6 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { features.clear(); features.resize(0); features.reserve(MAX_FEASIGN_NUM); - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { @@ -493,14 +492,14 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { features.push_back(static_cast(ids[i])); } } - check_pull_push_memory(features, feature_value, fea_dim); std::vector pull_feature_value; for (auto i = 0u; i < features.size(); ++i) { pull_feature_value.push_back(feature_value[i].data()); } - + for (int i = 0; i < features.size(); ++i) { + } auto status = _pslib_ptr->_worker_ptr->pull_sparse( pull_feature_value.data(), table_id, features.data(), features.size()); _pull_sparse_status.push_back(std::move(status)); @@ -532,10 +531,15 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { LoDTensor* tensor = var->GetMutable(); int64_t* ids = tensor->data(); int len = tensor->numel(); - Variable* var_emb = thread_scope_->FindVar(_param_config->slot_input_vec[table_id][slot_idx - 1]); LoDTensor* tensor_emb = var_emb->GetMutable(); - float* ptr = tensor_emb->data(); + float* ptr = tensor_emb->mutable_data({len, slot_dim}, platform::CPUPlace()); + memset(ptr, 0, sizeof(float) * len * slot_dim); + auto& tensor_lod = tensor->lod()[0]; + + LoD data_lod{tensor_lod}; + tensor_emb->set_lod(data_lod); + //float* ptr = tensor_emb->data(); for (auto index = 0u; index < len; ++index){ //if (_current_train_job.use_cvm_feature()) { @@ -576,7 +580,6 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { //} const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index 63fd06224f0..eca46fbad55 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -48,8 +48,10 @@ void BindAsyncExecutor(py::module* m) { new framework::AsyncExecutor(scope, place)); })) .def("run_from_files", &framework::AsyncExecutor::RunFromFile) - .def("config_pslib", &framework::AsyncExecutor::ConfigPslib) + .def("init_server", &framework::AsyncExecutor::InitServer) + .def("init_worker", &framework::AsyncExecutor::InitWorker) .def("start_server", &framework::AsyncExecutor::StartServer) + .def("gather_servers", &framework::AsyncExecutor::GatherServers) .def("init_model", &framework::AsyncExecutor::InitModel) .def("save_model", &framework::AsyncExecutor::SaveModel); } // end BindAsyncExecutor diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index c5863eb9e05..f667ff24246 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -158,8 +158,17 @@ class AsyncExecutor(object): return + def init_server(self, filename, index): + self.executor.init_server(filename, index) + + def init_worker(self, filename, ips, nodes_cnt, index): + self.executor.init_worker(filename, ips, nodes_cnt, index) + def start_server(self): - self.executor.start_server() + return self.executor.start_server() + + def gather_servers(self, ips, nodes_cnt): + self.executor.gather_servers(ips, nodes_cnt) def 
init_model(self): self.executor.init_model() diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 093792d5d60..3d940b62b01 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -56,7 +56,7 @@ class DownpourSGD(object): params_grads[0], params_grads[1]) ps_param = pslib.PSParameter() ps_param.server_param.CopyFrom(server.get_desc()) - ps_param.worker_param.CopyFrom(worker.get_desc()) + ps_param.trainer_param.CopyFrom(worker.get_desc()) # Todo(guru4elephant): figure out how to support more sparse parameters # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py index 0ef34d6e189..f33ec50f7d2 100644 --- a/python/paddle/fluid/distributed/ps_pb2.py +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -20,7 +20,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='ps.proto', package='paddle', syntax='proto2', - serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\x91\x01\n\x16ServerServiceParameter\x12\x14\n\x0cserver_class\x18\x01 \x01(\t\x12\x14\n\x0c\x63lient_class\x18\x02 \x01(\t\x12\x15\n\rservice_class\x18\x03 \x01(\t\x12\x19\n\x11start_server_port\x18\x04 \x01(\r\x12\x19\n\x11server_thread_num\x18\x05 \x01(\r\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 
\x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 
\x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') + serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd1\x01\n\x16ServerServiceParameter\x12(\n\x0cserver_class\x18\x01 \x01(\t:\x12\x41\x62\x61\x63usBrpcPsServer\x12(\n\x0c\x63lient_class\x18\x02 \x01(\t:\x12\x41\x62\x61\x63usBrpcPsClient\x12&\n\rservice_class\x18\x03 \x01(\t:\x0f\x41\x62\x61\x63usPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 
\x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 
\x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') ) _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -41,8 +41,8 @@ _TABLETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3198, - serialized_end=3250, + serialized_start=3262, + serialized_end=3314, ) _sym_db.RegisterEnumDescriptor(_TABLETYPE) @@ -108,8 +108,8 @@ _PSCMDID = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3253, - serialized_end=3570, + serialized_start=3317, + serialized_end=3634, ) _sym_db.RegisterEnumDescriptor(_PSCMDID) @@ -148,8 +148,8 @@ _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3166, - serialized_end=3196, + serialized_start=3230, + serialized_end=3260, ) _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) @@ -531,35 +531,35 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( _descriptor.FieldDescriptor( name='server_class', full_name='paddle.ServerServiceParameter.server_class', index=0, number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=True, default_value=_b("AbacusBrpcPsServer").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='client_class', full_name='paddle.ServerServiceParameter.client_class', index=1, number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=True, default_value=_b("AbacusBrpcPsClient").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='service_class', full_name='paddle.ServerServiceParameter.service_class', index=2, number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=True, default_value=_b("AbacusPsService").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='start_server_port', full_name='paddle.ServerServiceParameter.start_server_port', index=3, number=4, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, + has_default_value=True, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='server_thread_num', full_name='paddle.ServerServiceParameter.server_thread_num', index=4, number=5, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, + has_default_value=True, 
default_value=12, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), @@ -576,7 +576,7 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( oneofs=[ ], serialized_start=1134, - serialized_end=1279, + serialized_end=1343, ) @@ -641,8 +641,8 @@ _TABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1282, - serialized_end=1473, + serialized_start=1346, + serialized_end=1537, ) @@ -721,8 +721,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1476, - serialized_end=1845, + serialized_start=1540, + serialized_end=1909, ) @@ -794,8 +794,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1848, - serialized_end=2054, + serialized_start=1912, + serialized_end=2118, ) @@ -839,8 +839,8 @@ _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2056, - serialized_end=2139, + serialized_start=2120, + serialized_end=2203, ) @@ -898,8 +898,8 @@ _PSREQUESTMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2141, - serialized_end=2242, + serialized_start=2205, + serialized_end=2306, ) @@ -950,8 +950,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2244, - serialized_end=2363, + serialized_start=2308, + serialized_end=2427, ) @@ -1009,8 +1009,8 @@ _DENSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2366, - serialized_end=2591, + serialized_start=2430, + serialized_end=2655, ) @@ -1068,8 +1068,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2594, - serialized_end=2728, + serialized_start=2658, + serialized_end=2792, ) @@ -1106,8 +1106,8 @@ _NAIVESGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2730, - serialized_end=2796, + serialized_start=2794, + serialized_end=2860, ) @@ -1137,8 +1137,8 @@ _SUMMARYSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2798, - serialized_end=2857, + serialized_start=2862, + serialized_end=2921, ) @@ -1168,8 +1168,8 @@ _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2859, - serialized_end=2905, + serialized_start=2923, + serialized_end=2969, ) @@ -1213,8 +1213,8 @@ _PSRESPONSEMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2907, - serialized_end=2980, + serialized_start=2971, + serialized_end=3044, ) @@ -1287,8 +1287,8 @@ _FSCLIENTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2983, - serialized_end=3196, + serialized_start=3047, + serialized_end=3260, ) _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER -- GitLab From cb8a24be14f04c23fbc206d8c8537ff365b4e6bc Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 16:11:16 +0800 Subject: [PATCH 0070/2367] clean code --- .../framework/details/all_reduce_op_handle.cc | 12 +------ .../details/computation_op_handle.cc | 12 ++----- .../fluid/framework/details/op_handle_base.cc | 2 -- .../details/parallel_ssa_graph_executor.cc | 13 +++---- .../details/parallel_ssa_graph_executor.h | 4 +-- paddle/fluid/framework/parallel_executor.cc | 35 ++++--------------- paddle/fluid/platform/nccl_helper.h | 2 +- 7 files changed, 20 
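The regenerated ps_pb2 above is noisier than it is deep: the serialized offsets shift only because ServerServiceParameter gained default values. A quick check of what those defaults buy, plus the one-line PSParameter fix from downpour.py (worker descriptions now land in trainer_param rather than worker_param); the import path is assumed from the file's location:

    from paddle.fluid.distributed import ps_pb2 as pslib

    svc = pslib.ServerServiceParameter()
    assert svc.server_class == 'AbacusBrpcPsServer'   # was ''
    assert svc.client_class == 'AbacusBrpcPsClient'   # was ''
    assert svc.service_class == 'AbacusPsService'     # was ''
    assert svc.server_thread_num == 12                # was 0

    ps_param = pslib.PSParameter()
    # With DownpourServer/DownpourWorker instances in hand (not shown):
    # ps_param.server_param.CopyFrom(server.get_desc())
    # ps_param.trainer_param.CopyFrom(worker.get_desc())  # the fixed field
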
insertions(+), 60 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index ae17ea8a154..ae203387462 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -46,9 +46,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - int64_t start_ts = GetTS(); - int64_t func_ts = GetTS(); - VLOG(5) << "all_reduce_op_handle::RunImpl start"; platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, @@ -62,11 +59,7 @@ void AllReduceOpHandle::RunImpl() { return; // No need to all reduce when GPU count = 1; } else { // Wait input done - start_ts = GetTS(); WaitInputVarGenerated(); - VLOG(5) << "all_reduce_op_handle wait input var spent: " - << GetTS() - start_ts << " (ns)."; - start_ts = GetTS(); auto in_var_handles = DynamicCast(this->Inputs()); auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ( @@ -107,8 +100,6 @@ void AllReduceOpHandle::RunImpl() { } int dev_id = boost::get(p).device; - VLOG(5) << "call allreduce: " << in_var_handles[i]->name_ - << " on dev: " << dev_id; auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -118,6 +109,7 @@ void AllReduceOpHandle::RunImpl() { ncclSum, comm, stream)); }); } + this->RunAndRecordEvent([&] { // TODO(Yancey1989): need allreduce operator to avoid this flag if (nccl_ctxs_->need_group_call_) { @@ -162,8 +154,6 @@ void AllReduceOpHandle::RunImpl() { } } } - VLOG(5) << "all_reduce_op_handle Impl spent: " << GetTS() - func_ts - << " (ns)."; } std::string AllReduceOpHandle::Name() const { return "all_reduce"; } diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 35ba99a8793..7ad1e40c600 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -33,18 +33,10 @@ void ComputationOpHandle::RunImpl() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; - if (Name().compare("conv2d") || Name().compare("conv2d_grad")) { - int64_t start_ts = GetTS(); - auto varname = DynamicCast(this->Outputs())[0]->name_; + if (is_lock_and_record_event_free_) { run_func(); - VLOG(5) << Name() << "_op_handle: " << varname - << " spent: " << GetTS() - start_ts << " (ns)."; } else { - if (is_lock_and_record_event_free_) { - run_func(); - } else { - this->RunAndRecordEvent(run_func); - } + this->RunAndRecordEvent(run_func); } } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index d68d1ce71d0..4914e0a5ad3 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,7 +41,6 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - int64_t start_ts = 0; if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; @@ -125,7 +124,6 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event - VLOG(5) << "events not empty"; std::function method = callback; for (auto &p : dev_ctxes_) { method = [method, p, this]() { diff --git 
a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 72beb74aa48..dfb40721d88 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -21,19 +21,20 @@ namespace details { ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> graphs) + std::vector> &&graphs) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graphs_(std::move(graphs)), - pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr) { + graphs_(std::move(graphs)) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + // do not use threadpool for each graph execution. + strategy_.num_threads_ = 1UL; for (size_t i = 0; i < places.size(); ++i) { - std::vector scopes = {local_scopes_[i]}; - std::vector places = {places_[i]}; executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, scopes, places, std::move(graphs_[i]))); + strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); } + VLOG(1) << "pool size: " << places_.size(); } FeedFetchList ParallelSSAGraphExecutor::Run( diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index c0ba1577f7f..37784775f03 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -30,7 +30,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> graphs); + std::vector> &&graphs); ~ParallelSSAGraphExecutor() final = default; const ir::Graph &Graph() const override { return *graphs_[0]; } @@ -39,9 +39,9 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { private: ExecutionStrategy strategy_; std::vector local_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; std::vector> graphs_; - std::unique_ptr<::ThreadPool> pool_; std::vector> executors_; }; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ff3d76fb01c..186f0cb8034 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -54,7 +54,6 @@ class ParallelExecutorPrivate { std::vector local_scopes_; Scope *global_scope_; // not owned std::unique_ptr executor_; - std::vector> executors_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; @@ -142,6 +141,7 @@ ParallelExecutor::ParallelExecutor( std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + VLOG(1) << "kParallelGraph mode!!"; for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, params, @@ -222,38 +222,17 @@ ParallelExecutor::ParallelExecutor( } if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - /** - for (size_t i = 0; i < member_->places_.size(); ++i) { - std::vector var_infos; - for (auto &node : graphs[i]->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - 
var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - } - } - - std::vector places = {member_->places_[i]}; - std::vector scopes = {member_->local_scopes_[i]}; - std::unique_ptr p(new - details::ThreadedSSAGraphExecutor( - exec_strategy, scopes, places, std::move(graphs[i]))); - - member_->executors_.push_back(std::move(p)); - - member_->executors_[i].reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, scopes, std::move(var_infos), places, - std::move(member_->executors_[i]))); - }**/ member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { member_->executor_.reset(new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, graphs)); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs))); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 53de53f43df..23a0222239a 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -105,7 +105,7 @@ struct NCCLContextMap { } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. 
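Two behavioural points hide in this clean-up. First, ParallelSSAGraphExecutor now builds one single-threaded ThreadedSSAGraphExecutor per place (num_threads_ is forced to 1) and fans them out over a thread pool sized to the number of places. Second, in the nccl_helper hunk just below, the guard is inverted: a fresh local communicator set is created when num_trainers == 1 and no external nccl_id was passed in, which is the actual single-trainer case; the old test against nullptr had it backwards. A Python analogue of the per-place fan-out, purely illustrative:

    from concurrent.futures import ThreadPoolExecutor

    class ParallelGraphRunner(object):
        # Illustrative analogue of ParallelSSAGraphExecutor: one runner
        # per device, one pool slot each; single-device runs stay inline.
        def __init__(self, per_place_runners):
            self.runners = per_place_runners
            self.pool = (ThreadPoolExecutor(len(per_place_runners))
                         if len(per_place_runners) >= 2 else None)

        def run(self, fetch_list):
            if self.pool is None:
                return [self.runners[0](fetch_list)]
            futures = [self.pool.submit(r, fetch_list) for r in self.runners]
            return [f.result() for f in futures]
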
- if (num_trainers == 1 && nccl_id != nullptr) { + if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); -- GitLab From 97de98cd0a1240eacb573e8d117d0e4b928d82b0 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 7 Dec 2018 16:16:18 +0800 Subject: [PATCH 0071/2367] update bpr_loss op code, test=develop --- paddle/fluid/API.spec | 15 ++- paddle/fluid/operators/bpr_loss_op.cc | 35 ++++--- paddle/fluid/operators/bpr_loss_op.h | 92 +++++++------------ python/paddle/fluid/layers/nn.py | 2 +- .../fluid/tests/unittests/test_bpr_loss_op.py | 2 +- 5 files changed, 65 insertions(+), 81 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e273a852a95..9a90ad4e934 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -70,7 +77,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.batch_norm 
ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -176,7 +183,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -188,6 +195,9 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 
'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -292,6 +302,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index 3e6445dbc26..41f2969e6ca 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -23,19 +23,18 @@ class BprLossOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("Label_Pos"), - "Input(Label_Pos) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("LabelPos"), + "Input(LabelPos) should be not null."); PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto label_Pos_dims = ctx->GetInputDim("Label_Pos"); + auto label_Pos_dims = ctx->GetInputDim("LabelPos"); int rank = x_dims.size(); - PADDLE_ENFORCE_EQ( - rank, label_Pos_dims.size(), - "Input(X) and Input(Label_Pos) shall have the same rank."); + PADDLE_ENFORCE_EQ(rank, label_Pos_dims.size(), + "Input(X) and Input(LabelPos) shall have the same rank."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), framework::slice_ddim(label_Pos_dims, 0, rank - 1), - "Input(X) and Input(Label_Pos) shall have the same shape " + "Input(X) and Input(LabelPos) shall have the same shape " "except the last dimension."); auto y_dims = x_dims; @@ -61,25 +60,25 @@ class BprLossGradientOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should 
be not null."); - PADDLE_ENFORCE(ctx->HasInput("Label_Pos"), - "Input(Label_Pos) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("LabelPos"), + "Input(LabelPos) should be not null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "Input(Y@GRAD) shoudl be not null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@GRAD) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto label_pos_dims = ctx->GetInputDim("Label_Pos"); + auto label_pos_dims = ctx->GetInputDim("LabelPos"); auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); int rank = x_dims.size(); PADDLE_ENFORCE_EQ(dy_dims.size(), rank, "Input(Y@Grad) and Input(X) should have the same rank."); PADDLE_ENFORCE_EQ( label_pos_dims.size(), rank, - "Input(Label_Pos) and Input(X) should have the same rank."); + "Input(LabelPos) and Input(X) should have the same rank."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), framework::slice_ddim(label_pos_dims, 0, rank - 1), - "The Input(X) and Input(Label_Pos) should have the same " + "The Input(X) and Input(LabelPos) should have the same " "shape except the last dimension."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), framework::slice_ddim(dy_dims, 0, rank - 1), @@ -88,7 +87,7 @@ class BprLossGradientOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, "The last dimension of Input(Y@Grad) should be 1."); PADDLE_ENFORCE_EQ(label_pos_dims[rank - 1], 1, - " the last dimension of Input(Label_Pos) should be 1."); + " the last dimension of Input(LabelPos) should be 1."); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->ShareLoD("X", framework::GradVarName("X")); } @@ -112,7 +111,7 @@ class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { "size is equal to the number of classes. This input is a " "real number."); AddInput( - "Label_Pos", + "LabelPos", "(Tensor), the tensor which represents the ground truth. It has the " "same shape with 'X' except the last dimension. the last dimension " "size is 1."); @@ -121,14 +120,14 @@ class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { "with 'X' except that the last dimension size is 1. It " "represents the sequence bpr loss."); AddComment(R"DOC( -BprLoss Operator. +Bayesian Personalized Ranking Loss Operator. -This operator belongs to pairwise ranking loss. Label_pos is the desired item. -The loss at a given point in one seesion is defined as: +This operator belongs to pairwise ranking loss. LabelPos is the desired item. +The loss at a given point in one session is defined as: $Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$ Learn more details by reading paper . 
+neural networks>(https://arxiv.org/abs/1511.06939) )DOC"); } diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index 4103686de77..ea817bb2391 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -39,22 +39,22 @@ class BprLossOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); - auto* labels_Pos = ctx.Input("Label_Pos"); + auto* label_pos = ctx.Input("LabelPos"); auto* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); int rank = x->dims().size(); Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); - Tensor labels_Pos_2d = framework::ReshapeToMatrix(*labels_Pos, rank - 1); + Tensor labels_Pos_2d = framework::ReshapeToMatrix(*label_pos, rank - 1); Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); - const framework::Tensor* prob = &x_2d; + const framework::Tensor* logits = &x_2d; const framework::Tensor* labels_pos = &labels_Pos_2d; framework::Tensor* out = &y_2d; - const int step_size = prob->dims()[0]; - const int class_num = prob->dims()[1]; - const T* prob_data = prob->data(); + const int step_size = logits->dims()[0]; + const int class_num = logits->dims()[1]; + const T* logits_data = logits->data(); T* loss_data = out->data(); const int64_t* label_pos_data = labels_pos->data(); @@ -68,73 +68,47 @@ class BprLossOpKernel : public framework::OpKernel { if (j == lbl_pos) continue; int index_neg = i * class_num + j; sum += TolerableValue()(-std::log( - 1.0f + TolerableValue()( - std::exp(prob_data[index_neg] - prob_data[index_pos])))); + 1.0f + TolerableValue()(std::exp(logits_data[index_neg] - + logits_data[index_pos])))); } loss_data[i] = -sum / (class_num - 1); } } }; -template -class XeGradFunctor { - public: - XeGradFunctor(T* dx, - const T* dy, // NOLINT - const T* x, // NOLINT - const int64_t* label_pos, // NOLINT - size_t num_classes) - : dx_(dx), - dy_(dy), - x_(x), - label_pos_(label_pos), - num_classes_(num_classes) {} - - HOSTDEVICE void operator()(size_t sample_id) { - for (size_t x_offset = sample_id * num_classes_; - x_offset < (sample_id + 1) * num_classes_; ++x_offset) { - dx_[x_offset] = static_cast(0); - } - auto p_index = sample_id * num_classes_ + label_pos_[sample_id]; - for (size_t ni = 0; ni < num_classes_; ni++) { - if (label_pos_[sample_id] == ni) continue; - auto n_index = sample_id * num_classes_ + ni; - auto grad_ = - -dy_[sample_id] / - ((num_classes_ - 1) * - (1.0f + TolerableValue()(std::exp(x_[p_index] - x_[n_index])))); - dx_[p_index] += grad_; - dx_[n_index] -= grad_; - } - } - - private: - T* dx_; - const T* dy_; - const T* x_; - const int64_t* label_pos_; - size_t num_classes_; -}; - template class BprLossGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* label_pos = ctx.Input("Label_Pos"); + auto* label_pos = ctx.Input("LabelPos"); auto* dx = ctx.Output(framework::GradVarName("X")); - T* dx_data = dx->mutable_data(ctx.GetPlace()); - int rank = x->dims().size(); - int64_t class_num = x->dims()[rank - 1]; - XeGradFunctor functor(dx_data, dy->data(), x->data(), - label_pos->data(), - static_cast(class_num)); - platform::ForRange for_range( - ctx.template device_context(), - static_cast(dy->numel())); - for_range(functor); + const int step_size = x->dims()[0]; + const int num_classes_ = x->dims()[1]; + T* dx_ = 
dx->mutable_data(ctx.GetPlace()); + const T* dy_ = dy->data(); + const T* x_ = x->data(); + const int64_t* label_pos_ = label_pos->data(); + + for (size_t sample_id = 0; sample_id < step_size; sample_id++) { + for (size_t x_offset = sample_id * num_classes_; + x_offset < (sample_id + 1) * num_classes_; x_offset++) { + dx_[x_offset] = static_cast(0); + } + auto p_index = sample_id * num_classes_ + label_pos_[sample_id]; + for (size_t ni = 0; ni < num_classes_; ni++) { + if (label_pos_[sample_id] == ni) continue; + auto n_index = sample_id * num_classes_ + ni; + auto grad_ = + -dy_[sample_id] / + ((num_classes_ - 1) * + (1.0f + TolerableValue()(std::exp(x_[p_index] - x_[n_index])))); + dx_[p_index] += grad_; + dx_[n_index] -= grad_; + } + } } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 06d7e429ae2..3ba1883999d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1356,7 +1356,7 @@ def bpr_loss(input, label_pos): helper.append_op( type='bpr_loss', inputs={'X': [input], - 'Label_Pos': [label_pos]}, + 'LabelPos': [label_pos]}, outputs={'Y': [out]}) return out diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py index 2af6461aedb..d137f4a6fbe 100644 --- a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py @@ -39,7 +39,7 @@ class TestBprLossOp1(OpTest): sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label_pos[i][0]]))) bpr_loss_result.append(-sum / (class_num - 1)) bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64") - self.inputs = {"X": X, "Label_Pos": label_pos} + self.inputs = {"X": X, "LabelPos": label_pos} self.outputs = {"Y": bpr_loss} def test_check_output(self): -- GitLab From a3381dc740e0e52139cfb9fc36b467cbf108ed64 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 7 Dec 2018 16:29:52 +0800 Subject: [PATCH 0072/2367] update question for api ,test=develop --- python/paddle/fluid/tests/unittests/test_bpr_loss_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py index d137f4a6fbe..80916f4a828 100644 --- a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py @@ -25,7 +25,7 @@ class TestBprLossOp1(OpTest): def setUp(self): self.op_type = "bpr_loss" - batch_size = 4 + batch_size = 40 class_num = 5 X = randomize_probability(batch_size, class_num, dtype='float64') label_pos = np.random.randint( -- GitLab From ab084b6e6caf02439cf2e3147876da76ab1bb604 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 7 Dec 2018 16:36:10 +0800 Subject: [PATCH 0073/2367] Polish code test=develop --- python/paddle/dataset/image.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 645d54a2601..57547f1867a 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -34,14 +34,15 @@ from __future__ import print_function import six import numpy as np -# NOTE(minqiyang): this is an ugly fix for the numpy bug reported here +# FIXME(minqiyang): this is an ugly fix for the numpy bug reported here # https://github.com/numpy/numpy/issues/12497 if six.PY3: import subprocess import sys - import_cv2_proc = subprocess.Popen([sys.executable, "-c", "import cv2"], - 
stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + import_cv2_proc = subprocess.Popen( + [sys.executable, "-c", "import cv2"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) out, err = import_cv2_proc.communicate() retcode = import_cv2_proc.poll() if retcode != 0: -- GitLab From 220db4f334a06bda1b9967740d9fd96806fc461b Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 16:38:08 +0800 Subject: [PATCH 0074/2367] clean code --- .../fluid/framework/details/build_strategy.cc | 1 - .../details/multi_devices_graph_pass.cc | 3 -- paddle/fluid/framework/parallel_executor.h | 2 - paddle/fluid/framework/scope.cc | 5 +-- paddle/fluid/framework/threadpool.cc | 15 ++----- paddle/fluid/framework/threadpool.h | 2 +- paddle/fluid/framework/threadpool_test.cc | 44 ------------------- .../fluid/operators/reader/blocking_queue.h | 3 -- .../fluid/operators/reader/buffered_reader.cc | 3 -- .../reader/create_double_buffer_reader_op.cc | 13 +----- 10 files changed, 7 insertions(+), 84 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 04c1061536b..1e1b945f63c 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -118,7 +118,6 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - VLOG(5) << "run pass: " << pass->Type(); if (pass->Type() == "multi_devices_pass") { pass->Erase("places"); pass->SetNotOwned>("places", &places); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 1bd238357a7..c16e3006d76 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -329,7 +329,6 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unordered_map sharded_var_device; for (ir::Node *node : sorted_ops) { - VLOG(5) << "op name: " << node->Op()->Type(); if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { @@ -366,11 +365,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // is true only for the op that scale the final scalar loss. // It also assumes backward op will always follow the forward op in // the block. - VLOG(5) << "this is loss scale op!"; is_forwarding = false; } else { int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); - VLOG(5) << "on device id: " << op_dev_id; if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 319701f1eb8..ef09b98b2aa 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -20,8 +20,6 @@ limitations under the License. 
*/ #include #include -#include "ThreadPool.h" - #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/executor.h" diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 873f68e42e4..0d261dd7ccc 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -58,10 +58,7 @@ int64_t GetEagerDeletionThreshold() { (static_cast(1) << 30)); } -Scope::~Scope() { - VLOG(5) << "~Scope()"; - DropKids(); -} +Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { SCOPE_LOCK_GUARD diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 7dc7430c55b..d34f826c1ab 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -48,18 +48,9 @@ void ThreadPool::Init() { ThreadPool::ThreadPool(int num_threads) : running_(true) { threads_.resize(num_threads); - for (int i = 0; i < num_threads; ++i) { - // for (auto& thread : threads_) { + for (auto& thread : threads_) { // TODO(Yancey1989): binding the thread on the specify CPU number - threads_[i].reset( - new std::thread(std::bind(&ThreadPool::TaskLoop, this, i))); - /** - sched_param sch; - int policy; - pthread_getschedparam(threads_[i]->native_handle(), &policy, &sch); - if (pthread_setschedparam(threads_[i]->native_handle(), SCHED_FIFO, &sch)) { - VLOG(1) << "Failed to setschedparam: " << errno; - }**/ + thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); } } @@ -77,7 +68,7 @@ ThreadPool::~ThreadPool() { } } -void ThreadPool::TaskLoop(int i) { +void ThreadPool::TaskLoop() { while (true) { Task task; diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index bd8c3cdee8c..5177b7ee029 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -99,7 +99,7 @@ class ThreadPool { // The constructor starts threads to run TaskLoop, which retrieves // and runs tasks from the queue. - void TaskLoop(int i); + void TaskLoop(); // Init is called by GetInstance. 
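  // The thread-index parameter and the commented-out sched_param
  // experiment are gone: workers no longer attempt CPU affinity, and only
  // the TODO about binding threads to cores stays in the constructor.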
static void Init(); diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 1257a76e3e6..884d61e2342 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -59,47 +59,3 @@ TEST(ThreadPool, ConcurrentRun) { } EXPECT_EQ(sum, ((n + 1) * n) / 2); } -static int64_t GetTS() { - struct timeval tp; - gettimeofday(&tp, NULL); - return tp.tv_sec * 1000000 + tp.tv_usec; -} - -void multi_call(std::function call) { - for (int i = 0; i < 500; ++i) { - call(); - } -} - -TEST(ThreadPool, PERFORMANCE) { - auto sum = [] { - int a = 0; - for (int i = 0; i < 1000; ++i) { - a += i; - } - }; - // framework::ThreadPool *pool = new framework::ThreadPool(2); - int64_t start = GetTS(); - for (int i = 0; i < 1000; ++i) { - // int64_t s = GetTS(); - framework::Async(std::move(sum)); - // pool->Run(std::move(sum)); - // VLOG(5) << "push to pool spent : " << GetTS() - s << " (us)."; - } - VLOG(5) << "pool spent: " << GetTS() - start << " (us)."; - start = GetTS(); - for (int i = 0; i < 1000; ++i) { - sum(); - } - VLOG(5) << "sequence call spent: " << GetTS() - start << " (us)."; - std::vector threads; - start = GetTS(); - for (int i = 0; i < 2; ++i) { - std::thread t(multi_call, std::ref(sum)); - threads.push_back(std::move(t)); - } - for (auto& thread : threads) { - thread.join(); - } - VLOG(5) << "two threads spent: " << GetTS() - start << " (us)."; -} diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 10de11bfa5c..51b980acb5a 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -67,12 +67,9 @@ class BlockingQueue { } bool Receive(T* elem) { - VLOG(1) << "blocking queue::Receive ..."; std::unique_lock lock(mutex_); receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; }); - VLOG(1) << "queue_.empty()=" << queue_.empty(); if (!queue_.empty()) { - if (elem == nullptr) VLOG(1) << "elem is nullptr"; PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); if (LIKELY(!speed_test_mode_)) { diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 2d66000f1f8..cfa192f8e17 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -82,13 +82,11 @@ void BufferedReader::StartImpl() { } void BufferedReader::ReadNextImpl(std::vector *out) { - VLOG(1) << "ReadNextImpl start on place: " << place_; if (position_.empty()) { out->clear(); return; } size_t i = position_.front().get(); - VLOG(1) << "position front: " << i; position_.pop(); if (i == -1UL) { @@ -105,7 +103,6 @@ void BufferedReader::ReadNextImpl(std::vector *out) { ReadAsync(prev_pos_); } prev_pos_ = i; - VLOG(1) << "success ReadNextImpl"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 924c92e0bfd..954fec0fbcf 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -25,15 +25,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - VLOG(1) << "find var in scope: " << &scope; - auto* out_var = scope.FindVar(Output("Out")); - VLOG(1) << "var " << Output("Out") << " -> " << out_var; - 
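// This commit is pure cleanup: the VLOG(1) tracing sprinkled through
// blocking_queue.h, buffered_reader.cc and this reader op while
// debugging is stripped out without changing the data path.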
auto* out = out_var->GetMutable(); - - // auto* out = scope.Var(Output("Out")) - // ->template GetMutable(); + auto* out = scope.Var(Output("Out")) + ->template GetMutable(); if (out->Get() != nullptr) { - VLOG(1) << Output("Out") << " is not nullptr."; return; } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) @@ -52,11 +46,8 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { sin >> num; place = platform::CUDAPlace(static_cast(num)); } - VLOG(1) << "create buffered reader on " << place; out->Reset(framework::MakeDecoratedReader(underlying_reader, place, 2)); - VLOG(1) << "Reset Buffered Reader in var: " - << scope.FindVar(Input("UnderlyingReader")); } }; -- GitLab From 73edf1376758b753ca7226cc22c442ef2f6c575d Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 16:41:53 +0800 Subject: [PATCH 0075/2367] update --- paddle/fluid/operators/reader/create_double_buffer_reader_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 954fec0fbcf..440b16cf915 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -25,7 +25,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - auto* out = scope.Var(Output("Out")) + auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); if (out->Get() != nullptr) { return; -- GitLab From 47740ace289721e61489f6b2b5c196f26250aa3f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 17:18:45 +0800 Subject: [PATCH 0076/2367] fix performance --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index ae203387462..6b7bbf9003a 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -107,6 +107,7 @@ void AllReduceOpHandle::RunImpl() { PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, comm, stream)); + if (!nccl_ctxs_->need_group_call_) cudaStreamSynchronize(stream); }); } -- GitLab From 527946df490df1ad80152ffdc973178b9ae308f6 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 7 Dec 2018 18:08:29 +0800 Subject: [PATCH 0077/2367] add scope in prefetch --- .../distributed/parameter_prefetch.cc | 19 +++++++------- paddle/fluid/operators/lookup_table_op.h | 3 ++- paddle/fluid/operators/nce_op.h | 25 ++++++++++++++----- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index cf14538b1c2..67b56bd2180 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -102,8 +102,9 @@ static void MergeMultipleVarsIntoOneBySection( const std::string& out_name, const std::vector& out_var_names, const std::vector& height_section, const std::vector>& splited_ids, - const framework::ExecutionContext& context, framework::Scope* scope, - platform::DeviceContext* actual_ctx) { + const framework::ExecutionContext& context, + const framework::Scope& actual_scope, 
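    // actual_scope owns the caller-visible Ids/Out variables, while scope
    // is the temporary local scope holding the splitted RPC outputs that
    // get merged back below.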
framework::Scope* scope,
+    platform::DeviceContext* actual_ctx) {
   PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), "");
 
   auto cpu_place = platform::CPUPlace();
@@ -114,9 +115,9 @@ static void MergeMultipleVarsIntoOneBySection(
     id_to_offset[ids_vector[i]].push_back(i);
   }
 
-  auto& id_tensor = scope->FindVar(id_name)->Get();
+  auto& id_tensor = actual_scope.FindVar(id_name)->Get();
   auto* out_tensor =
-      scope->FindVar(out_name)->GetMutable();
+      actual_scope.FindVar(out_name)->GetMutable();
   auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place());
 
   bool is_on_cpu_place = true;
@@ -172,8 +173,9 @@ void prefetch(const std::string& id_name, const std::string& out_name,
               const std::vector& table_names,
               const std::vector& epmap,
               const std::vector& height_sections,
-              const framework::ExecutionContext& context) {
-  auto& local_scope = context.scope().NewScope();
+              const framework::ExecutionContext& context,
+              const framework::Scope& scope) {
+  auto& local_scope = scope.NewScope();
 
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& cpu_ctx = *pool.Get(platform::CPUPlace());
@@ -245,9 +247,8 @@ void prefetch(const std::string& id_name, const std::string& out_name,
 
   MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
                                     out_var_names, height_sections, splited_ids,
-                                    context, &local_scope, &actual_ctx);
-
-  context.scope().DeleteScope(&local_scope);
+                                    context, scope, &local_scope, &actual_ctx);
+  scope.DeleteScope(&local_scope);
 }
 
 };  // namespace distributed
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 3a73a7637c6..a7d0fd4856e 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -59,7 +59,8 @@ class LookupTableKernel : public framework::OpKernel {
       // server
 #ifdef PADDLE_WITH_DISTRIBUTE
       operators::distributed::prefetch(id_name, out_name, table_names, epmap,
-                                       height_sections, context);
+                                       height_sections, context,
+                                       context.scope());
 #else
       PADDLE_THROW(
           "paddle is not compiled with distribute support, can not do "
diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
index 6567b6534a4..9789e303889 100644
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -170,18 +170,31 @@ class NCEKernel : public framework::OpKernel {
         auto height_sections = context.Attr>("height_sections");
         auto table_names = context.Attr>("table_names");
 
-        local_scope.Var("Ids");
-        local_scope.Var("Weight");
+        auto *ids = local_scope.Var("Ids");
+        auto *x_tensor = ids->GetMutable();
+        x_tensor->mutable_data(
+            framework::make_ddim({static_cast(labels.size()), 1}),
+            context.GetPlace());
+        // copy.
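+        // The sampled label ids are copied into the local "Ids" tensor so
+        // that prefetch can pull the matching rows of the remote weight
+        // table into "Weight@Local" below.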
+ std::memcpy(x_tensor->data(), labels.data(), + labels.size() * sizeof(int64_t)); + + local_scope.Var("Weight@Local") + ->GetMutable() + ->mutable_data(context.GetPlace()); #ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch("Ids", "Weight", table_names, epmap, - height_sections, context); + operators::distributed::prefetch("Ids", "Weight@Local", table_names, + epmap, height_sections, context, + &local_scope); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " "parameter prefetch!"); +#endif - auto weight_mat = EigenMatrix::From(*(weight->Get())); + auto weight_mat = EigenMatrix::From( + (local_scope.Var("Weight@Local")->Get())); for (int64_t i = 0; i < sample_labels->numel(); ++i) { std::vector::iterator it = std::find(labels.begin(), labels.end(), sample_labels_data[i]); @@ -196,7 +209,7 @@ class NCEKernel : public framework::OpKernel { } context.scope().DeleteScope(&local_scope); -#endif + } else { auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); -- GitLab From eb8252466b11bdbea7abca6fd4cc5816f1c30830 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 09:15:23 +0000 Subject: [PATCH 0078/2367] polish code add unittest model containing while_op remove unnecessary codes test=develop --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/details/CMakeLists.txt | 5 +- .../details/eager_deletion_op_handle.cc | 48 +++--- .../details/eager_deletion_op_handle.h | 8 +- .../framework/details/eager_deletion_pass.cc | 18 +- .../fluid/framework/details/op_graph_view.cc | 1 + .../framework/details/reference_count_pass.cc | 156 +++++++++++------- .../details/reference_count_pass_helper.cc | 21 +++ .../details/reference_count_pass_helper.h | 4 +- .../scope_buffered_ssa_graph_executor.cc | 21 +-- .../scope_buffered_ssa_graph_executor.h | 6 - paddle/fluid/framework/executor.cc | 56 ++++--- paddle/fluid/framework/garbage_collector.cc | 89 ++++++++++ paddle/fluid/framework/garbage_collector.h | 153 ++++++----------- paddle/fluid/framework/parallel_executor.cc | 28 ++-- paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/tensor.h | 4 + .../unittests/test_eager_deletion_gru_net.py | 49 ++++++ .../unittests/test_eager_deletion_lstm_net.py | 111 +++++++++++++ 19 files changed, 516 insertions(+), 268 deletions(-) create mode 100644 paddle/fluid/framework/details/reference_count_pass_helper.cc create mode 100644 paddle/fluid/framework/garbage_collector.cc create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c701a2ad630..f2361c5ceaa 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -72,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) +cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory) + cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) @@ -164,7 +166,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass 
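# Both executor variants in this hunk now also link the standalone
# garbage_collector library declared near the top of this file.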
variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper garbage_collector) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8049f5d3f77..a6c8ef408a8 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -33,9 +33,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) +cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) -cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 54715fed8d9..3b27415e431 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -26,8 +26,8 @@ namespace details { EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, const Scope *scope, const platform::Place &place, - const std::unordered_set &var_names, - GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts) + const std::unordered_set &var_names, GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts) : OpHandleBase(node), scope_(scope), var_names_(var_names), @@ -35,9 +35,9 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( ref_cnts_(ref_cnts) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { - dev_ctx_ = static_cast( + dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); - if (dynamic_cast *>(gc_)) { + if (dynamic_cast(gc_)) { platform::CUDADeviceGuard guard( boost::get(place).device); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -61,10 +61,11 @@ std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { auto *exec_scope = 
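  // RunImpl() below is reference-count driven. A rough sketch per variable:
  //   if (ref_cnts_->at(name).fetch_sub(1) == 1)   // last user just ran
  //     garbages.push_back(tensor->MoveMemory());  // steal the allocation
  //   gc_->Add(std::move(garbages));               // freed once the threshold is hit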
scope_->FindVar(kLocalExecScopeName)->Get(); - std::vector tensors; + std::deque> garbages; for (auto &name : var_names_) { auto it = ref_cnts_->find(name); - if (it == ref_cnts_->end()) { + // Var not found, not reference count has not decreased to 0 + if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) { continue; } @@ -73,43 +74,44 @@ void EagerDeletionOpHandle::RunImpl() { continue; } + VLOG(2) << "Erase variable " << name; + if (var->IsType()) { - if (it->second.fetch_sub(1) == 1) { - tensors.emplace_back(var->GetMutable()); - } + garbages.emplace_back(var->GetMutable()->MoveMemory()); } else if (var->IsType()) { - if (it->second.fetch_sub(1) == 1) { - tensors.emplace_back(var->GetMutable()->mutable_value()); - } + garbages.emplace_back( + var->GetMutable()->mutable_value()->MoveMemory()); } else if (var->IsType()) { - if (it->second.fetch_sub(1) == 1) { - auto *tensor_arr = var->GetMutable(); - for (auto &t : *tensor_arr) { - tensors.emplace_back(&t); - } + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + garbages.emplace_back(t.MoveMemory()); } + } else { + PADDLE_THROW("Type %s of %s is not supported eager deletion", + var->Type().name(), name); } } - if (!tensors.empty()) { - ClearTensors(tensors); + if (!garbages.empty()) { + ClearGarbages(&garbages); } } -void EagerDeletionOpHandle::ClearTensors(const std::vector &tensors) { +void EagerDeletionOpHandle::ClearGarbages( + std::deque> *garbages) { #ifdef PADDLE_WITH_CUDA if (event_) { auto compute_stream = dev_ctx_->stream(); auto callback_stream = - static_cast *>(gc_)->stream(); + reinterpret_cast(gc_)->stream(); auto callback_func = [=]() { PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); }; - gc_->Add(tensors, callback_func); + gc_->Add(std::move(*garbages), callback_func); } else { #endif - gc_->Add(tensors); + gc_->Add(std::move(*garbages)); #ifdef PADDLE_WITH_CUDA } #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index d8de59cc4de..64867afad5b 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -14,8 +14,8 @@ #pragma once +#include #include -#include #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" @@ -30,7 +30,7 @@ class EagerDeletionOpHandle : public OpHandleBase { EagerDeletionOpHandle(ir::Node *node, const Scope *scope, const platform::Place &place, const std::unordered_set &var_names, - GarbageCollector *gc, + GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts); ~EagerDeletionOpHandle(); @@ -41,11 +41,11 @@ class EagerDeletionOpHandle : public OpHandleBase { void RunImpl() override; private: - void ClearTensors(const std::vector &tensors); + void ClearGarbages(std::deque> *garbages); const Scope *scope_; std::unordered_set var_names_; - GarbageCollector *gc_; // not own + GarbageCollector *gc_; // not own AtomicReferenceCountMap *ref_cnts_; // not own #ifdef PADDLE_WITH_CUDA platform::CUDADeviceContext *dev_ctx_{nullptr}; diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 85991c71e65..4e42d0b4972 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -28,17 +28,21 @@ namespace details { std::unique_ptr 
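// ApplyImpl first inverts last_live_ops_of_vars into op_vars_map (last
// op -> names it may delete), then appends one EagerDeletionOpHandle per
// such op, wired to that scope's garbage collector and runtime
// reference counts.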
EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { - const auto &vars = graph->Get(kGraphVars); - auto &ref_cnts = Get>(kRuntimeReferenceCount); + PADDLE_ENFORCE(ref_cnts.empty(), + "kRuntimeReferenceCount should be initialized here!"); + + const auto &vars = graph->Get(kGraphVars); + ref_cnts.resize(vars.size()); + const auto &last_live_ops = Get>(kLastLiveOpsOfVars); - auto &gcs = Get(kGarbageCollector); + const auto &gcs = Get(kGarbageCollector); const auto &places = Get>(kAllPlaces); - ref_cnts = std::vector(vars.size()); - + // a reverse map of last_live_ops + // i.e., last op --> variable names which can be deleted. std::unordered_map> op_vars_map; @@ -58,8 +62,8 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( auto *eager_deletion_node = graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); auto *eager_deletion_op = new EagerDeletionOpHandle( - eager_deletion_node, op->GetScope(), op->GetPlace(), - std::move(var_names), gcs.at(places[op->GetScopeIdx()]).get(), + eager_deletion_node, op->GetScope(), op->GetPlace(), var_names, + gcs.at(places[op->GetScopeIdx()]).get(), &(ref_cnts[op->GetScopeIdx()])); auto it = std::find_if( diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index b6b5ad42c46..d3865c2c291 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -42,6 +42,7 @@ void OpGraphView::Build(const std::vector &ops) { std::unordered_set OpGraphView::AllOps() const { std::unordered_set ret; + ret.reserve(preceding_ops_.size()); for (auto &pair : preceding_ops_) { ret.insert(pair.first); } diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index f2c9dfb5248..13a042d8e6e 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,15 +29,17 @@ namespace paddle { namespace framework { namespace details { -class OpRelationDetector { - public: +// A functor to shrink/remove operators who depend on other operators in a set +class ShrinkDepsOpFunctor { + private: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; - explicit OpRelationDetector(const std::vector &all_ops) + public: + explicit ShrinkDepsOpFunctor(const std::vector &all_ops) : graph_(all_ops) {} template - OpSet MaxNoDepOps(const OpSet &op_set) const { + OpSet operator()(const OpSet &op_set) const { using KeyType = typename OpSet::key_type; static_assert( std::is_base_of(ops[i])); + ret.emplace(static_cast(ops[i])); } } return ret; @@ -59,7 +61,7 @@ class OpRelationDetector { private: std::vector> GetRelations( - const std::vector ops) const { + const std::vector &ops) const { std::unordered_map op_to_idx; for (size_t i = 0; i < ops.size(); ++i) { PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); @@ -112,6 +114,10 @@ class OpRelationDetector { const OpGraphView graph_; }; +/** + * Find the nearest downstream computation op handle. If the op is a + * computation op, just return itself. + */ static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( OpHandleBase *op, size_t scope_idx) { std::queue q; @@ -134,33 +140,87 @@ static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( return nullptr; } +static std::unordered_set +ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, + const ShrinkDepsOpFunctor &shrink_func, + bool *ok) { + // stage one. 
Get last op for variable. + std::unordered_set candidates; + { + if (var->PendingOps().empty() && var->GeneratedOp()) { + // No operator depends on this variable. So the last operator is the op + // who generates this variable. + candidates.emplace(var->GeneratedOp()); + } else { + candidates = var->PendingOps(); + } + + // No pending ops or generated op is nullptr + if (candidates.empty()) { + *ok = false; + return {}; + } + } + + // stage two. Try to cast them to computation op. + // return (*ok=false) when failed. + // + // The reason why we cannot make any types of op handle to be the last lived + // op is: + // some op handle may operate on many DeviceContext, however, our garbage + // collector can only wait one DeviceContext for now. So currently, we wait + // the nearest compute op. + std::unordered_set computation_op; + { + for (auto *op : candidates) { + auto *compute_op = + FindNextComputationOpHandleOrReturnItself(op, scope_idx); + if (compute_op == nullptr) { + *ok = false; + return {}; + } + computation_op.emplace(compute_op); + } + } + + // stage three. Try to shrink computation op if they depend on each other. + // Get the smallest set of the most ops. + *ok = true; + return shrink_func(computation_op); +} + +static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &vars = graph->Get(kGraphVars); auto &ref_cnts = Get>(kGlobalReferenceCount); auto &last_live_ops_of_vars = Get>(kLastLiveOpsOfVars); - last_live_ops_of_vars = std::vector(vars.size()); - ref_cnts = std::vector(vars.size()); + PADDLE_ENFORCE(last_live_ops_of_vars.empty() && ref_cnts.empty(), + "Last Live Ops and Reference Counts of vars should be " + "initialized at here."); - OpRelationDetector detector(ir::FilterByNodeWrapper(*graph)); + const auto &vars = graph->Get(kGraphVars); - for (size_t i = 0; i < vars.size(); ++i) { - for (auto &name_var_pair : vars[i]) { - if (name_var_pair.second.empty()) { - continue; - } + last_live_ops_of_vars.resize(vars.size()); + ref_cnts.resize(vars.size()); - const std::string &var_name = name_var_pair.first; - auto *last_ver_var = name_var_pair.second.back(); + ShrinkDepsOpFunctor shrink_func( + ir::FilterByNodeWrapper(*graph)); - VarDesc *var_desc = nullptr; - std::find_if(name_var_pair.second.rbegin(), name_var_pair.second.rend(), - [&](VarHandle *var_handle) -> bool { - var_desc = var_handle->Node()->Var(); - return var_desc != nullptr; - }); + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + // Whether this variable can be reused or deleted? If not, we do not + // compute reference counts and dependencies. 
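      // Variables without a VarDesc, persistable variables, and types other
      // than LOD_TENSOR / SELECTED_ROWS / LOD_TENSOR_ARRAY are skipped here,
      // so they are never considered for eager deletion.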
+ VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second); if (var_desc == nullptr || var_desc->Persistable()) { continue; @@ -170,50 +230,20 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( if (var_type != proto::VarType::LOD_TENSOR && var_type != proto::VarType::SELECTED_ROWS && var_type != proto::VarType::LOD_TENSOR_ARRAY) { + // Var type cannot be deleted continue; } - std::unordered_set last_live_op; - auto add_last_live_op = [&](OpHandleBase *op) -> bool { - auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); - if (compute_op) { - last_live_op.insert(compute_op); - return true; - } else { - return false; - } - }; - - bool can_delete = false; - auto &pending_ops = last_ver_var->PendingOps(); - if (pending_ops.empty()) { - auto *generated_op = last_ver_var->GeneratedOp(); - if (generated_op && add_last_live_op(generated_op)) { - can_delete = true; - } - } else { - can_delete = true; - for (auto *pending_op : pending_ops) { - if (!add_last_live_op(pending_op)) { - can_delete = false; - break; - } - } - } - - if (can_delete) { - size_t original_size = last_live_op.size(); - last_live_op = detector.MaxNoDepOps(last_live_op); - if (last_live_op.size() != original_size) { - VLOG(10) << "Shrink last living op number of " << var_name << " from " - << original_size << " to " << last_live_op.size(); - } - - PADDLE_ENFORCE(!last_live_op.empty(), - "Last living ops of %s cannot be empty", var_name); + bool ok; + auto result = ExtractComputationOpFromLastLivedVar( + name_var_pair.second.back(), i, shrink_func, &ok); - ref_cnts[i].emplace(var_name, last_live_op.size()); - last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + if (ok) { + auto &var_name = name_var_pair.first; + PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", + var_name); + ref_cnts[i].emplace(var_name, result.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(result)); } } } diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc new file mode 100644 index 00000000000..89bd08c2d04 --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
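// The translation unit is left (almost) empty, presumably so the new
// reference_count_pass_helper cc_library in details/CMakeLists.txt has a
// source file to carry its garbage_collector dependency.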
+ +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +namespace details {} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index eb534f97015..1c083dbf001 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -18,10 +18,10 @@ #include #include #include +#include #include #include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/tensor.h" namespace paddle { namespace framework { @@ -35,7 +35,7 @@ using AtomicReferenceCountMap = std::unordered_map>; using GarbageCollectorMap = - std::map>>; + std::map>; const char kGlobalReferenceCount[] = "global_reference_count"; const char kRuntimeReferenceCount[] = "runtime_reference_count"; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index b8775fc3291..57f6fc66c57 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -30,20 +30,7 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), var_infos_(std::move(var_infos)), - places_(std::move(places)) { - if (Graph().Has(details::kGarbageCollector)) { - gc_ = &(Graph().Get(details::kGarbageCollector)); - } -} - -void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { - if (gc_) { - for (auto &gc_pair : *gc_) { - gc_pair.second->Wait(); - gc_pair.second->Reset(); - } - } -} + places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { @@ -83,19 +70,15 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (auto &p : places_) { + for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - WaitAllGarbageCollectors(); for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } - } else { - WaitAllGarbageCollectors(); } - if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 6086a219e04..5e87e0bf50b 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -21,11 +21,9 @@ #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" - namespace paddle { namespace framework { namespace details { @@ -50,8 +48,6 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; private: - void WaitAllGarbageCollectors(); - size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; @@ -59,8 
+55,6 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector var_infos_; std::vector places_; - - GarbageCollectorMap* gc_{nullptr}; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 04425a59830..767bbb524f4 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/executor.h" +#include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" @@ -83,31 +84,37 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { } static void DeleteUnusedTensors( - const Scope& scope, const OperatorBase* op, GarbageCollector* gc, + const Scope& scope, const OperatorBase* op, GarbageCollector* gc, std::unordered_map* ref_cnts) { - std::unordered_set erase_tensors; + std::deque> garbages; auto handler = [&](const VariableNameMap& name_map) { for (auto& name_pair : name_map) { for (auto& name : name_pair.second) { auto it = ref_cnts->find(name); if (it == ref_cnts->end()) continue; - if (--(it->second) == 0) { - auto* var = scope.FindVar(name); - if (var != nullptr) { - VLOG(2) << "Erase tensor \'" << name << "\'"; - if (var->IsType()) { - erase_tensors.insert(var->GetMutable()); - } else if (var->IsType()) { - erase_tensors.insert( - var->GetMutable()->mutable_value()); - } else if (var->IsType()) { - auto* lod_tensor_arr = var->GetMutable(); - for (auto& t : *lod_tensor_arr) { - erase_tensors.insert(&t); - } - } + if (--(it->second) != 0) { + continue; + } + auto* var = scope.FindVar(name); + if (var != nullptr) { + continue; + } + + VLOG(2) << "Erase variable " << name; + if (var->IsType()) { + garbages.emplace_back(var->GetMutable()->MoveMemory()); + } else if (var->IsType()) { + garbages.emplace_back( + var->GetMutable()->mutable_value()->MoveMemory()); + } else if (var->IsType()) { + auto* lod_tensor_arr = var->GetMutable(); + for (auto& t : *lod_tensor_arr) { + garbages.emplace_back(t.MoveMemory()); } + } else { + PADDLE_THROW("Type %s of %s is not supported eager deletion", + var->Type().name(), name); } } } @@ -116,8 +123,8 @@ static void DeleteUnusedTensors( handler(op->Inputs()); handler(op->Outputs()); - if (!erase_tensors.empty()) { - gc->Add(erase_tensors); + if (!garbages.empty()) { + gc->Add(std::move(garbages)); } } @@ -411,22 +418,22 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } int64_t max_memory_size = GetEagerDeletionThreshold(); - std::unique_ptr> gc; + std::unique_ptr gc; if (max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new UnsafeFastGPUGarbageCollector( + gc.reset(new UnsafeFastGPUGarbageCollector( boost::get(place_), max_memory_size)); } else { - gc.reset(new DefaultStreamGarbageCollector( + gc.reset(new DefaultStreamGarbageCollector( boost::get(place_), max_memory_size)); } } else if (platform::is_cpu_place(place_)) { #endif - gc.reset(new CPUGarbageCollector( - boost::get(place_), max_memory_size)); + gc.reset(new CPUGarbageCollector(boost::get(place_), + max_memory_size)); #ifdef PADDLE_WITH_CUDA } #endif @@ -442,7 +449,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } 
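  // The explicit gc->Wait() below is removed: the stream-based collectors
  // now synchronise through device-stream callbacks, so only the device
  // context Wait() is kept at the end of RunPreparedContext.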
platform::DeviceContextPool::Instance().Get(place_)->Wait(); - if (gc) gc->Wait(); if (local_scope != scope) { scope->DeleteScope(local_scope); diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc new file mode 100644 index 00000000000..54d9d0dc018 --- /dev/null +++ b/paddle/fluid/framework/garbage_collector.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif +#include "paddle/fluid/framework/garbage_collector.h" + +namespace paddle { +namespace framework { + +GarbageCollector::GarbageCollector(const platform::Place &place, + size_t max_memory_size) + : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { + garbages_.reset(new GarbageQueue()); + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); +} + +CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CPUGarbageCollector::ClearCallback(const std::function &callback) { + callback(); +} + +#ifdef PADDLE_WITH_CUDA +UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( + const platform::CUDAPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void UnsafeFastGPUGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +DefaultStreamGarbageCollector::DefaultStreamGarbageCollector( + const platform::CUDAPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void DefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void DefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} + +StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) { + platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + callback_manager_.reset(new platform::StreamCallbackManager(stream_)); +} + +StreamGarbageCollector::~StreamGarbageCollector() { + auto place = boost::get(this->dev_ctx_->GetPlace()); + platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +} + +cudaStream_t StreamGarbageCollector::stream() const { return stream_; } + +void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); } + +void StreamGarbageCollector::ClearCallback( + const std::function &callback) { + callback_manager_->AddCallback(callback); +} +#endif +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 1382e0d4618..2768671029c 100644 --- 
a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -14,160 +14,83 @@ #pragma once -#include #include #include #include #include // NOLINT -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_device_guard.h" -#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { namespace framework { -// T should have memory_size() and clear() method -template class GarbageCollector { public: - GarbageCollector(const platform::Place &place, size_t max_memory_size) - : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { - garbages_.reset(new std::deque()); - dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); - } + using GarbageQueue = std::deque>; - virtual ~GarbageCollector() {} + GarbageCollector(const platform::Place &place, size_t max_memory_size); - size_t NumOfGarbages() const { - std::lock_guard guard(mutex_); - return garbages_->size(); - } + virtual ~GarbageCollector() = default; - void Reset() { - std::lock_guard guard(mutex_); - garbages_.reset(new std::deque()); - cur_memory_size_ = 0; - } + virtual void Wait() const {} template - void Add(const Container &objs) { - Add(objs, []() {}); - } + void Add(Container &&objs); template - void Add(const Container &objs, Callback &&callback) { - std::deque *clear_deque = nullptr; - { - std::lock_guard guard(mutex_); - for (auto *obj : objs) { - garbages_->push_back(obj); - cur_memory_size_ += obj->memory_size(); - } - if (cur_memory_size_ >= max_memory_size_) { - cur_memory_size_ = 0; - clear_deque = garbages_.release(); - garbages_.reset(new std::deque()); - } - } - - if (clear_deque != nullptr) { - callback(); - ClearCallback([clear_deque]() { - for (auto *obj : *clear_deque) obj->clear(); - delete clear_deque; - }); - } - } - - virtual void Wait() const {} + void Add(Container &&objs, Callback &&callback); protected: virtual void ClearCallback(const std::function &callback) = 0; platform::DeviceContext *dev_ctx_; - std::unique_ptr> garbages_; + std::unique_ptr garbages_; mutable std::mutex mutex_; const size_t max_memory_size_; - size_t cur_memory_size_ = 0; + size_t cur_memory_size_{0}; }; -template -class CPUGarbageCollector : public GarbageCollector { +class CPUGarbageCollector : public GarbageCollector { public: - CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; #ifdef PADDLE_WITH_CUDA -template -class UnsafeFastGPUGarbageCollector : public GarbageCollector { +class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; -template -class DefaultStreamGarbageCollector : public GarbageCollector { +class DefaultStreamGarbageCollector : public GarbageCollector { public: DefaultStreamGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + size_t max_memory_size); - cudaStream_t stream() const { - return static_cast(this->dev_ctx_) - ->stream(); - } - - 
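  // All collector bodies move to garbage_collector.cc in this change;
  // Wait() on the stream-based collectors blocks until every free
  // scheduled through the callback manager has actually run.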
void Wait() const override { - static_cast(this->dev_ctx_) - ->WaitStreamCallback(); - } + void Wait() const override; protected: - void ClearCallback(const std::function &callback) override { - static_cast(this->dev_ctx_) - ->AddStreamCallback(callback); - } + void ClearCallback(const std::function &callback) override; }; -template -class StreamGarbageCollector : public GarbageCollector { +class StreamGarbageCollector : public GarbageCollector { public: StreamGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) { - platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE(cudaStreamCreate(&stream_)); - callback_manager_.reset(new platform::StreamCallbackManager(stream_)); - } + size_t max_memory_size); - ~StreamGarbageCollector() { - auto place = boost::get(this->dev_ctx_->GetPlace()); - platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE(cudaStreamDestroy(stream_)); - } + ~StreamGarbageCollector(); - void Wait() const override { callback_manager_->Wait(); } + void Wait() const override; - cudaStream_t stream() const { return stream_; } + cudaStream_t stream() const; protected: - void ClearCallback(const std::function &callback) override { - callback_manager_->AddCallback(callback); - } + void ClearCallback(const std::function &callback) override; private: cudaStream_t stream_; @@ -175,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector { }; #endif +template +void GarbageCollector::Add(Container &&objs) { + Add(std::forward(objs), []() {}); +} + +template +void GarbageCollector::Add(Container &&objs, Callback &&callback) { + GarbageQueue *garbage_queue = nullptr; + { + std::lock_guard guard(mutex_); + for (auto &obj : objs) { + if (!obj) continue; + cur_memory_size_ += obj->size(); + garbages_->push_back(std::move(obj)); + } + if (cur_memory_size_ >= max_memory_size_) { + cur_memory_size_ = 0; + garbage_queue = garbages_.release(); + garbages_.reset(new GarbageQueue()); + } + } + + if (garbage_queue) { + callback(); + ClearCallback([garbage_queue]() { delete garbage_queue; }); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e51b1f1f73e..7458b69af8c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,29 +97,31 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } - GarbageCollector *gc = nullptr; + std::unique_ptr gc; #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { - gc = new UnsafeFastGPUGarbageCollector( - boost::get(place), max_memory_size); + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size)); } else { - gc = new StreamGarbageCollector( - boost::get(place), max_memory_size); + gc.reset(new StreamGarbageCollector( + boost::get(place), max_memory_size)); } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; - } else if (platform::is_cpu_place(place)) { + } else { #endif - gc = new CPUGarbageCollector( - boost::get(place), max_memory_size); - VLOG(10) << "Created GarbageCollector at " << place; + if (platform::is_cpu_place(place)) { + gc.reset(new CPUGarbageCollector(boost::get(place), + max_memory_size)); + VLOG(10) << "Created GarbageCollector at " << place; + } else { + PADDLE_THROW("Unsupported place for 
garbage collection"); + } #ifdef PADDLE_WITH_CUDA } #endif - if (gc) { - gcs_[place] = std::unique_ptr>(gc); - } + gcs_.emplace(place, std::move(gc)); } if (!gcs_.empty()) { @@ -144,8 +146,6 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; - - graph->SetNotOwned(details::kGarbageCollector, &gcs_); } return graph; diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index cb3b6cdc3ee..6fa5e99f9f3 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,7 +38,7 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); -DEFINE_bool(fast_eager_deletion_mode, true, +DEFINE_bool(fast_eager_deletion_mode, false, "Fast eager deletion mode. If enabled, memory would release " "immediately without waiting GPU kernel ends."); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 71e8badd4b6..9f7027f5ae8 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -158,6 +158,10 @@ class Tensor { const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } + std::shared_ptr MoveMemory() { + return std::move(holder_); + } + private: /*! holds the memory block if allocated. */ std::shared_ptr holder_; diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py new file mode 100644 index 00000000000..1ec174544cb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py @@ -0,0 +1,49 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from test_eager_deletion_lstm_net import TestBase +import paddle.fluid as fluid + + +def gru_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=400.0): + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3) + gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) + gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') + gru_max_tanh = fluid.layers.tanh(gru_max) + fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + return avg_cost + + +class GRUTest(TestBase): + def setUp(self): + self.net = gru_net + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py new file mode 100644 index 00000000000..431765bff2d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py @@ -0,0 +1,111 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
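# The flags are exported before any paddle import below, since they are
# only picked up when the framework is first loaded; CPU_NUM=2
# additionally forces the multi-place code path in the ParallelExecutor
# runs.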
+ +import os +os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' +os.environ['CPU_NUM'] = '2' + +import six +import unittest + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + + +def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): + if use_cuda and not core.is_compiled_with_cuda(): + print('Skip use_cuda=True because Paddle is not compiled with cuda') + return + + word_dict = paddle.dataset.imdb.word_dict() + train_reader = paddle.batch( + paddle.dataset.imdb.train(word_dict), batch_size=batch_size) + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + cost = network(data, label, len(word_dict)) + optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) + optimizer.minimize(cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + reader = feeder.decorate_reader( + train_reader, multi_devices=use_parallel_executor) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if use_parallel_executor: + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=cost.name) + fetch_list = [cost.name] + else: + train_exe = exe + fetch_list = [cost] + + for pass_id in six.moves.xrange(pass_num): + batch_id = 0 + for data in reader(): + train_exe.run(feed=data, + fetch_list=fetch_list if batch_id % 4 == 0 else []) + batch_id += 1 + if batch_id > 16: + break + + +def lstm_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) + lstm_h, c = fluid.layers.dynamic_lstm( + input=fc0, size=hid_dim * 4, is_reverse=False) + lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') + lstm_max_tanh = fluid.layers.tanh(lstm_max) + fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + return avg_cost + + +class TestBase(unittest.TestCase): + def setUp(self): + self.net = lstm_net + + def test_network(self): + for use_cuda in [True, False]: + for use_parallel_executor in [False, True]: + print('network: {}, use_cuda: {}, use_parallel_executor: {}'. + format(self.net.__name__, use_cuda, + use_parallel_executor)) + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(core.Scope()): + train(self.net, use_cuda, use_parallel_executor) + + +if __name__ == "__main__": + unittest.main() -- GitLab From f1fb64b17fb0290e7e1f110069de19b0ea0d0474 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 7 Dec 2018 21:52:27 +0800 Subject: [PATCH 0079/2367] Add reduce sparse tensor feature. 
(#14757) --- paddle/fluid/framework/details/CMakeLists.txt | 16 +- .../fluid/framework/details/build_strategy.cc | 19 ++- .../fluid/framework/details/build_strategy.h | 2 + .../framework/details/reduce_and_gather.h | 2 +- .../framework/details/reduce_op_handle.cc | 144 +++++++++++++++++- .../framework/details/reduce_op_handle.h | 39 +++++ .../operators/distributed/CMakeLists.txt | 14 +- .../distributed/collective_client.cc | 59 +++++++ .../operators/distributed/collective_client.h | 93 +++++++++++ .../distributed/collective_server.cc | 74 +++++++++ .../operators/distributed/collective_server.h | 110 +++++++++++++ .../distributed/collective_server_test.cc | 115 ++++++++++++++ .../operators/distributed/grpc_client.cc | 59 ++++++- .../fluid/operators/distributed/grpc_client.h | 24 ++- .../operators/distributed/grpc_server.cc | 102 ++++++++++++- .../operators/distributed/grpc_service.h | 8 +- .../operators/distributed/request_handler.h | 2 + .../fluid/operators/distributed/rpc_client.h | 12 +- .../fluid/operators/distributed/rpc_server.cc | 90 +++++++++++ .../fluid/operators/distributed/rpc_server.h | 31 ++++ .../operators/distributed/send_recv.proto.in | 3 + paddle/fluid/operators/math/softmax_impl.h | 1 + paddle/fluid/pybind/pybind.cc | 12 ++ python/paddle/fluid/framework.py | 1 + python/paddle/fluid/parallel_executor.py | 8 + .../fluid/transpiler/distribute_transpiler.py | 1 + 26 files changed, 1013 insertions(+), 28 deletions(-) create mode 100644 paddle/fluid/operators/distributed/collective_client.cc create mode 100644 paddle/fluid/operators/distributed/collective_client.h create mode 100644 paddle/fluid/operators/distributed/collective_server.cc create mode 100644 paddle/fluid/operators/distributed/collective_server.h create mode 100644 paddle/fluid/operators/distributed/collective_server_test.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 93288936fea..2f76cb714fc 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -15,14 +15,26 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro if(WITH_GPU) nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) - nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda) + if(WITH_DISTRIBUTE) + nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim dynload_cuda selected_rows_functor sendrecvop_grpc) + else() + nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim dynload_cuda selected_rows_functor) + endif() nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) else() cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory variable_visitor) - cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim) + if(WITH_DISTRIBUTE) + cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim selected_rows_functor sendrecvop_grpc) + else() + cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim 
selected_rows_functor) + endif() cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 1e1b945f63c..d8526b3f249 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -58,6 +58,17 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } } + CollectiveContext *context = CollectiveContext::GetInstance(); + context->endpoints_ = strategy_.trainers_endpoints_; + context->trainer_id_ = strategy_.trainer_id_; + PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0"); + if (strategy_.trainer_id_ > 0) { + PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) < + strategy_.trainers_endpoints_.size(), + "trainer_id_ < endpoints_ size"); + } + VLOG(1) << "CollectiveContext:" << context->String(); + // Convert graph to run on multi-devices. auto multi_devices_pass = AppendPass("multi_devices_pass"); multi_devices_pass->SetNotOwned("strategy", @@ -135,16 +146,16 @@ std::unique_ptr BuildStrategy::Apply( pass->SetNotOwned("nccl_ctxs", nctx); #endif } else if (pass->Type() == "sequential_execution_pass") { - VLOG(1) << "set enable_sequential_execution:" - << enable_sequential_execution_; + LOG(INFO) << "set enable_sequential_execution:" + << enable_sequential_execution_; pass->Erase(kAllOpDescs); pass->Set>( kAllOpDescs, new std::vector(main_program.Block(0).AllOps())); } else if (pass->Type() == "all_reduce_deps_pass") { - VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) - << ", num_trainers:" << num_trainers_; + LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) + << ", num_trainers:" << num_trainers_; pass->Erase(kAllOpDescs); pass->Set>( diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 9f0a2591288..c97be169575 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -74,6 +74,8 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; int num_trainers_{1}; + int trainer_id_{0}; + std::vector trainers_endpoints_; bool remove_unnecessary_lock_{false}; // NOTE: diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index bd6153c0c73..2e5256fbd49 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -53,7 +53,7 @@ struct ReduceLoDTensor { } }; -inline void GatherSelectedRows( +inline void GatherLocalSelectedRows( const std::vector &src_selecte_rows_, const std::vector &in_places, const std::map &dev_ctxes, diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index c9f1107aeab..cb864848b93 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -16,6 +16,12 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/collective_client.h" +#include "paddle/fluid/operators/distributed/collective_server.h" 
+#include "paddle/fluid/operators/distributed/request_handler.h" +#endif +#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool( @@ -26,6 +32,112 @@ namespace paddle { namespace framework { namespace details { +std::once_flag CollectiveContext::init_flag_; +std::unique_ptr CollectiveContext::context_; + +static inline std::string GetRemoteVarName(const std::string &var_name, + int trainer_id) { + return string::Sprintf("%s_merged_tmp@trainer_%d", var_name, trainer_id); +} + +void ReduceOpHandle::Wait( + const std::map &dev_ctxes) { + // TODO(gongwb): use event wait? + for (auto &dev_ctx : dev_ctxes) { + dev_ctx.second->Wait(); + } +} + +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE +template +void ReduceOpHandle::GatherSelectedRows( + const std::vector &src_selected_rows, + const std::vector &in_places, + const std::map &dev_ctxes, + VarHandle *out_var_handle, const platform::Place &out_place, + SelectedRows *dst_selected_rows) { + const CollectiveContext &collective_context = + *CollectiveContext::GetInstance(); + + // 1. gather local selected rows, merge them + std::string gathered_var_name = out_var_handle->name_ + "_gathered_tmp"; + auto scope = local_scopes_.at(out_var_handle->scope_idx_); + auto gathered_var_mid = scope->Var(gathered_var_name); + auto gathered_select_rows = + gathered_var_mid->GetMutable(); + GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place, + gathered_select_rows); + // FIXME(gongwb): remove this Wait. + Wait(dev_ctxes); + + // merge them + auto merged_dev_ctx = dynamic_cast(dev_ctxes.at(out_place)); + std::string merged_var_name = + GetRemoteVarName(out_var_handle->name_, collective_context.trainer_id_); + auto merged_select_rows = + scope->Var(merged_var_name)->GetMutable(); + operators::math::scatter::MergeAdd merge_func; + merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows); + + // 2. start collective server if it doesn't exist + operators::distributed::CollectiveServer *server = + operators::distributed::CollectiveServer::GetInstance( + collective_context.endpoints_[collective_context.trainer_id_], + collective_context.endpoints_.size() - 1); + + auto rpc_server = server->GetRPCServer(); + rpc_server->RegisterVar(merged_var_name, + operators::distributed::kRequestGetMonomerVariable, + scope, merged_dev_ctx); + + // 3. gather them from all remote nodes. + std::vector remote; + operators::distributed::CollectiveClient *client = + operators::distributed::CollectiveClient::GetInstance(); + + std::vector vars; + for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) { + if (i == (unsigned)collective_context.trainer_id_) continue; + + operators::distributed::RemoteVar var; + var.trainer_id_ = i; + var.var_name_ = GetRemoteVarName(out_var_handle->name_, i); + var.ep_ = collective_context.endpoints_[i]; + + vars.push_back(var); + VLOG(4) << "gather from:" << var.String(); + } + + // erase gathered vars + merged_dev_ctx->Wait(); + scope->EraseVars(std::vector{gathered_var_name}); + + PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope)); + PADDLE_ENFORCE(remote.size() == vars.size()); + + // 4. merged local selected rows. 
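// A hedged standalone sketch (toy types, not this patch's code) of the
// semantics scatter::MergeAdd applies to SelectedRows: duplicate sparse
// row ids collapse into one row whose values are summed, which is why the
// gathered per-trainer rows can simply be handed to merge_func below.
// Real SelectedRows carry a value row per id; width-1 rows keep this short.
#include <cstdio>
#include <map>
#include <vector>

int main() {
  // rows[i] is the sparse row id owning values[i]; id 4 appears twice.
  std::vector<int> rows = {0, 4, 7, 4};
  std::vector<double> values = {1.0, 2.0, 3.0, 10.0};

  std::map<int, double> merged;  // MergeAdd-style output: unique, summed rows
  for (size_t i = 0; i < rows.size(); ++i) merged[rows[i]] += values[i];

  for (const auto& kv : merged)
    std::printf("row %d -> %.1f\n", kv.first, kv.second);
  // Prints: row 0 -> 1.0, row 4 -> 12.0, row 7 -> 3.0
  return 0;
}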
+  std::vector all;
+  all.resize(collective_context.endpoints_.size());
+  for (auto v : vars) {
+    all[v.trainer_id_] =
+        scope->FindVar(v.var_name_)->GetMutable();
+  }
+  all[collective_context.trainer_id_] = merged_select_rows;
+
+  merge_func(*merged_dev_ctx, all, dst_selected_rows);
+
+  rpc_server->WaitVarBarrier(merged_var_name);
+  rpc_server->ClearVar(merged_var_name);
+
+  // 5. clear mid vars
+  std::vector tmp_vars{merged_var_name};
+  for (auto r : vars) {
+    tmp_vars.push_back(r.var_name_);
+  }
+  scope->EraseVars(tmp_vars);
+}
+#endif
+
 void ReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
@@ -90,8 +202,36 @@ void ReduceOpHandle::RunImpl() {
     this->RunAndRecordEvent([&] {
       std::vector in_selected_rows =
           GetInputValues(in_var_handles, var_scopes);
-      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
-                         out_var->GetMutable());
+
+      const CollectiveContext &collective_context =
+          *CollectiveContext::GetInstance();
+      VLOG(10) << "GatherSelectedRows CollectiveContext:"
+               << collective_context.String();
+
+      // TODO(gongwb): add cpu support
+      if (collective_context.endpoints_.size() <= 1 ||
+          is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
+        GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_,
+                                t_out_p,
+                                out_var->GetMutable());
+        return;
+      }
+
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+      if (framework::IsType(in_selected_rows[0]->value().type())) {
+        GatherSelectedRows(
+            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
+            out_var->GetMutable());
+      } else if (framework::IsType(
+                     in_selected_rows[0]->value().type())) {
+        GatherSelectedRows(
+            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
+            out_var->GetMutable());
+      } else {
+        PADDLE_ENFORCE(false,
+                       "only support double or float when gathering SelectedRows");
+      }
+#endif
     });
   } else {
     std::vector lod_tensors =
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 846839029ca..5491f00f45e 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -30,6 +30,32 @@ namespace paddle {
 namespace framework {
 namespace details {

+struct CollectiveContext {
+  std::vector endpoints_;
+  int trainer_id_{0};
+
+  std::string String() const {
+    std::stringstream ss;
+    ss << "endpoints_:";
+    for (auto e : endpoints_) {
+      ss << e << ",";
+    }
+
+    ss << "trainer_id_:" << trainer_id_;
+
+    return ss.str();
+  }
+
+  static CollectiveContext *GetInstance() {
+    std::call_once(init_flag_,
+                   [&]() { context_.reset(new CollectiveContext()); });
+    return context_.get();
+  }
+
+ private:
+  static std::once_flag init_flag_;
+  static std::unique_ptr context_;
+};

 struct ReduceOpHandle : public OpHandleBase {
   std::vector local_scopes_;
@@ -64,6 +90,19 @@ struct ReduceOpHandle {
  protected:
   void RunImpl() override;

+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+  template
+  void GatherSelectedRows(
+      const std::vector &src_selecte_rows_,
+      const std::vector &in_places,
+      const std::map &dev_ctxes,
+      VarHandle *out_var_handle, const platform::Place &out_place,
+      SelectedRows *dst_selecte_rows);
+#endif
+
+  void Wait(
+      const std::map &dev_ctxes);
+
   template
   std::vector GetInputValues(
       const std::vector &in_var_handles,
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 36979de68f3..101dbe9c896 100644
---
a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -13,16 +13,26 @@ set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor if(WITH_GRPC) grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + DEPS lod_tensor selected_rows_functor memory) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(grpc_serde_test SRCS grpc_serde_test.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) + + if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + selected_rows_functor scope math_function SERIAL) + endif() + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory) else() set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc new file mode 100644 index 00000000000..6d3f5343111 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_client.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
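The client implemented in this new file gathers in two phases: it first issues an AsyncGetMonomerVariable for every remote trainer, then drains them all with a single Wait(), which also keeps dst in the rank order of remote_vars. A minimal standalone sketch of that issue-then-wait shape; FetchVar is a made-up stand-in for the RPC, and the second barrier phase is omitted:

#include <future>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for AsyncGetMonomerVariable: pretend the returned string is the
// deserialized SelectedRows fetched from one remote endpoint.
std::string FetchVar(const std::string& ep, const std::string& var) {
  return var + "@" + ep;
}

int main() {
  std::vector<std::string> eps = {"127.0.0.1:7164", "127.0.0.1:7165"};

  // Phase 1: issue every get without blocking, like the first loop in Gather.
  std::vector<std::future<std::string>> pending;
  for (const auto& ep : eps)
    pending.push_back(std::async(std::launch::async, FetchVar, ep, "var1"));

  // Phase 2: a single wait drains them; dst keeps the endpoint (rank) order.
  std::vector<std::string> dst;
  for (auto& f : pending) dst.push_back(f.get());

  for (const auto& v : dst) std::cout << v << "\n";
  return 0;
}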
+ +#include // NOLINT +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/operators/distributed/collective_client.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { +std::once_flag CollectiveClient::init_flag_; +std::unique_ptr CollectiveClient::client_(nullptr); + +bool CollectiveClient::Gather(const std::vector& remote_vars, + std::vector* dst, + const platform::DeviceContext& ctx, + framework::Scope* scope, int64_t time_out) { + for (auto r : remote_vars) { + VLOG(50) << "begin gather from ep:" << r.String(); + scope->Var(r.var_name_)->GetMutable(); + VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( + r.ep_, ctx, *scope, r.var_name_, time_out); + } + + rpc_client_->Wait(); + + for (auto r : remote_vars) { + auto select_rows = + scope->FindVar(r.var_name_)->GetMutable(); + dst->push_back(select_rows); + + VLOG(4) << "gather from ep:" << r.String() + << ", select_rows:" << GetSelectedRowsInfo(*select_rows); + + rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); + } + + rpc_client_->Wait(); + return true; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h new file mode 100644 index 00000000000..53b03c531a2 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_client.h @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { + +inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { + std::stringstream ss; + ss << ", height:" << slr.height() << ", rows:["; + for (unsigned int i = 0; i < slr.rows().size(); i++) { + if (i != slr.rows().size() - 1) { + ss << slr.rows()[i] << ","; + } else { + ss << slr.rows()[i]; + } + } + ss << "], dims:" << slr.value().dims(); + return ss.str(); +} + +struct RemoteVar { + std::string ep_; + std::string var_name_; + int trainer_id_{0}; + + std::string String() { + std::stringstream ss; + ss << "ep:" << ep_ << ", var_name:" << var_name_ + << ", trainer_id:" << trainer_id_; + + return ss.str(); + } +}; + +class CollectiveClient { + public: + CollectiveClient() { + rpc_client_.reset(new RPCCLIENT_T()); + rpc_client_->InitImpl(); + } + virtual ~CollectiveClient() {} + + // note this function will retain the rank order. 
+  bool Gather(const std::vector& remote_vars,
+              std::vector* dst,
+              const platform::DeviceContext& ctx, framework::Scope* scope,
+              int64_t time_out = FLAGS_rpc_deadline);
+
+  static CollectiveClient* GetInstance() {
+    std::call_once(init_flag_, [&]() {
+      if (client_.get() == nullptr) {
+        client_.reset(new CollectiveClient());
+      }
+    });
+    return client_.get();
+  }
+
+ private:
+  std::unique_ptr rpc_client_;
+
+  static std::once_flag init_flag_;
+  static std::unique_ptr client_;
+};
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc
new file mode 100644
index 00000000000..c95652400c2
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_server.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include   // for removing the port file
+#include 
+#include 
+#include   // NOLINT
+#include 
+
+#include "paddle/fluid/operators/distributed/collective_server.h"
+
+DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get");
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+std::once_flag CollectiveServer::init_flag_;
+std::shared_ptr CollectiveServer::collective_server_(nullptr);
+
+CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) {
+  VLOG(1) << "Create collective server:" << end_point << ", fan_in:" << fan_in;
+  rpc_server_.reset(new RPCSERVER_T(end_point, fan_in));
+}
+
+void CollectiveServer::Stop() {
+  rpc_server_->ShutDown();
+  server_thread_->join();
+  loop_thread_->join();
+}
+
+void CollectiveServer::StartServer() {
+  get_monomer_handler_.reset(new GetMonomerHandler());
+  get_monomer_handler_->SetRPCServer(rpc_server_.get());
+
+  get_barrier_handler_.reset(new GetMonomerBarrierHandler());
+  get_barrier_handler_->SetRPCServer(rpc_server_.get());
+
+  rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable,
+                           get_monomer_handler_.get(),
+                           FLAGS_collective_get_thread_num);
+  rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier,
+                           get_barrier_handler_.get(), 1);
+
+  server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); }));
+  rpc_server_->WaitServerReady();
+
+  loop_thread_.reset(new std::thread([&]() {
+    while (true) {
+      if (rpc_server_->IsExit()) {
+        LOG(WARNING) << "received exit signal; rpc_processor loop breaks";
+        break;
+      }
+      sleep(1);
+    }
+    VLOG(1) << "CollectiveServer loop_thread end";
+  }));
+}
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h
new file mode 100644
index 00000000000..a23dc18b4de
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_server.h
@@ -0,0 +1,110 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "gflags/gflags.h" + +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class CollectiveServer; + +class GetMonomerHandler final : public RequestHandler { + public: + GetMonomerHandler() : RequestHandler(true) {} + virtual ~GetMonomerHandler() {} + bool Handle(const std::string& var_name, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override { + VLOG(50) << "GetMonomerHandler recv " << var_name; + + *outvar = scope->FindVar(var_name); + PADDLE_ENFORCE(outvar != nullptr, "%s not found", var_name); + + return true; + } +}; + +class GetMonomerBarrierHandler final : public RequestHandler { + public: + GetMonomerBarrierHandler() : RequestHandler(true) {} + virtual ~GetMonomerBarrierHandler() {} + bool Handle(const std::string& var_name, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override { + VLOG(50) << "GetMonomerHandler recv " << var_name; + + rpc_server_->IncreaseVarBarrier(var_name); + + return true; + } +}; + +class CollectiveServer final { + public: + explicit CollectiveServer(const std::string& end_point, int fan_in); + + virtual ~CollectiveServer() {} + + void StartServer(); + + static CollectiveServer* GetInstance(const std::string& end_point, + int fan_in) { + std::call_once(init_flag_, [&]() { + if (collective_server_.get() == nullptr) { + collective_server_.reset(new CollectiveServer(end_point, fan_in)); + collective_server_->StartServer(); + } + }); + + return collective_server_.get(); + } + + std::shared_ptr GetRPCServer() { return rpc_server_; } + + void Stop(); + + private: + std::unique_ptr get_monomer_handler_; + std::unique_ptr get_barrier_handler_; + + std::shared_ptr rpc_server_; + std::shared_ptr server_thread_; + std::shared_ptr loop_thread_; + + bool ready_{false}; + + static std::once_flag init_flag_; + static std::shared_ptr collective_server_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc new file mode 100644 index 00000000000..0a9c69e3932 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/collective_client.h" +#include "paddle/fluid/operators/distributed/collective_server.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace distributed = paddle::operators::distributed; + +std::unique_ptr StartServer( + const std::string& ep, int fan_in, framework::Scope* scope, + platform::DeviceContext* dev_ctx) { + distributed::CollectiveServer* server = + distributed::CollectiveServer::GetInstance(ep, fan_in); + + auto rpc_server = server->GetRPCServer(); + rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, + scope, dev_ctx); + + std::cout << "StartServer return" << std::endl; + return std::unique_ptr(server); +} + +std::unique_ptr GenerateVars(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + framework::Scope* scope = new framework::Scope(); + framework::Variable* var = scope->Var("var1"); + auto* slr = var->GetMutable(); + slr->set_height(1000); + + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + + tensor->Resize(framework::make_ddim({3, 5})); + tensor->mutable_data(place); + + paddle::operators::math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 3; ++i) rows->push_back(i); + + std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); + + return std::unique_ptr(scope); +} + +void Gather(const std::vector& vars, + platform::DeviceContext* dev_ctx) { + distributed::CollectiveClient* client = + distributed::CollectiveClient::GetInstance(); + + framework::Scope* scope = new framework::Scope(); + framework::Variable* var = scope->Var("var1"); + var->GetMutable(); + + std::vector dst; + client->Gather(vars, &dst, *dev_ctx, scope); + std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); +} + +TEST(PREFETCH, GPU) { + platform::CUDAPlace place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + std::string ep = "127.0.0.1:7164"; + auto scope = GenerateVars(place); + + auto* v1 = scope->FindVar("var1"); + std::cout << "var1:" << v1 << std::endl; + + auto server = StartServer(ep, 2, scope.get(), &ctx); + auto rpc_server = server->GetRPCServer(); + + distributed::RemoteVar var; + var.ep_ = ep; + var.var_name_ = "var1"; + var.trainer_id_ = 0; + + std::vector vars{var}; + Gather(vars, &ctx); + Gather(vars, &ctx); + + std::cout << "begin WaitVarBarrier" << std::endl; + rpc_server->WaitVarBarrier("var1"); + rpc_server->ClearRegisteredVars(); + server->Stop(); + + scope.release(); + server.release(); +} diff --git a/paddle/fluid/operators/distributed/grpc_client.cc 
b/paddle/fluid/operators/distributed/grpc_client.cc
index d7f3ea86aff..857214aa211 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -28,11 +28,11 @@ namespace paddle {
 namespace operators {
 namespace distributed {

-void GRPCClient::InitImpl() { InitEventLoop(); }
-
-void GRPCClient::InitEventLoop() {
+void GRPCClient::InitImpl() {
   // start the client process thread
   // TODO(wuyi): can make this in a threadpool
+  PADDLE_ENFORCE(client_thread_ == nullptr,
+                 "the Proceed thread must not be re-initialized");
   client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }

@@ -106,6 +106,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,

 void ProcGetResponse(const VarHandle& var_h,
                      const ::grpc::ByteBuffer& ret_msg) {
+  VLOG(100) << "ProcGetResponse";
   framework::Variable* outvar = nullptr;
   // get response's trainer_id is not used
   int trainer_id;
@@ -126,6 +127,24 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
                                      const framework::Scope& scope,
                                      const std::string& var_name,
                                      int64_t time_out) {
+  return _AsyncGetVar(ep, ctx, scope, var_name,
+                      "/sendrecv.SendRecvService/GetVariable", time_out);
+}
+
+VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
+    const std::string& ep, const platform::DeviceContext& ctx,
+    const framework::Scope& scope, const std::string& var_name,
+    int64_t time_out) {
+  return _AsyncGetVar(ep, ctx, scope, var_name,
+                      "/sendrecv.SendRecvService/GetMonomerVariable", time_out);
+}
+
+VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep,
+                                      const platform::DeviceContext& ctx,
+                                      const framework::Scope& scope,
+                                      const std::string& var_name,
+                                      const std::string& rpc_path,
+                                      int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
@@ -136,7 +155,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
   VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);

-  framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] {
+  framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
@@ -151,8 +170,8 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,

     platform::RecordRPCEvent record_event(method, p_ctx);

-    auto call = s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
+    auto call =
+        s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, reinterpret_cast(s));
@@ -268,6 +287,34 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   return h;
 }

+VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
+                                                const std::string& var_name,
+                                                int64_t time_out) {
+  const auto ch = GetChannel(ep);
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  const std::string method = "SendMonomerFetchBarrierRPC";
+  VarHandlePtr h(
+      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
+  s->Prepare(h, time_out);
+
+  VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
+
+  sendrecv::VariableMessage req;
+  req.set_varname(var_name);
+
+  platform::RecordRPCEvent record_event(method, nullptr);
+
+  auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s));
+  req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled()))
{ + h->Wait(); + } + + return h; +} + VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index a31a465645e..01bf46cc313 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -189,6 +189,11 @@ class GRPCClient : public RPCClient { const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, @@ -200,8 +205,12 @@ class GRPCClient : public RPCClient { VarHandlePtr AsyncSendBatchBarrier( const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) override; + + VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncCheckpointNotify( const std::string& ep, const std::string& dir, @@ -214,21 +223,22 @@ class GRPCClient : public RPCClient { void SendComplete() override; - protected: void InitImpl() override; private: - // InitEventLoop should only be called by Init() - void InitEventLoop(); - void Proceed(); std::shared_ptr GetChannel(const std::string& ep); + VarHandlePtr _AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, const std::string& rpc, + int64_t time_out); private: grpc::CompletionQueue cq_; std::unordered_map> channels_; - std::unique_ptr client_thread_; + std::unique_ptr client_thread_{nullptr}; // mutex for Wait client sync std::mutex sync_mutex_; diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index d9200c98b23..c3974138f4d 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -158,6 +158,98 @@ class RequestGet final : public RequestBase { ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; }; +class RequestGetMonomerVariable final : public RequestBase { + public: + explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, + int req_id, RPCServer* rpc_server) + : RequestBase(service, cq, request_handler, req_id), + responder_(&ctx_), + rpc_server_(rpc_server) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetMonomerVariable); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetMonomerVariable() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. 
+ std::string varname = request_.varname(); + + rpc_server_->WaitVarCond(varname); + MonomerHandle h = rpc_server_->GetMonomer(varname); + + auto scope = h.scope_; + auto invar = scope->FindVar(varname); + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, + request_.trainer_id()); + + if (outvar) { + SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); + } + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; + RPCServer* rpc_server_{nullptr}; +}; + +class RequestGetMonomerBarrier final : public RequestBase { + public: + explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id, + RPCServer* rpc_server) + : RequestBase(service, cq, request_handler, req_id), + responder_(&ctx_), + rpc_server_(rpc_server) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetMonomerBarrier); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetMonomerBarrier() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + VLOG(4) << "RequestGetMonomerBarrier " << varname; + + rpc_server_->WaitVarCond(varname); + MonomerHandle h = rpc_server_->GetMonomer(varname); + + framework::Scope* scope = nullptr; + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, + request_.trainer_id()); + + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VoidMessage reply_; + ServerAsyncResponseWriter responder_; + RPCServer* rpc_server_{nullptr}; +}; + class RequestPrefetch final : public RequestBase { public: explicit RequestPrefetch(GrpcService::AsyncService* service, @@ -249,7 +341,7 @@ class RequestCheckpointNotify final : public RequestBase { }; void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is wait server ready"; + VLOG(4) << "AsyncGRPCServer is waiting server ready"; std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); VLOG(4) << "AsyncGRPCServer WaitSeverReady"; @@ -368,6 +460,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestSend(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGet) { b = new RequestGet(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestGetMonomerVariable) { + b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id, + this); + } else if (rpc_name == kRequestGetMonomerBarrier) { + b = new RequestGetMonomerBarrier(&service_, cq.get(), handler, req_id, + this); } else if (rpc_name == kRequestPrefetch) { b = new RequestPrefetch(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestCheckpoint) { @@ -378,7 +476,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, reqs[req_id] = b; - VLOG(4) << "Create RequestSend status:" << b->Status(); + VLOG(4) << "TryToRegisterNewOne status:" << b->Status(); } void AsyncGRPCServer::HandleRequest( diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h index 9ae9a31a003..537429b5fe9 
100644 --- a/paddle/fluid/operators/distributed/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc_service.h @@ -81,10 +81,12 @@ enum class GrpcMethod { kGetVariable, kPrefetchVariable, kCheckpointNotify, + kGetMonomerVariable, + kGetMonomerBarrier, }; static const int kGrpcNumMethods = - static_cast(GrpcMethod::kCheckpointNotify) + 1; + static_cast(GrpcMethod::kGetMonomerBarrier) + 1; inline const char* GrpcMethodName(GrpcMethod id) { switch (id) { @@ -92,6 +94,10 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kGetMonomerVariable: + return "/sendrecv.SendRecvService/GetMonomerVariable"; + case GrpcMethod::kGetMonomerBarrier: + return "/sendrecv.SendRecvService/GetMonomerBarrier"; case GrpcMethod::kPrefetchVariable: return "/sendrecv.SendRecvService/PrefetchVariable"; case GrpcMethod::kCheckpointNotify: diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 5272afd4285..62b24f150b4 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -37,6 +37,8 @@ namespace distributed { constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestGet[] = "RequestGet"; +constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; +constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 4cd3abb5a61..b668d869787 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -45,6 +45,11 @@ class RPCClient { const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncPrefetchVar( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& in_var_name, @@ -57,6 +62,10 @@ class RPCClient { virtual VarHandlePtr AsyncSendFetchBarrier( const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncCheckpointNotify( const std::string& ep, const std::string& dir, int64_t time_out = FLAGS_rpc_deadline) = 0; @@ -87,8 +96,9 @@ class RPCClient { } } - protected: virtual void InitImpl() {} + + protected: // each trainer have exact one trainer id, it should be static static int trainer_id_; diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 3e30ed4ac86..122619d41b2 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -132,6 +132,96 @@ void RPCServer::WaitCond(const std::string& rpc_name) { lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); } +void RPCServer::RegisterVar(const std::string& var_name, + const 
std::string& rpc_name,
+                           framework::Scope* scope,
+                           platform::DeviceContext* dev_ctx) {
+  MonomerHandle h;
+  h.var_name_ = var_name;
+  h.rpc_name_ = rpc_name;
+  h.scope_ = scope;
+  h.dev_ctx_ = dev_ctx;
+
+  {
+    std::unique_lock lock(mutex_);
+    if (var_map_.find(var_name) != var_map_.end()) {
+      PADDLE_ENFORCE(false, "%s already in var_map", var_name);
+    }
+    var_map_[var_name] = h;
+  }
+
+  rpc_cond_.notify_all();
+  VLOG(4) << "RegisterVar context:" << h.String();
+}
+
+void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
+  int b = 0;
+  MonomerHandle h;
+  {
+    std::unique_lock lock(mutex_);
+    b = ++var_map_[var_name].barrier_;
+    h = var_map_[var_name];
+  }
+
+  if (b >= client_num_) {
+    barrier_cond_.notify_all();
+  }
+
+  VLOG(4) << "IncreaseVarBarrier context:" << h.String();
+}
+
+void RPCServer::WaitVarBarrier(const std::string& var_name) {
+  VLOG(4) << "WaitBarrier var_name:" << var_name;
+
+  std::unique_lock lock(mutex_);
+  barrier_cond_.wait(lock, [&]() {
+    return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) ||
+            exit_flag_.load());
+  });
+
+  VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String();
+}
+
+void RPCServer::SetVarCond(const std::string& var_name) {
+  VLOG(4) << "SetVarCond var_name:" << var_name;
+  {
+    std::unique_lock lock(mutex_);
+    if (var_map_.find(var_name) != var_map_.end()) {
+      rpc_cond_.notify_all();
+    }
+  }
+}
+
+void RPCServer::WaitVarCond(const std::string& var_name) {
+  VLOG(4) << "WaitVarCond var_name:" << var_name;
+
+  std::unique_lock lock(mutex_);
+  rpc_cond_.wait(lock, [=] {
+    return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load());
+  });
+
+  VLOG(4) << "WaitVarCond var_name:" << var_name << " end";
+}
+
+MonomerHandle RPCServer::GetMonomer(const std::string& var_name) {
+  MonomerHandle h;
+  {
+    std::unique_lock lock(mutex_);
+    h = var_map_[var_name];
+  }
+
+  return h;
+}
+
+void RPCServer::ClearRegisteredVars() {
+  std::unique_lock lock(mutex_);
+  var_map_.clear();
+}
+
+void RPCServer::ClearVar(const std::string& var_name) {
+  std::unique_lock lock(mutex_);
+  var_map_.erase(var_name);
+}
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index c78c5007a7f..45d1d3479ce 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -21,12 +21,30 @@
 #include 
 #include 

+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/platform/device_context.h"

 namespace paddle {
 namespace operators {
 namespace distributed {

+struct MonomerHandle {
+  std::string var_name_;
+  std::string rpc_name_;
+  framework::Scope* scope_{nullptr};
+  platform::DeviceContext* dev_ctx_{nullptr};
+  int64_t barrier_{0};
+
+  std::string String() {
+    std::stringstream ss;
+    ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_
+       << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_
+       << ", barrier_:" << barrier_;
+    return ss.str();
+  }
+};
+
 class RPCServer {
  public:
  explicit RPCServer(const std::string& address, int client_num)
@@ -67,6 +85,16 @@ class RPCServer {
   void WaitCond(const std::string& rpc_name);
   void IncreaseBatchBarrier(const std::string rpc_name);

+  void RegisterVar(const std::string& var_name, const std::string& rpc_name,
+                   framework::Scope* scope, platform::DeviceContext* dev_ctx);
+  void IncreaseVarBarrier(const std::string& var_name);
+  void
WaitVarBarrier(const std::string& var_name); + void SetVarCond(const std::string& var_name); + void WaitVarCond(const std::string& var_name); + void ClearRegisteredVars(); + void ClearVar(const std::string& var_name); + MonomerHandle GetMonomer(const std::string& var_name); + void Complete(); void ResetBarrierCounter(); @@ -95,6 +123,9 @@ class RPCServer { std::unordered_map rpc_call_map_; std::unordered_map rpc_thread_num_; friend class RequestHandler; + + // TODO(gongwb): use more cond to notify or wait; + std::unordered_map var_map_; }; }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index 7b7d069f17f..2637619f304 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -28,6 +28,9 @@ service SendRecvService { rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} + + rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} + rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 31ed5196668..9e99e44822b 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ea07372a289..dca0c01ab22 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -925,6 +925,18 @@ All parameter, weight, gradient are variables in Paddle. 
[](BuildStrategy &self, int num_trainers) { self.num_trainers_ = num_trainers; }) + .def_property( + "trainers_endpoints", + [](const BuildStrategy &self) { return self.trainers_endpoints_; }, + [](BuildStrategy &self, + const std::vector &trainers_endpoints) { + self.trainers_endpoints_ = trainers_endpoints; + }) + .def_property("trainer_id", + [](const BuildStrategy &self) { return self.trainer_id_; }, + [](BuildStrategy &self, int trainer_id) { + self.trainer_id_ = trainer_id; + }) .def_property( "fuse_elewise_add_act_ops", [](const BuildStrategy &self) { diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 9e6345f148c..1511eea68cb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1483,6 +1483,7 @@ class Program(object): self._is_chief = False self._slice_vars_and_attrs = [] self._endpoints = [] + self._trainers_endpoints = [] self._distributed_lookup_table = None @property diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index dc27a8eabb5..c54c3963a15 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -135,9 +135,17 @@ class ParallelExecutor(object): build_strategy = BuildStrategy() build_strategy.num_trainers = num_trainers + build_strategy.trainer_id = trainer_id main = main_program main = main if main else framework.default_main_program() + + trainers_endpoints = main._trainers_endpoints + if num_trainers > 1 and trainers_endpoints: + assert num_trainers == len( + trainers_endpoints), "num_trainers == len(end_points)" + build_strategy.trainers_endpoints = trainers_endpoints + if scope == None: scope = executor.global_scope() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 3b898af706e..d21ec42dccd 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -305,6 +305,7 @@ class DistributeTranspiler(object): if self.config.mode == "nccl2": assert (isinstance(trainers, str)) + self.origin_program._trainers_endpoints = trainers.split(",") self._transpile_nccl2( trainer_id, trainers, -- GitLab From c049fa7cf7a449e26dcd9b044f291ce57a9bd0f2 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 7 Dec 2018 22:24:44 +0800 Subject: [PATCH 0080/2367] Revert "Revert "Revert "Imperative""" --- paddle/fluid/CMakeLists.txt | 1 - paddle/fluid/framework/feed_fetch_method.cc | 9 - paddle/fluid/framework/feed_fetch_method.h | 2 - paddle/fluid/framework/ir/graph.cc | 5 +- paddle/fluid/imperative/CMakeLists.txt | 3 - paddle/fluid/imperative/engine.cc | 53 ----- paddle/fluid/imperative/engine.h | 39 ---- paddle/fluid/imperative/layer.cc | 221 ------------------ paddle/fluid/imperative/layer.h | 102 -------- paddle/fluid/imperative/tracer.cc | 19 -- paddle/fluid/imperative/tracer.h | 128 ---------- paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/imperative.cc | 36 --- paddle/fluid/pybind/imperative.h | 53 ----- paddle/fluid/pybind/pybind.cc | 39 ---- python/paddle/fluid/__init__.py | 2 - python/paddle/fluid/framework.py | 54 +---- python/paddle/fluid/imperative/__init__.py | 25 -- python/paddle/fluid/imperative/base.py | 56 ----- python/paddle/fluid/imperative/layers.py | 44 ---- python/paddle/fluid/layer_helper.py | 23 +- python/paddle/fluid/layers/nn.py | 3 +- .../fluid/tests/unittests/test_imperative.py | 52 ----- python/setup.py.in | 1 - tools/print_signatures.py | 
4 - 25 files changed, 19 insertions(+), 960 deletions(-) delete mode 100644 paddle/fluid/imperative/CMakeLists.txt delete mode 100644 paddle/fluid/imperative/engine.cc delete mode 100644 paddle/fluid/imperative/engine.h delete mode 100644 paddle/fluid/imperative/layer.cc delete mode 100644 paddle/fluid/imperative/layer.h delete mode 100644 paddle/fluid/imperative/tracer.cc delete mode 100644 paddle/fluid/imperative/tracer.h delete mode 100644 paddle/fluid/pybind/imperative.cc delete mode 100644 paddle/fluid/pybind/imperative.h delete mode 100644 python/paddle/fluid/imperative/__init__.py delete mode 100644 python/paddle/fluid/imperative/base.py delete mode 100644 python/paddle/fluid/imperative/layers.py delete mode 100644 python/paddle/fluid/tests/unittests/test_imperative.py diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 595454e90b9..6b526f0103a 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,7 +1,6 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) -add_subdirectory(imperative) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 6338be75a4b..3e9353f5cf6 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,9 +16,7 @@ limitations under the License. */ #include #include #include "glog/logging.h" -#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { @@ -55,12 +53,5 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, return tensor; } -LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) { - Variable* var = scope.FindVar(var_name); - PADDLE_ENFORCE(var, "%s no in scope", var_name); - PADDLE_ENFORCE(var->IsType(), "Only support lod tensor now."); - return *var->GetMutable(); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 031f8e01aa6..7f504bfd232 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -27,7 +27,5 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); -LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8679118fe28..fc91564bbae 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -38,8 +38,9 @@ void CheckProgram(const ProgramDesc &program) { switch (role_id) { case _INT(OpRole::kForward): if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) << "Cannot add backward operator before forward operator " - << op->Type(); + LOG(ERROR) + << "Cannot add backward operator before forward operator %s." 
+                << op->Type();
           }
           break;
         case _INT(OpRole::kBackward):
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
deleted file mode 100644
index 373d292b443..00000000000
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-cc_library(layer SRCS layer.cc DEPS proto_desc operator)
-cc_library(tracer SRCS tracer.cc DEPS proto_desc)
-cc_library(engine SRCS engine.cc)
diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc
deleted file mode 100644
index de7ab0e5918..00000000000
--- a/paddle/fluid/imperative/engine.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/engine.h"
-
-#include <mutex>  // NOLINT
-#include <vector>
-
-#include "glog/logging.h"
-
-namespace paddle {
-namespace imperative {
-
-static std::once_flag init_engine;
-static Engine* engine;
-
-class DummyEngine : public Engine {
- public:
-  void Enqueue(Runnable* runnable) override {
-    queued_runnables_.push_back(runnable);
-  }
-
-  size_t Size() const override { return queued_runnables_.size(); }
-
-  void Sync() override {
-    for (Runnable* l : queued_runnables_) {
-      LOG(INFO) << "running " << reinterpret_cast<void*>(l);
-    }
-    queued_runnables_.clear();
-  }
-
- private:
-  std::vector<Runnable*> queued_runnables_;
-};
-
-Engine* GetEngine() {
-  std::call_once(init_engine, []() { engine = new DummyEngine(); });
-  return engine;
-}
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h
deleted file mode 100644
index a1dfa5bda38..00000000000
--- a/paddle/fluid/imperative/engine.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-
-namespace paddle {
-namespace imperative {
-
-struct Runnable {};
-
-class Engine {
- public:
-  virtual ~Engine() {}
-
-  virtual void Enqueue(Runnable* runnable) = 0;
-
-  virtual size_t Size() const = 0;
-
-  virtual void Sync() = 0;
-};
-
-Engine* GetEngine();
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
deleted file mode 100644
index 61250376807..00000000000
--- a/paddle/fluid/imperative/layer.cc
+++ /dev/null
@@ -1,221 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/layer.h"
-#include <deque>
-#include <limits>
-#include <map>
-#include <random>
-#include <unordered_set>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace paddle {
-namespace imperative {
-
-using framework::Variable;
-
-void AddTo(Variable* src, Variable* dst) {
-  framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
-  framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
-  PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld",
-                 dst_tensor->numel(), src_tensor->numel());
-  float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
-  const float* src_data = src_tensor->data<float>();
-  for (size_t i = 0; i < src_tensor->numel(); ++i) {
-    dst_data[i] += src_data[i];
-  }
-}
-
-class Autograd {
- public:
-  explicit Autograd(framework::Scope* scope) : scope_(scope) {}
-
-  void RunBackward(VarBase* var) {
-    PADDLE_ENFORCE(var->pre_op_->op_desc_);
-    // TODO(panyx0718): Only create for vars that "require_grad"
-    (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
-
-    std::deque<OpBase*> ready;
-    ready.push_back(var->pre_op_);
-
-    std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->pre_op_);
-
-    while (!ready.empty()) {
-      OpBase* ready_op = ready.front();
-      ready.pop_front();
-      std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_);
-
-      for (size_t i = 0; i < input_grads.size(); ++i) {
-        if (!input_grads[i]) continue;
-        OpBase* pre_op = ready_op->pre_ops_->at(i);
-        if (!pre_op) continue;
-
-        dep_counts[pre_op] -= 1;
-        PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
-        bool pre_op_ready = dep_counts[pre_op] == 0;
-        if (pre_op_ready) {
-          ready.push_back(pre_op);
-        }
-      }
-    }
-  }
-
- private:
-  std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
-    std::map<OpBase*, int> ret;
-
-    std::deque<OpBase*> queue;
-    queue.push_back(op);
-    std::unordered_set<OpBase*> visited;
-    visited.insert(op);
-    while (!queue.empty()) {
-      OpBase* candidate = queue.front();
-      queue.pop_front();
-      for (OpBase* pre_op : *(candidate->pre_ops_)) {
-        if (!pre_op) continue;
-        if (visited.find(pre_op) == visited.end()) {
-          visited.insert(pre_op);
-          queue.push_back(pre_op);
-        }
-        ret[pre_op] += 1;
-      }
-    }
-
-    return ret;
-  }
-
-  framework::Scope* scope_;
-};
-
-framework::Variable* CreateVariable(const std::string& name,
-                                    const framework::DDim& dim, float val,
-                                    framework::Scope* scope,
-                                    bool random_name = true) {
-  std::string varname = name;
-  if (random_name) {
-    std::mt19937 rng;
-    rng.seed(std::random_device()());
-    std::uniform_int_distribution<int> dist6(
-        1, std::numeric_limits<int>::max());
-    int id = dist6(rng);
-    varname = string::Sprintf("%s@%d", varname, id);
-  }
-
-  VLOG(3) << "creating var " << varname;
-  framework::Variable* var = scope->Var(varname);
-  framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
-
-  float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
-  std::fill(data, data + tensor->numel(), val);
-  return var;
-}
-
-framework::LoDTensor& VarBase::Grad() {
-  VLOG(3) << "get var grad " << var_desc_->Name();
-  return *grads_->GetMutable<framework::LoDTensor>();
-}
-
-void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) {
-  VLOG(3) << "apply var grad " << var_desc_->Name() << " "
-          << grad->Get<framework::LoDTensor>().data<float>()[0];
-  if (!grads_) {
-    grads_ =
-        CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
-                       var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
-  }
-  AddTo(grad, grads_);
-  VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
-          << grads_->Get<framework::LoDTensor>().data<float>()[0];
-}
-
-std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
-  VLOG(3) << "op grad " << grad_op_desc_->Type();
-
-  for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) {
-    if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) {
-      // grad op inputs can be forward inputs, so not in grad_to_var.
-      continue;
-    }
-    VLOG(3) << "op grad in var " << grad_invar;
-    block_->FindRecursiveOrCreateVar(grad_invar);
-    framework::Variable* var = scope->Var(grad_invar);
-    const std::string& invar = grad_to_var_->at(grad_invar);
-    for (VarBase* varbase : *output_vars_) {
-      // Use the accumulated grads_ by sharing the input with grads_.
-      if (varbase->var_desc_->Name() == invar) {
-        var->GetMutable<framework::LoDTensor>()->ShareDataWith(
-            varbase->grads_->Get<framework::LoDTensor>());
-        break;
-      }
-    }
-  }
-
-  for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
-    VLOG(3) << "grad outvar " << outvar;
-    block_->FindRecursiveOrCreateVar(outvar);
-    framework::Variable* var = scope->Var(outvar);
-    if (!var->IsInitialized()) {
-      framework::VarDesc* var_desc = block_->FindVar(outvar);
-      if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
-        var->GetMutable<framework::LoDTensor>();
-      } else {
-        LOG(ERROR) << "tracer doesn't support yet";
-      }
-    }
-  }
-  grad_op_desc_->InferShape(*block_);
-  grad_op_desc_->InferVarType(block_);
-  std::unique_ptr<framework::OperatorBase> opbase =
-      framework::OpRegistry::CreateOp(*grad_op_desc_);
-
-  opbase->Run(*scope, platform::CPUPlace());
-
-  // `ret` matches exactly with `input_vars_` of forward op.
-  std::vector<Variable*> ret;
-  for (size_t i = 0; i < input_vars_->size(); ++i) {
-    bool found = false;
-    for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
-      Variable* var = scope->FindVar(outvar);
-      VarBase* origin_var = (*input_vars_)[i];
-      std::string orig_var = grad_to_var_->at(outvar);
-      PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var);
-      VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
-      origin_var->ApplyGrad(scope, var);
-      found = true;
-      ret.push_back(var);
-      // TODO(panyx0718): There might be another outvar with the same name.
-      // In that case, it doesn't matter the first one or the second one is
-      // used.
-      break;
-    }
-    if (!found) {
-      ret.push_back(nullptr);
-    }
-  }
-  return ret;
-}
-
-void VarBase::RunBackward(framework::Scope* scope) {
-  grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
-                          var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
-                          false);
-  if (!pre_op_) return;
-  Autograd(scope).RunBackward(this);
-}
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
deleted file mode 100644
index 85a71ca83d2..00000000000
--- a/paddle/fluid/imperative/layer.h
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace imperative {
-
-class OpBase;
-
-class VarBase {
- public:
-  VarBase()
-      : pre_op_(nullptr),
-        pre_op_out_idx_(-1),
-        var_desc_(nullptr),
-        var_(nullptr),
-        grads_(nullptr) {}
-
-  virtual ~VarBase() {}
-
-  void ApplyGrad(framework::Scope* scope, framework::Variable* grad);
-
-  void RunBackward(framework::Scope* scope);
-
-  framework::LoDTensor& Grad();
-
-  OpBase* pre_op_;
-  int pre_op_out_idx_;
-
-  framework::VarDesc* var_desc_;
-  framework::Variable* var_;
-  framework::Variable* grads_;
-};
-
-class OpBase {
- public:
-  OpBase()
-      : input_vars_(new std::vector<VarBase*>()),
-        output_vars_(new std::vector<VarBase*>()),
-        pre_ops_(new std::vector<OpBase*>()),
-        pre_ops_out_idx_(new std::vector<int>()),
-        op_desc_(nullptr),
-        grad_op_desc_(nullptr) {}
-
-  virtual ~OpBase() {
-    delete input_vars_;
-    delete output_vars_;
-
-    delete pre_ops_;
-    delete pre_ops_out_idx_;
-
-    if (grad_op_desc_) delete grad_op_desc_;
-    if (grad_to_var_) delete grad_to_var_;
-  }
-
-  std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope);
-
-  std::vector<VarBase*>* input_vars_;
-  std::vector<VarBase*>* output_vars_;
-  std::vector<OpBase*>* pre_ops_;
-  std::vector<int>* pre_ops_out_idx_;
-  framework::OpDesc* op_desc_;
-
-  framework::OpDesc* grad_op_desc_;
-  std::unordered_map<std::string, std::string>* grad_to_var_;
-  framework::BlockDesc* block_;
-};
-
-class Layer {
- public:
-  virtual ~Layer() {}
-
-  virtual std::vector<VarBase> Forward(const std::vector<VarBase>& inputs) {
-    std::vector<VarBase> vars;
-    return vars;
-  }
-
-  virtual void Backward() { LOG(ERROR) << "To support customize"; }
-};
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
deleted file mode 100644
index f64f9e72c4a..00000000000
--- a/paddle/fluid/imperative/tracer.cc
+++ /dev/null
@@ -1,19 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/imperative/tracer.h"
-
-namespace paddle {
-namespace imperative {}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
deleted file mode 100644
index 433d07c0e5a..00000000000
--- a/paddle/fluid/imperative/tracer.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/imperative/engine.h"
-#include "paddle/fluid/imperative/layer.h"
-
-namespace paddle {
-namespace imperative {
-
-void CreateGradOp(const framework::OpDesc& op_desc,
-                  const std::unordered_set<std::string>& no_grad_set,
-                  const std::vector<framework::BlockDesc*>& grad_sub_block,
-                  framework::OpDesc** grad_op_desc,
-                  std::unordered_map<std::string, std::string>* grad_to_var) {
-  std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
-      framework::OpInfoMap::Instance()
-          .Get(op_desc.Type())
-          .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
-  PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
-  // TODO(panyx0718): Leak?
-  *grad_op_desc = grad_op_descs[0].release();
-}
-
-class Tracer {
- public:
-  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
-    root_scope_ = new framework::Scope();
-    scopes_[root_block_] = root_scope_;
-  }
-
-  virtual ~Tracer() { delete root_scope_; }
-
-  void Trace(OpBase* op, const std::vector<VarBase*>& inputs,
-             const std::vector<VarBase*>& outputs,
-             framework::BlockDesc* block) {
-    framework::Scope* scope = GetScope(block);
-    framework::OpDesc* op_desc = op->op_desc_;
-    VLOG(3) << "tracer tracing " << op_desc->Type();
-    op_desc->InferShape(*block);
-    op_desc->InferVarType(block);
-    std::unique_ptr<framework::OperatorBase> op_base =
-        framework::OpRegistry::CreateOp(*op_desc);
-
-    *op->input_vars_ = inputs;
-    for (VarBase* input : inputs) {
-      const std::string vname = input->var_desc_->Name();
-      framework::Variable* var = scope->Var(vname);
-      input->var_ = var;
-      if (!var->IsInitialized()) {
-        framework::VarDesc* var_desc = block->FindVar(vname);
-        if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
-          var->GetMutable<framework::LoDTensor>();
-        } else {
-          LOG(ERROR) << "tracer doesn't support yet";
-        }
-      }
-      if (input->pre_op_) {
-        op->pre_ops_->push_back(input->pre_op_);
-        op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
-      } else {
-        op->pre_ops_->push_back(nullptr);
-      }
-    }
-
-    *op->output_vars_ = outputs;
-    for (size_t i = 0; i < outputs.size(); ++i) {
-      const std::string vname = outputs[i]->var_desc_->Name();
-      framework::Variable* var = scope->Var(vname);
-      if (!var->IsInitialized()) {
-        framework::VarDesc* var_desc = block->FindVar(vname);
-        if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
-          var->GetMutable<framework::LoDTensor>();
-        } else {
-          LOG(ERROR) << "tracer doesn't support yet";
-        }
-      }
-      outputs[i]->var_ = var;
-      outputs[i]->pre_op_ = op;
-      outputs[i]->pre_op_out_idx_ = i;
-    }
-    op_base->Run(*scope, platform::CPUPlace());
-    framework::OpDesc* grad_op_desc;
-    auto grad_to_var = new std::unordered_map<std::string, std::string>();
-    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
-    op->grad_op_desc_ = grad_op_desc;
-    op->grad_to_var_ = grad_to_var;
-    op->block_ = block;
-  }
-
-  framework::Scope* GetScope(framework::BlockDesc* block) {
-    if (scopes_.find(block) != scopes_.end()) {
-      return scopes_.at(block);
-    }
-    framework::BlockDesc* parent_block = block->ParentBlock();
-    PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end());
-    framework::Scope* scope = &scopes_[parent_block]->NewScope();
-    scopes_[block] = scope;
-    return scope;
-  }
-
- private:
-  std::map<framework::BlockDesc*, framework::Scope*> scopes_;
-  framework::BlockDesc* root_block_;
-  framework::Scope* root_scope_;
-};
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index b8954cb1262..d602613fc82 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,7 +1,6 @@
-set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer)
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
-
+set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc)
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
deleted file mode 100644
index 34e9c897d9e..00000000000
--- a/paddle/fluid/pybind/imperative.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/pybind/imperative.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/imperative/tracer.h"
-
-namespace paddle {
-namespace pybind {
-
-// Bind Methods
-void BindTracer(pybind11::module *m) {
-  pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
-      .def("__init__",
-           [](imperative::Tracer &self, framework::BlockDesc *root_block) {
-             new (&self) imperative::Tracer(root_block);
-           })
-      .def("trace", &imperative::Tracer::Trace)
-      .def("get_scope", &imperative::Tracer::GetScope,
-           pybind11::return_value_policy::reference);
-}
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h
deleted file mode 100644
index 7a9d3a01ea8..00000000000
--- a/paddle/fluid/pybind/imperative.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <Python.h>
-#include <vector>
-#include "paddle/fluid/imperative/layer.h"
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-namespace paddle {
-namespace pybind {
-
-class PyLayer : public imperative::Layer {
- public:
-  using imperative::Layer::Layer;  // Inherit constructors
-
-  std::vector<imperative::VarBase> Forward(
-      const std::vector<imperative::VarBase>& inputs) override {
-    PYBIND11_OVERLOAD(std::vector<imperative::VarBase>, Layer, Forward,
-                      inputs);  // NOLINT
-  }
-
-  void Backward() override {
-    PYBIND11_OVERLOAD(void, Layer, Backward, );  // NOLINT
-  }
-};
-
-class PyOpBase : public imperative::OpBase {
- public:
-  using imperative::OpBase::OpBase;  // Inherit constructors
-};
-
-class PyVarBase : public imperative::VarBase {
- public:
-  using imperative::VarBase::VarBase;  // Inherit constructors
-};
-
-void BindTracer(pybind11::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index dca0c01ab22..58ef3da0b23 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -34,7 +34,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/version.h"
-#include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
@@ -46,7 +45,6 @@ limitations under the License. */
 #include "paddle/fluid/pybind/async_executor_py.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
-#include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
@@ -102,42 +100,6 @@ PYBIND11_MODULE(core, m) {
 
   BindException(&m);
 
-  py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
-      .def(py::init<>())
-      .def("_run_backward",
-           [](imperative::VarBase &self, framework::Scope *scope) {
-             self.RunBackward(scope);
-           })
-      .def("_grad", &imperative::VarBase::Grad)
-      .def_property(
-          "desc",
-          [](const imperative::VarBase &self) { return self.var_desc_; },
-          [](imperative::VarBase &self, framework::VarDesc *var_desc) {
-            self.var_desc_ = var_desc;
-          },
-          py::return_value_policy::reference);
-
-  py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
-      .def(py::init<>())
-      .def_property(
-          "desc", [](const imperative::OpBase &self) { return self.op_desc_; },
-          [](imperative::OpBase &self, framework::OpDesc *op_desc) {
-            if (op_desc) {
-              self.op_desc_ = op_desc;
-            }
-          },
-          py::return_value_policy::reference);
-
-  py::class_<imperative::Layer, PyLayer> layer(m, "Layer");
-  layer.def(py::init<>())
-      .def("forward",
-           [](imperative::Layer &self,
-              const std::vector<imperative::VarBase> &inputs) {
-             return self.Forward(inputs);
-           })
-      .def("backward", &imperative::Layer::Backward);
-  BindTracer(&m);
-
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
@@ -639,7 +601,6 @@ All parameter, weight, gradient are variables in Paddle.
m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); - m.def("get_variable_tensor", framework::GetVariableTensor); m.def("_is_program_version_supported", IsProgramVersionSupported); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 52417a1eaf7..2a53519188e 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -34,7 +34,6 @@ from . import io from . import evaluator from . import initializer from . import layers -from . import imperative from . import contrib from . import nets from . import optimizer @@ -68,7 +67,6 @@ __all__ = framework.__all__ + executor.__all__ + \ 'initializer', 'layers', 'contrib', - 'imperative', 'transpiler', 'nets', 'optimizer', diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 1511eea68cb..a40826168dc 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,7 +18,6 @@ import collections import contextlib import re import six -import sys import numpy as np @@ -50,16 +49,6 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() -_imperative_tracer_ = None - - -def _in_imperative_mode(): - return _imperative_tracer_ is not None - - -def _imperative_tracer(): - return _imperative_tracer_ - class NameScope(object): def __init__(self, name="", parent=None): @@ -213,7 +202,7 @@ def _debug_string_(proto, throw_on_error=True): return proto.__str__() -class Variable(core.VarBase): +class Variable(object): """ In Fluid, every input and output of an operator is a variable. In most cases, variables are used for holding different kinds of data or training @@ -277,7 +266,6 @@ class Variable(core.VarBase): stop_gradient=False, is_data=False, **kwargs): - core.VarBase.__init__(self) self.block = block self.error_clip = error_clip @@ -358,18 +346,6 @@ class Variable(core.VarBase): self.stop_gradient = stop_gradient self.is_data = is_data - def _numpy(self): - scope = _imperative_tracer().get_scope(self.block.desc) - tensor = core.get_variable_tensor(scope, self.desc.name()) - return np.array(tensor) - - def _backward(self): - scope = _imperative_tracer().get_scope(self.block.desc) - self._run_backward(scope) - - def _gradient(self): - return np.array(self._grad()) - def __str__(self): return self.to_string(True) @@ -516,7 +492,7 @@ class OpProtoHolder(object): } -class Operator(core.OpBase): +class Operator(object): """ In Fluid, all the operation are represented by Operator, and Operator is regarded as a build in an instruction of a Block. 
Users can use the @@ -572,7 +548,6 @@ class Operator(core.OpBase): inputs=None, outputs=None, attrs=None): - core.OpBase.__init__(self) self.block = block self.desc = desc # note: not add self.attrs here: @@ -612,7 +587,6 @@ class Operator(core.OpBase): return True return False - self.inputs = [] if inputs is not None: for in_proto in proto.inputs: found = find_name(inputs, in_proto.name) @@ -639,13 +613,6 @@ class Operator(core.OpBase): else: self.desc.set_input(in_proto.name, []) - for inp in inputs.values(): - if isinstance(inp, Variable): - self.inputs.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - self.inputs.extend(inp[:]) - - self.outputs = [] if outputs is not None: given = set() need = set() @@ -674,12 +641,6 @@ class Operator(core.OpBase): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) - for out in outputs.values(): - if isinstance(out, Variable): - self.outputs.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - self.outputs.extend(out[:]) - if op_attrs is not None: if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") @@ -1245,8 +1206,6 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) - if _in_imperative_mode(): - _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc) self.ops.append(op) return op @@ -2251,12 +2210,3 @@ def _get_var(name, program=None): assert isinstance(program, Program) return program.global_block().var(name) - - -@contextlib.contextmanager -def _imperative_guard(tracer): - global _imperative_tracer_ - tmp_trace = _imperative_tracer_ - _imperative_tracer_ = tracer - yield - _imperative_tracer_ = tmp_trace diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py deleted file mode 100644 index 922308b6b18..00000000000 --- a/python/paddle/fluid/imperative/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -from . import base -from .base import * - -from . import layers -from .layers import * - -__all__ = [] -__all__ += layers.__all__ -__all__ += base.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py deleted file mode 100644 index 15d38ddb56c..00000000000 --- a/python/paddle/fluid/imperative/base.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import contextlib
-import numpy as np
-
-from paddle.fluid import core
-from paddle.fluid import framework
-
-__all__ = ['enabled', 'guard', 'to_variable']
-
-
-def enabled():
-    return framework._in_imperative_mode()
-
-
-@contextlib.contextmanager
-def guard():
-    train = framework.Program()
-    startup = framework.Program()
-    tracer = core.Tracer(train.current_block().desc)
-    with framework.program_guard(train, startup):
-        with framework.unique_name.guard():
-            with framework._imperative_guard(tracer):
-                yield
-
-
-def to_variable(value, block=None):
-    if isinstance(value, np.ndarray):
-        if not block:
-            block = framework.default_main_program().current_block()
-        py_var = framework.Variable(
-            block,
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            name=None,
-            shape=value.shape,
-            dtype=value.dtype)
-        scope = framework._imperative_tracer().get_scope(block.desc)
-        var = scope.var(py_var.name)
-        tensor = var.get_tensor()
-        tensor.set(value, core.CPUPlace())
-        return py_var
-    elif isinstance(value, framework.Variable):
-        return value
-    else:
-        raise ValueError("Unsupported type %s" % type(value))
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
deleted file mode 100644
index 1a28f7f4ae3..00000000000
--- a/python/paddle/fluid/imperative/layers.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import contextlib
-import sys
-import numpy as np
-
-from paddle.fluid import core
-from paddle.fluid import framework
-from paddle.fluid.imperative import base
-
-__all__ = ['PyLayer']
-
-
-class PyLayer(core.Layer):
-    def __init__(self):
-        pass
-
-    def __call__(self, inputs):
-        # TODO(panyx0718): Support declarative mode as well.
-        assert base.enabled()
-        if not isinstance(inputs, list) and not isinstance(inputs, tuple):
-            inputs = [inputs]
-
-        var_inputs = []
-        for x in inputs:
-            py_var = base.to_variable(x)
-            var_inputs.append(py_var)
-        outputs = self.forward(var_inputs)
-        return outputs
-
-    def forward(self, inputs):
-        return []
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index 74b4a977db6..dc317de9abb 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -17,13 +17,10 @@ from __future__ import print_function
 import copy
 import itertools
 import six
-import sys
-import numpy as np
 
 from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
 from . import unique_name
 from paddle.fluid.initializer import Constant, Xavier
-from paddle.fluid.imperative import base
 from .param_attr import ParamAttr, WeightNormParamAttr
 from . import core
 from six.moves import zip
@@ -49,21 +46,23 @@ class LayerHelper(object):
     def startup_program(self):
         return default_startup_program()
 
-    def to_variable(self, x):
-        return base.to_variable(x, self.main_program.current_block())
-
     def append_op(self, *args, **kwargs):
         return self.main_program.current_block().append_op(*args, **kwargs)
 
     def multiple_input(self, input_param_name='input'):
         inputs = self.kwargs.get(input_param_name, [])
-        ret = []
-        if isinstance(inputs, list) or isinstance(inputs, tuple):
-            for inp in inputs:
-                ret.append(self.to_variable(inp))
+        type_error = TypeError(
+            "Input of {0} layer should be Variable or sequence of Variable".
+            format(self.layer_type))
+        if isinstance(inputs, Variable):
+            inputs = [inputs]
+        elif not isinstance(inputs, list) and not isinstance(inputs, tuple):
+            raise type_error
         else:
-            ret.append(self.to_variable(inputs))
-        return ret
+            for each in inputs:
+                if not isinstance(each, Variable):
+                    raise type_error
+        return inputs
 
     def input(self, input_param_name='input'):
         inputs = self.multiple_input(input_param_name)
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index fac7538a6ad..4833212d311 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6623,8 +6623,7 @@ def relu(x, name=None):
     helper = LayerHelper('relu', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out})
+    helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
     return out
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py
deleted file mode 100644
index b5b6305155d..00000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import sys
-import numpy as np
-
-import paddle.fluid as fluid
-from paddle.fluid import core
-
-
-class MyLayer(fluid.imperative.PyLayer):
-    def __init__(self):
-        super(MyLayer, self).__init__()
-
-    def forward(self, inputs):
-        x = fluid.layers.relu(inputs[0])
-        self._x_for_debug = x
-        return [fluid.layers.elementwise_mul(x, x)]
-
-
-class TestImperative(unittest.TestCase):
-    def test_layer(self):
-        with fluid.imperative.guard():
-            cl = core.Layer()
-            cl.forward([])
-            l = fluid.imperative.PyLayer()
-            l.forward([])
-
-    def test_layer_in_out(self):
-        with fluid.imperative.guard():
-            l = MyLayer()
-            x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0]
-            self.assertIsNotNone(x)
-            sys.stderr.write("%s output: %s\n" % (x, x._numpy()))
-            x._backward()
-            sys.stderr.write("grad %s\n" % l._x_for_debug._gradient())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 0eb69cdb5c7..5aee26b6383 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -101,7 +101,6 @@ packages=['paddle',
           'paddle.dataset',
           'paddle.reader',
           'paddle.fluid',
-          'paddle.fluid.imperative',
          'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.layers',
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index 7e61dde0a44..5c5266f904f 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -27,8 +27,6 @@ import pydoc
 
 member_dict = collections.OrderedDict()
 
-experimental_namespace = {"paddle.fluid.imperative"}
-
 
 def visit_member(parent_name, member):
     cur_name = ".".join([parent_name, member.__name__])
@@ -53,8 +51,6 @@ def visit_member(parent_name, member):
 
 
 def visit_all_module(mod):
-    if (mod.__name__ in experimental_namespace):
-        return
     for member_name in (
             name
             for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))
-- 
GitLab


From 2c6159a151d573ca697e2dfd591720cc854b4b9b Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Fri, 7 Dec 2018 13:59:36 +0000
Subject: [PATCH 0081/2367] fix unittest
 fix cmake
 test=develop

---
 paddle/fluid/framework/CMakeLists.txt         |  4 +-
 .../test_eager_deletion_dynamic_rnn_base.py   | 86 +++++++++++++++++++
 .../unittests/test_eager_deletion_gru_net.py  |  2 +-
 .../unittests/test_eager_deletion_lstm_net.py | 67 +---------------
 4 files changed, 92 insertions(+), 67 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index f2361c5ceaa..b236eef3cee 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -171,9 +171,9 @@ if(WITH_DISTRIBUTE)
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
   if(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper garbage_collector)
   else(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper garbage_collector)
   endif(NOT WIN32)
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
new file mode 100644
index 00000000000..e91cfe0b45a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
+os.environ['CPU_NUM'] = '2'
+
+import six
+import unittest
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
+    if use_cuda and not core.is_compiled_with_cuda():
+        print('Skip use_cuda=True because Paddle is not compiled with cuda')
+        return
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    train_reader = paddle.batch(
+        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    cost = network(data, label, len(word_dict))
+    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
+    optimizer.minimize(cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+    reader = feeder.decorate_reader(
+        train_reader, multi_devices=use_parallel_executor)
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if use_parallel_executor:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=cost.name)
+        fetch_list = [cost.name]
+    else:
+        train_exe = exe
+        fetch_list = [cost]
+
+    for pass_id in six.moves.xrange(pass_num):
+        batch_id = 0
+        for data in reader():
+            train_exe.run(feed=data,
+                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
+            batch_id += 1
+            if batch_id > 16:
+                break
+
+
+class TestBase(unittest.TestCase):
+    def setUp(self):
+        self.net = None
+
+    def test_network(self):
+        if self.net is None:
+            return
+
+        for use_cuda in [True, False]:
+            for use_parallel_executor in [False, True]:
+                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
+                      format(self.net.__name__, use_cuda,
+                             use_parallel_executor))
+                with fluid.program_guard(fluid.Program(), fluid.Program()):
+                    with fluid.scope_guard(core.Scope()):
+                        train(self.net, use_cuda, use_parallel_executor)
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
index 1ec174544cb..5ed3d9fdf3b 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import unittest
-from test_eager_deletion_lstm_net import TestBase
+from test_eager_deletion_dynamic_rnn_base import TestBase
 import paddle.fluid as fluid
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
index 431765bff2d..8462c06aa56 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
@@ -12,60 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
-os.environ['CPU_NUM'] = '2'
-
-import six
-import unittest
-
-import paddle
-import paddle.fluid.core as core
+from test_eager_deletion_dynamic_rnn_base import TestBase
 import paddle.fluid as fluid
-
-
-def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
-    if use_cuda and not core.is_compiled_with_cuda():
-        print('Skip use_cuda=True because Paddle is not compiled with cuda')
-        return
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    train_reader = paddle.batch(
-        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
-
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-    cost = network(data, label, len(word_dict))
-    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
-    optimizer.minimize(cost)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-    reader = feeder.decorate_reader(
-        train_reader, multi_devices=use_parallel_executor)
-
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-
-    if use_parallel_executor:
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda, loss_name=cost.name)
-        fetch_list = [cost.name]
-    else:
-        train_exe = exe
-        fetch_list = [cost]
-
-    for pass_id in six.moves.xrange(pass_num):
-        batch_id = 0
-        for data in reader():
-            train_exe.run(feed=data,
-                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
-            batch_id += 1
-            if batch_id > 16:
-                break
+import unittest
 
 
 def lstm_net(data,
@@ -92,20 +41,10 @@ def lstm_net(data,
     return avg_cost
 
 
-class TestBase(unittest.TestCase):
+class LSTMTest(TestBase):
     def setUp(self):
         self.net = lstm_net
 
-    def test_network(self):
-        for use_cuda in [True, False]:
-            for use_parallel_executor in [False, True]:
-                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
-                      format(self.net.__name__, use_cuda,
-                             use_parallel_executor))
-                with fluid.program_guard(fluid.Program(), fluid.Program()):
-                    with fluid.scope_guard(core.Scope()):
-                        train(self.net, use_cuda, use_parallel_executor)
-
 
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From bfde5e10ce7a8ee1d95978033015eb6a5103d52c Mon Sep 17 00:00:00 2001
From: baojun-nervana
Date: Fri, 7 Dec 2018 14:01:11 -0800
Subject: [PATCH 0082/2367] Move ngraph compile control to cmake
 test=develop

---
 paddle/fluid/framework/CMakeLists.txt     | 24 ++++++++++++++---------
 paddle/fluid/framework/executor.cc        |  5 ++++-
 paddle/fluid/framework/ngraph_bridge.cc   |  2 --
 paddle/fluid/framework/ngraph_bridge.h    |  3 ---
 paddle/fluid/framework/ngraph_operator.cc |  2 --
 paddle/fluid/framework/ngraph_operator.h  |  3 ---
 6 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index e4c471d86b7..ce429fefa77 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -129,11 +129,13 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
 
-if(NOT WIN32)
-cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-  shape_inference data_transform lod_tensor profiler)
-endif(NOT WIN32)
+if(WITH_NGRAPH)
+  if(NOT WIN32)
+    cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
+    cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+            shape_inference data_transform lod_tensor profiler ngraph)
+  endif(NOT WIN32)
+endif(WITH_NGRAPH)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@@ -169,11 +171,15 @@ if(WITH_DISTRIBUTE)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  if(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
-  else(NOT WIN32)
+  if(WITH_NGRAPH)
+    if(NOT WIN32)
+      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper)
+    else(NOT WIN32)
+      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+    endif(NOT WIN32)
+  else(WITH_NGRAPH)
     cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-  endif(NOT WIN32)
+  endif(WITH_NGRAPH)
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
 
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 73cec21e20f..09f08e1f208 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
@@ -26,6 +25,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef PADDLE_WITH_NGRAPH
+#include "paddle/fluid/framework/ngraph_operator.h"
+#endif
+
 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
 DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
index e22c2903771..907c95bddda 100644
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_NGRAPH
 #include
 #include
 #include
@@ -118,4 +117,3 @@ void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
 
 }  // namespace framework
 }  // namespace paddle
-#endif
diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h
index 9ed6b951094..5ad7b8daeb6 100644
--- a/paddle/fluid/framework/ngraph_bridge.h
+++ b/paddle/fluid/framework/ngraph_bridge.h
@@ -14,8 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_NGRAPH
-
 #include
 #include
 #include
@@ -53,4 +51,3 @@ class NgraphBridge {
 
 }  // namespace framework
 }  // namespace paddle
-#endif
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index 3fea753f065..e3cc6ee9949 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_NGRAPH
 #include
 #include
@@ -548,4 +547,3 @@ void NgraphOperator::Run(const Scope& scope,
 }  // NgraphOperator::RunImpl
 }  // namespace framework
 }  // namespace paddle
-#endif
diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h
index 3ca023e1111..85d015bd463 100644
--- a/paddle/fluid/framework/ngraph_operator.h
+++ b/paddle/fluid/framework/ngraph_operator.h
@@ -14,8 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_NGRAPH
-
 #include
 #include
 #include
@@ -64,4 +62,3 @@ class FusedOperator : public OperatorBase {
 };
 }  // namespace framework
 }  // namespace paddle
-#endif
-- 
GitLab


From 22ac2133e4d302d9d843c9d1c2b2d554d83c1e2e Mon Sep 17 00:00:00 2001
From: baojun-nervana
Date: Fri, 7 Dec 2018 14:22:15 -0800
Subject: [PATCH 0083/2367] Rename class
 test=develop

---
 paddle/fluid/framework/executor.cc        |  8 ++--
 paddle/fluid/framework/ngraph_operator.cc | 57 +++++++++++------------
 paddle/fluid/framework/ngraph_operator.h  |  6 +--
 3 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 09f08e1f208..e97cf44c75c 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -91,11 +91,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
 static void EnableFusedOp(ExecutorPrepareContext* ctx) {
 #ifdef PADDLE_WITH_NGRAPH
   VLOG(3) << "use_ngraph=True";
-  auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
+  auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_);
   for (auto& interval : intervals) {
-    auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
-                                       interval.at(0), interval.at(1));
-    *interval[0] = std::unique_ptr<OperatorBase>(fused_op);
+    auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0),
+                                     interval.at(1));
+    *interval[0] = std::unique_ptr<OperatorBase>(ng_op);
   }
   for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
     ctx->ops_.erase(it->at(0) + 1, it->at(1));
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index e3cc6ee9949..253de4c6116 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -57,16 +57,16 @@ typedef enum { /* nGraph support state on ops          */
 } op_state;
 
 // perform graph build through bridge and execute computation
-class NgraphOperator {
+class NgraphEngine {
  public:
-  explicit NgraphOperator(const Scope& scope, const platform::Place& place,
-                          const std::vector<std::shared_ptr<OperatorBase>>& ops,
-                          const std::unordered_map<
-                              std::string, ngraph::element::Type>& var_type_map,
-                          const std::unordered_set<std::string>& persist,
-                          const std::unordered_set<std::string>& fetches,
-                          const std::unordered_set<std::string>& post_op_inputs,
-                          op_state ng_op_state)
+  explicit NgraphEngine(const Scope& scope, const platform::Place& place,
+                        const std::vector<std::shared_ptr<OperatorBase>>& ops,
+                        const std::unordered_map<
+                            std::string, ngraph::element::Type>& var_type_map,
+                        const std::unordered_set<std::string>& persist,
+                        const std::unordered_set<std::string>& fetches,
+                        const std::unordered_set<std::string>& post_op_inputs,
+                        op_state ng_op_state)
       : scope_(scope),
         place_(place),
         fused_ops_(ops),
@@ -131,7 +131,7 @@ class NgraphOperator {
 };
 
 std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-FusedOperator::FusedOpIntervals(
+NgraphOperator::NgraphOpIntervals(
     std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
  std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
       intervals;
@@ -184,7 +184,7 @@ FusedOperator::FusedOpIntervals(
   return intervals;
 }
 
-FusedOperator::FusedOperator(
+NgraphOperator::NgraphOperator(
     const ProgramDesc& prog, size_t block_id,
     std::vector<std::unique_ptr<OperatorBase>>::iterator start,
    std::vector<std::unique_ptr<OperatorBase>>::iterator end,
@@ -214,7 +214,7 @@ FusedOperator::FusedOperator(
   Process();
 }
 
-void FusedOperator::Process() {
+void NgraphOperator::Process() {
   auto& bdesc = pdesc_.Block(block_);
   for (auto& var : bdesc.AllVars()) {
     if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
@@ -250,8 +250,8 @@ void FusedOperator::Process() {
   }
 }
 
-void FusedOperator::RunImpl(const Scope& scope,
-                            const platform::Place& place) const {
+void NgraphOperator::RunImpl(const Scope& scope,
+                             const platform::Place& place) const {
   op_state ng_op_state = PARTIAL_TEST;
   auto& bdesc = pdesc_.Block(block_);
   for (auto* op : bdesc.AllOps()) {
@@ -265,19 +265,19 @@ void FusedOperator::RunImpl(const Scope& scope,
     ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
   }
 
-  NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_,
-                           persistables_, fetches_, post_op_inputs_,
-                           ng_op_state);
-  ngraph_op.Run(scope, place);
+  NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_,
+                             persistables_, fetches_, post_op_inputs_,
+                             ng_op_state);
+  ngraph_engine.Run(scope, place);
 }
 
 std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-    NgraphOperator::func_cache_ = {};
+    NgraphEngine::func_cache_ = {};
 
-std::shared_ptr<ngraph::runtime::Backend> NgraphOperator::backend_ =
+std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
     ngraph::runtime::Backend::create("CPU");
 
-void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
+void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
   op->RuntimeInferShape(scope_, place_);
   for (auto& var_name_item : op->Inputs()) {
     for (auto& var_name : var_name_item.second) {
@@ -300,7 +300,7 @@ void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
   }
 }
 
-void NgraphOperator::BuildNgNodes() {
+void NgraphEngine::BuildNgNodes() {
   for (auto& var_name : var_out_) {
     if (var_node_map_->find(var_name) == var_node_map_->end()) {
       auto* var = scope_.FindVar(var_name);
@@ -322,7 +322,7 @@ void NgraphEngine::BuildNgNodes() {
   }
 }
 
-void NgraphOperator::BuildNgIO() {
+void NgraphEngine::BuildNgIO() {
   std::unordered_set<std::string> inputs;
   std::unordered_set<std::string> outputs;
 
@@ -394,7 +394,7 @@ void NgraphEngine::BuildNgIO() {
   }
 }
 
-void NgraphOperator::BuildNgFunction() {
+void NgraphEngine::BuildNgFunction() {
   BuildNgNodes();
   ngraph_function_ = nullptr;
   ngraph::NodeVector func_outputs;
@@ -415,7 +415,7 @@ void NgraphEngine::BuildNgFunction() {
       std::make_shared<ngraph::Function>(func_outputs, func_inputs);
 }
 
-std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
+std::shared_ptr<std::string> NgraphEngine::GetCacheKey() {
   auto cache_key = std::make_shared<std::string>("");
   *cache_key += std::to_string(fused_ops_.size());
   for (auto& op : fused_ops_) {
@@ -443,7 +443,7 @@ std::shared_ptr<std::string> NgraphEngine::GetCacheKey() {
   return cache_key;
 }
 
-void NgraphOperator::GetNgFunction() {
+void NgraphEngine::GetNgFunction() {
   bool cache_on = true;
   if (cache_on) {
     std::string cache_key_val = *GetCacheKey();
@@ -458,8 +458,7 @@ void NgraphEngine::GetNgFunction() {
   }
 }
 
-void NgraphOperator::Run(const Scope& scope,
-                         const platform::Place& place) const {
+void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
   std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
   std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
 
@@ -544,6 +543,6 @@ void NgraphEngine::Run(const Scope& scope,
   }
 
   backend_->call(ngraph_function_, t_out, t_in);
-}  // NgraphOperator::RunImpl
+}  // NgraphEngine::RunImpl
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h
index 85d015bd463..ede80f44bea 100644
--- a/paddle/fluid/framework/ngraph_operator.h
+++ b/paddle/fluid/framework/ngraph_operator.h
@@ -32,14 +32,14 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class FusedOperator : public OperatorBase {
+class NgraphOperator : public OperatorBase {
  public:
   static std::vector<
       std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-  FusedOpIntervals(
+  NgraphOpIntervals(
      std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
 
-  explicit FusedOperator(
+  explicit NgraphOperator(
       const ProgramDesc& prog, size_t block_id,
       std::vector<std::unique_ptr<OperatorBase>>::iterator start,
       std::vector<std::unique_ptr<OperatorBase>>::iterator end,
-- 
GitLab


From fddbd87c0a68861dc28ed691a2044c86a08af6fa Mon Sep 17 00:00:00 2001
From: baojun-nervana
Date: Fri, 7 Dec 2018 14:25:04 -0800
Subject: [PATCH 0084/2367] Rename argument
 test=develop

---
 paddle/fluid/framework/ngraph_bridge.cc | 27 +++++++++++++------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
index 907c95bddda..a5acfd70449 100644
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -26,14 +26,15 @@ namespace paddle {
 namespace framework {
 
 static std::shared_ptr<ngraph::Node> GetNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
     const VariableNameMap& var_map,
     std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  auto& var_names = var_map.at(prm);
+  auto& var_names = var_map.at(name);
   PADDLE_ENFORCE_EQ(var_names.size(), 1,
-                    "op %s prm %s expects one associated var", op->Type(), prm);
+                    "op %s name %s expects one associated var", op->Type(),
+                    name);
   if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
     return (*ngb_node_map)[var_names[0]];
   } else {
@@ -42,42 +43,42 @@ static std::shared_ptr<ngraph::Node> GetNode(
 }
 
 static std::shared_ptr<ngraph::Node> GetInputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
     std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  return GetNode(op, prm, op->Inputs(), ngb_node_map);
+  return GetNode(op, name, op->Inputs(), ngb_node_map);
 }
 
 static std::shared_ptr<ngraph::Node> GetOutputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  return GetNode(op, prm, op->Outputs(), ngb_node_map);
+  return GetNode(op, name, op->Outputs(), ngb_node_map);
 }
 
 static void SetOutputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
     std::shared_ptr<ngraph::Node> node,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  auto& var_names = op->Outputs().at(prm);
+  auto& var_names = op->Outputs().at(name);
   if (var_names.size() == 1) {
     (*ngb_node_map)[var_names[0]] = node;
   } else if (var_names.size() == 0) {
     (*ngb_node_map)[""] = node;
   } else {
-    PADDLE_THROW("prm %s has more than 1 var_names.", prm);
+    PADDLE_THROW("name %s has more than 1 var_names.", name);
   }
 }
 
 static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
-                      const std::string prm) {
+                      const std::string name) {
   auto& outputs = op->Outputs();
-  if (outputs.find(prm) == outputs.end()) return false;
-  return outputs.at(prm).size() > 0;
+  if (outputs.find(name) == outputs.end()) return false;
+  return outputs.at(name).size() > 0;
 }
 
 template <typename T>
-- 
GitLab


From 943ad4781f5c96573e9615668c1cc73dcb378356 Mon Sep 17 00:00:00 2001
From: bingyanghuang <33643817+bingyanghuang@users.noreply.github.com>
Date: Sat, 8 Dec 2018 10:54:33 +0800
Subject: [PATCH 0085/2367] One possible solution to add flexibility for
 mkldnn placement pass (#14768)

* Choose to turn on use_mkldnn attribute v1

* Fix mkldnn_op empty bug

* format change test=develop

* fix ci test=develop

* fix ci test and add test in dam test=develop

* add example to dam compare test test=develop

* review changes test=develop

---
 paddle/fluid/framework/ir/mkldnn_placement_pass.cc | 14 +++++++++++---
 paddle/fluid/inference/analysis/argument.h         |  4 ++++
 paddle/fluid/inference/analysis/ir_pass_manager.cc |  5 +++++
 paddle/fluid/inference/api/analysis_config.cc      |  8 ++++++++
 paddle/fluid/inference/api/analysis_predictor.cc   |  4 ++++
 .../fluid/inference/api/paddle_analysis_config.h   |  5 +++++
 .../inference/tests/api/analyzer_dam_tester.cc     |  4 ++++
 7 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
index 1cf1315d3d3..9a9314161b0 100644
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+#include <string>
 
 namespace paddle {
 namespace framework {
@@ -21,9 +22,16 @@ namespace ir {
 std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
-      n->Op()->SetAttr("use_mkldnn", true);
+      if (op_types_list.empty()) {
+        n->Op()->SetAttr("use_mkldnn", true);
+      } else if (std::find(op_types_list.begin(), op_types_list.end(),
+                           n->Name()) != op_types_list.end()) {
+        n->Op()->SetAttr("use_mkldnn", true);
+      }
     }
   }
   return graph;
@@ -33,5 +41,5 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
 }  // namespace framework
 }  // namespace paddle
 
-REGISTER_PASS(mkldnn_placement_pass,
-              paddle::framework::ir::MKLDNNPlacementPass);
+REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass)
+    .RequirePassAttr("mkldnn_enabled_op_types");
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 53cc7039f20..83d411eecf6 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -116,6 +116,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
                       std::vector<std::string>);
 
+  // Pass a set of op types to enable its mkldnn kernel
+  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
+                      std::unordered_set<std::string>);
+
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
   DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index fce5e1cac92..51bca8039d4 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -63,6 +63,11 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
       pass_num++;
     }
+    if (pass_name == "mkldnn_placement_pass") {
+      pass->Set("mkldnn_enabled_op_types",
+                new std::unordered_set<std::string>(
+                    argument->mkldnn_enabled_op_types()));
+    }
 
     if (pass_name == "tensorrt_subgraph_pass") {
       PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 384d1dc27d6..dcefdd92f51 100644
---
a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -49,6 +49,10 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; + // For mkldnn + use_mkldnn_ = other.use_mkldnn_; + mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; + use_feed_fetch_ops = other.use_feed_fetch_ops; use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; @@ -77,6 +81,10 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; + // For mkldnn + use_mkldnn_ = other.use_mkldnn_; + mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; + use_feed_fetch_ops = other.use_feed_fetch_ops; use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 84f7eca0570..be51e7fc1f0 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -327,6 +327,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); } + if (config_.use_mkldnn_) { + argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); + } + auto passes = config_.pass_builder()->AllPasses(); if (!config_.enable_ir_optim) passes.clear(); argument_.SetIrAnalysisPasses(passes); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index a08e3d027e0..f05b9832da5 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -16,6 +16,7 @@ #include #include #include +#include #include // Here we include some header files with relative paths, for that in deploy, @@ -53,6 +54,9 @@ struct AnalysisConfig : public NativeConfig { void EnableMKLDNN(); bool use_mkldnn() const { return use_mkldnn_; } + void SetMKLDNNOp(std::unordered_set op_list) { + mkldnn_enabled_op_types_ = op_list; + } // Specify the memory buffer of program and parameter void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, @@ -64,6 +68,7 @@ struct AnalysisConfig : public NativeConfig { protected: bool use_tensorrt_{false}; bool use_mkldnn_{false}; + std::unordered_set mkldnn_enabled_op_types_; int tensorrt_workspace_size_; int tensorrt_max_batchsize_; std::unique_ptr pass_builder_; diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index e8abcfce05f..227e2ff4587 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -194,6 +194,8 @@ void profile(bool use_mkldnn = false) { if (use_mkldnn) { cfg.EnableMKLDNN(); + std::unordered_set op_list = {"conv3d"}; + cfg.SetMKLDNNOp(op_list); } std::vector outputs; @@ -236,6 +238,8 @@ void compare(bool use_mkldnn = false) { SetConfig(&cfg); if (use_mkldnn) { cfg.EnableMKLDNN(); + std::unordered_set op_list = {"conv3d"}; + cfg.SetMKLDNNOp(op_list); } std::vector> input_slots_all; -- GitLab From bb2e7f0bbed1cfcf47b5b8e90bc9e35b46c13b50 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Sat, 8 Dec 
2018 12:31:33 +0800 Subject: [PATCH 0086/2367] add scope in prefetch --- paddle/fluid/operators/distributed/parameter_prefetch.cc | 8 ++++---- paddle/fluid/operators/distributed/parameter_prefetch.h | 3 ++- paddle/fluid/operators/nce_op.h | 9 +++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 67b56bd2180..f6a2d5bbe52 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -104,7 +104,7 @@ static void MergeMultipleVarsIntoOneBySection( const std::vector>& splited_ids, const framework::ExecutionContext& context, const framework::Scope& actual_scope, framework::Scope* scope, - platform::DeviceContext* actual_ctx, ) { + platform::DeviceContext* actual_ctx) { PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); auto cpu_place = platform::CPUPlace(); @@ -175,7 +175,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - auto& local_scope = scope.NewScope(); + auto& local_scope = context.scope().NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -192,7 +192,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, out_var_names.push_back(out_name + "@" + epmap[i]); } - auto& id_tensor = local_scope.FindVar(id_name)->Get(); + auto& id_tensor = scope.FindVar(id_name)->Get(); std::vector ids_vector; if (platform::is_cpu_place(id_tensor.place())) { auto* id_data = id_tensor.data(); @@ -248,7 +248,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, context, scope, &local_scope, &actual_ctx); - scope.DeleteScope(&local_scope); + context.scope().DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 53b0fbfb51f..53482c4c40e 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -27,7 +27,8 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context); + const framework::ExecutionContext& context, + const framework::Scope& scope); }; // namespace distributed }; // namespace operators diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 9789e303889..2e51c67401f 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -180,7 +180,7 @@ class NCEKernel : public framework::OpKernel { labels.size() * sizeof(int64_t)); local_scope.Var("Weight@Local") - ->GetMutable() + ->GetMutable() ->mutable_data(context.GetPlace()); #ifdef PADDLE_WITH_DISTRIBUTE @@ -194,7 +194,7 @@ class NCEKernel : public framework::OpKernel { #endif auto weight_mat = EigenMatrix::From( - (local_scope.Var("Weight@Local")->Get())); + (local_scope.Var("Weight@Local")->Get())); for (int64_t i = 0; i < sample_labels->numel(); ++i) { std::vector::iterator it = std::find(labels.begin(), labels.end(), sample_labels_data[i]); 
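// Aside: both hunks above are about scope ownership -- the temporary scope
// must be created from, and deleted through, the same parent (here
// context.scope()), and the delete should be guarded. A minimal sketch of
// that parent-owned child pattern with toy types (not the framework Scope):
#include <iostream>
#include <memory>
#include <vector>

class Scope {
 public:
  Scope* NewScope() {
    kids_.emplace_back(new Scope());
    return kids_.back().get();
  }
  bool HasKid(const Scope* s) const {
    for (const auto& k : kids_) {
      if (k.get() == s) return true;
    }
    return false;
  }
  void DeleteScope(const Scope* s) {
    for (auto it = kids_.begin(); it != kids_.end(); ++it) {
      if (it->get() == s) {
        kids_.erase(it);
        return;
      }
    }
  }

 private:
  std::vector<std::unique_ptr<Scope>> kids_;
};

int main() {
  Scope parent;
  Scope* local = parent.NewScope();      // pair New/Delete on one parent
  bool owned = parent.HasKid(local);
  if (owned) parent.DeleteScope(local);  // guarded, as nce_op.h now does
  std::cout << "was owned: " << owned << "\n";  // was owned: 1
}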
@@ -208,8 +208,9 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } - context.scope().DeleteScope(&local_scope); - + if (context.scope().HasKid(&local_scope)) { + context.scope().DeleteScope(&local_scope); + } } else { auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); -- GitLab From 748549b2e31c510d462ebe1a5421e139269c1a3c Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 9 Dec 2018 12:31:54 +0800 Subject: [PATCH 0087/2367] Revert "Merge pull request #14798 from PaddlePaddle/revert-14786-revert-14782-revert-14398-imperative" This reverts commit b1d3a1c8b41fdb4cfcb58ec2d4fb938b09dac057, reversing changes made to f1fb64b17fb0290e7e1f110069de19b0ea0d0474. --- paddle/fluid/CMakeLists.txt | 1 + paddle/fluid/framework/feed_fetch_method.cc | 9 + paddle/fluid/framework/feed_fetch_method.h | 2 + paddle/fluid/framework/ir/graph.cc | 5 +- paddle/fluid/imperative/CMakeLists.txt | 3 + paddle/fluid/imperative/engine.cc | 53 +++++ paddle/fluid/imperative/engine.h | 39 ++++ paddle/fluid/imperative/layer.cc | 221 ++++++++++++++++++ paddle/fluid/imperative/layer.h | 102 ++++++++ paddle/fluid/imperative/tracer.cc | 19 ++ paddle/fluid/imperative/tracer.h | 128 ++++++++++ paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/imperative.cc | 36 +++ paddle/fluid/pybind/imperative.h | 53 +++++ paddle/fluid/pybind/pybind.cc | 39 ++++ python/paddle/fluid/__init__.py | 2 + python/paddle/fluid/framework.py | 54 ++++- python/paddle/fluid/imperative/__init__.py | 25 ++ python/paddle/fluid/imperative/base.py | 56 +++++ python/paddle/fluid/imperative/layers.py | 44 ++++ python/paddle/fluid/layer_helper.py | 23 +- python/paddle/fluid/layers/nn.py | 3 +- .../fluid/tests/unittests/test_imperative.py | 52 +++++ python/setup.py.in | 1 + tools/print_signatures.py | 4 + 25 files changed, 960 insertions(+), 19 deletions(-) create mode 100644 paddle/fluid/imperative/CMakeLists.txt create mode 100644 paddle/fluid/imperative/engine.cc create mode 100644 paddle/fluid/imperative/engine.h create mode 100644 paddle/fluid/imperative/layer.cc create mode 100644 paddle/fluid/imperative/layer.h create mode 100644 paddle/fluid/imperative/tracer.cc create mode 100644 paddle/fluid/imperative/tracer.h create mode 100644 paddle/fluid/pybind/imperative.cc create mode 100644 paddle/fluid/pybind/imperative.h create mode 100644 python/paddle/fluid/imperative/__init__.py create mode 100644 python/paddle/fluid/imperative/base.py create mode 100644 python/paddle/fluid/imperative/layers.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative.py diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 6b526f0103a..595454e90b9 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) +add_subdirectory(imperative) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3e9353f5cf6..6338be75a4b 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,7 +16,9 @@ limitations under the License. 
*/ #include #include #include "glog/logging.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { @@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, return tensor; } +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) { + Variable* var = scope.FindVar(var_name); + PADDLE_ENFORCE(var, "%s no in scope", var_name); + PADDLE_ENFORCE(var->IsType(), "Only support lod tensor now."); + return *var->GetMutable(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 7f504bfd232..031f8e01aa6 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index fc91564bbae..8679118fe28 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) { switch (role_id) { case _INT(OpRole::kForward): if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator before forward operator %s." - << op->Type(); + LOG(ERROR) << "Cannot add backward operator before forward operator " + << op->Type(); } break; case _INT(OpRole::kBackward): diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt new file mode 100644 index 00000000000..373d292b443 --- /dev/null +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -0,0 +1,3 @@ +cc_library(layer SRCS layer.cc DEPS proto_desc operator) +cc_library(tracer SRCS tracer.cc DEPS proto_desc) +cc_library(engine SRCS engine.cc) diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc new file mode 100644 index 00000000000..de7ab0e5918 --- /dev/null +++ b/paddle/fluid/imperative/engine.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
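// Aside: GetVariableTensor above is a checked accessor -- find the variable,
// enforce the payload type, return a mutable reference. A minimal sketch of
// the same shape with a toy Variable (illustrative, not the framework types):
#include <cassert>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Variable {
  bool is_tensor = false;
  std::vector<float> tensor;  // toy stand-in for LoDTensor
};

static std::vector<float>& GetVariableTensor(
    std::map<std::string, Variable>* scope, const std::string& name) {
  auto it = scope->find(name);
  assert(it != scope->end() && "variable not in scope");
  assert(it->second.is_tensor && "only tensor variables are supported");
  return it->second.tensor;
}

int main() {
  std::map<std::string, Variable> scope;
  scope["x"] = {true, {1.f, 2.f}};
  GetVariableTensor(&scope, "x").push_back(3.f);  // mutate in place
  std::cout << scope["x"].tensor.size() << "\n";  // 3
}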
+ +#include "paddle/fluid/imperative/engine.h" + +#include // NOLINT +#include + +#include "glog/logging.h" + +namespace paddle { +namespace imperative { + +static std::once_flag init_engine; +static Engine* engine; + +class DummyEngine : public Engine { + public: + void Enqueue(Runnable* runnable) override { + queued_runnables_.push_back(runnable); + } + + size_t Size() const override { return queued_runnables_.size(); } + + void Sync() override { + for (Runnable* l : queued_runnables_) { + LOG(INFO) << "running " << reinterpret_cast(l); + } + queued_runnables_.clear(); + } + + private: + std::vector queued_runnables_; +}; + +Engine* GetEngine() { + std::call_once(init_engine, []() { engine = new DummyEngine(); }); + return engine; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h new file mode 100644 index 00000000000..a1dfa5bda38 --- /dev/null +++ b/paddle/fluid/imperative/engine.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace imperative { + +struct Runnable {}; + +class Engine { + public: + virtual ~Engine() {} + + virtual void Enqueue(Runnable* runnable) = 0; + + virtual size_t Size() const = 0; + + virtual void Sync() = 0; +}; + +Engine* GetEngine(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc new file mode 100644 index 00000000000..61250376807 --- /dev/null +++ b/paddle/fluid/imperative/layer.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
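// Aside: GetEngine above is the standard std::call_once lazy singleton --
// thread-safe one-time construction of a process-wide instance. A minimal
// sketch (illustrative Engine, same initialization pattern):
#include <iostream>
#include <mutex>

class Engine {};

static std::once_flag init_engine;
static Engine* engine = nullptr;

Engine* GetEngine() {
  // Runs the lambda exactly once even if many threads race here.
  std::call_once(init_engine, []() { engine = new Engine(); });
  return engine;
}

int main() {
  std::cout << (GetEngine() == GetEngine()) << "\n";  // 1: same instance
}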
+ +#include "paddle/fluid/imperative/layer.h" +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace imperative { + +using framework::Variable; + +void AddTo(Variable* src, Variable* dst) { + framework::LoDTensor* dst_tensor = dst->GetMutable(); + framework::LoDTensor* src_tensor = src->GetMutable(); + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", + dst_tensor->numel(), src_tensor->numel()); + float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); + const float* src_data = src_tensor->data(); + for (size_t i = 0; i < src_tensor->numel(); ++i) { + dst_data[i] += src_data[i]; + } +} + +class Autograd { + public: + explicit Autograd(framework::Scope* scope) : scope_(scope) {} + + void RunBackward(VarBase* var) { + PADDLE_ENFORCE(var->pre_op_->op_desc_); + // TODO(panyx0718): Only create for vars that "require_grad" + (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; + + std::deque ready; + ready.push_back(var->pre_op_); + + std::map dep_counts = ComputeDepCounts(var->pre_op_); + + while (!ready.empty()) { + OpBase* ready_op = ready.front(); + ready.pop_front(); + std::vector input_grads = ready_op->ApplyGrad(scope_); + + for (size_t i = 0; i < input_grads.size(); ++i) { + if (!input_grads[i]) continue; + OpBase* pre_op = ready_op->pre_ops_->at(i); + if (!pre_op) continue; + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); + } + } + } + } + + private: + std::map ComputeDepCounts(OpBase* op) { + std::map ret; + + std::deque queue; + queue.push_back(op); + std::unordered_set visited; + visited.insert(op); + while (!queue.empty()) { + OpBase* candidate = queue.front(); + queue.pop_front(); + for (OpBase* pre_op : *(candidate->pre_ops_)) { + if (!pre_op) continue; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; + } + } + + return ret; + } + + framework::Scope* scope_; +}; + +framework::Variable* CreateVariable(const std::string& name, + const framework::DDim& dim, float val, + framework::Scope* scope, + bool random_name = true) { + std::string varname = name; + if (random_name) { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution dist6( + 1, std::numeric_limits::max()); + int id = dist6(rng); + varname = string::Sprintf("%s@%d", varname, id); + } + + VLOG(3) << "creating var " << varname; + framework::Variable* var = scope->Var(varname); + framework::LoDTensor* tensor = var->GetMutable(); + + float* data = tensor->mutable_data(dim, platform::CPUPlace()); + std::fill(data, data + tensor->numel(), val); + return var; +} + +framework::LoDTensor& VarBase::Grad() { + VLOG(3) << "get var grad " << var_desc_->Name(); + return *grads_->GetMutable(); +} + +void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { + VLOG(3) << "apply var grad " << var_desc_->Name() << " " + << grad->Get().data()[0]; + if (!grads_) { + grads_ = + CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), + var_->Get().dims(), 0.0, scope); + } + AddTo(grad, grads_); + VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " + << grads_->Get().data()[0]; +} + +std::vector OpBase::ApplyGrad(framework::Scope* scope) { + VLOG(3) << "op grad " << 
grad_op_desc_->Type(); + + for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { + if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { + // grad op inputs can be forward inputs, so not in grad_to_var. + continue; + } + VLOG(3) << "op grad in var " << grad_invar; + block_->FindRecursiveOrCreateVar(grad_invar); + framework::Variable* var = scope->Var(grad_invar); + const std::string& invar = grad_to_var_->at(grad_invar); + for (VarBase* varbase : *output_vars_) { + // Use the accumulated grads_ by sharing the input with grads_. + if (varbase->var_desc_->Name() == invar) { + var->GetMutable()->ShareDataWith( + varbase->grads_->Get()); + break; + } + } + } + + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + VLOG(3) << "grad outvar " << outvar; + block_->FindRecursiveOrCreateVar(outvar); + framework::Variable* var = scope->Var(outvar); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block_->FindVar(outvar); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + } + grad_op_desc_->InferShape(*block_); + grad_op_desc_->InferVarType(block_); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc_); + + opbase->Run(*scope, platform::CPUPlace()); + + // `ret` matches exactly with `input_vars_` of forward op. + std::vector ret; + for (size_t i = 0; i < input_vars_->size(); ++i) { + bool found = false; + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + Variable* var = scope->FindVar(outvar); + VarBase* origin_var = (*input_vars_)[i]; + std::string orig_var = grad_to_var_->at(outvar); + PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); + VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; + origin_var->ApplyGrad(scope, var); + found = true; + ret.push_back(var); + // TODO(panyx0718): There might be another outvar with the same name. + // In that case, it doesn't matter the first one or the second one is + // used. + break; + } + if (!found) { + ret.push_back(nullptr); + } + } + return ret; +} + +void VarBase::RunBackward(framework::Scope* scope) { + grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), + var_->Get().dims(), 1.0, scope, + false); + if (!pre_op_) return; + Autograd(scope).RunBackward(this); +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h new file mode 100644 index 00000000000..85a71ca83d2 --- /dev/null +++ b/paddle/fluid/imperative/layer.h @@ -0,0 +1,102 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
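// Aside: Autograd::RunBackward above is Kahn-style scheduling on the reversed
// graph -- count how often each producer op is awaited, then run ops whose
// count reaches zero. A minimal sketch on a toy op chain (ints as op ids,
// illustrative only):
#include <deque>
#include <iostream>
#include <map>
#include <vector>

int main() {
  // pre[op] = producer ops whose grads may only run after op's grad.
  std::map<int, std::vector<int>> pre = {{2, {1}}, {1, {0}}, {0, {}}};

  std::map<int, int> dep;  // how many consumers each producer waits on
  for (auto& kv : pre) {
    for (int p : kv.second) dep[p]++;
  }

  std::deque<int> ready = {2};  // start from the op that produced the loss
  while (!ready.empty()) {
    int op = ready.front();
    ready.pop_front();
    std::cout << "run grad of op " << op << "\n";  // prints 2, then 1, then 0
    for (int p : pre[op]) {
      if (--dep[p] == 0) ready.push_back(p);
    }
  }
}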
+ +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace imperative { + +class OpBase; + +class VarBase { + public: + VarBase() + : pre_op_(nullptr), + pre_op_out_idx_(-1), + var_desc_(nullptr), + var_(nullptr), + grads_(nullptr) {} + + virtual ~VarBase() {} + + void ApplyGrad(framework::Scope* scope, framework::Variable* grad); + + void RunBackward(framework::Scope* scope); + + framework::LoDTensor& Grad(); + + OpBase* pre_op_; + int pre_op_out_idx_; + + framework::VarDesc* var_desc_; + framework::Variable* var_; + framework::Variable* grads_; +}; + +class OpBase { + public: + OpBase() + : input_vars_(new std::vector()), + output_vars_(new std::vector()), + pre_ops_(new std::vector()), + pre_ops_out_idx_(new std::vector()), + op_desc_(nullptr), + grad_op_desc_(nullptr) {} + + virtual ~OpBase() { + delete input_vars_; + delete output_vars_; + + delete pre_ops_; + delete pre_ops_out_idx_; + + if (grad_op_desc_) delete grad_op_desc_; + if (grad_to_var_) delete grad_to_var_; + } + + std::vector ApplyGrad(framework::Scope* scope); + + std::vector* input_vars_; + std::vector* output_vars_; + std::vector* pre_ops_; + std::vector* pre_ops_out_idx_; + framework::OpDesc* op_desc_; + + framework::OpDesc* grad_op_desc_; + std::unordered_map* grad_to_var_; + framework::BlockDesc* block_; +}; + +class Layer { + public: + virtual ~Layer() {} + + virtual std::vector Forward(const std::vector& inputs) { + std::vector vars; + return vars; + } + + virtual void Backward() { LOG(ERROR) << "To support customize"; } +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc new file mode 100644 index 00000000000..f64f9e72c4a --- /dev/null +++ b/paddle/fluid/imperative/tracer.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/tracer.h" + +namespace paddle { +namespace imperative {} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h new file mode 100644 index 00000000000..433d07c0e5a --- /dev/null +++ b/paddle/fluid/imperative/tracer.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
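// Aside: VarBase::ApplyGrad earlier accumulates into grads_ through AddTo,
// i.e. an elementwise += over equal-sized buffers -- gradients flowing back
// from several consumers of one variable must sum, not overwrite. A minimal
// sketch (plain vectors instead of LoDTensor):
#include <cassert>
#include <iostream>
#include <vector>

static void AddTo(const std::vector<float>& src, std::vector<float>* dst) {
  assert(src.size() == dst->size() && "gradient shapes must match");
  for (size_t i = 0; i < src.size(); ++i) (*dst)[i] += src[i];
}

int main() {
  std::vector<float> grads(3, 0.f);    // accumulator starts at zero
  AddTo({1.f, 2.f, 3.f}, &grads);      // grad from consumer A
  AddTo({0.5f, 0.5f, 0.5f}, &grads);   // grad from consumer B
  for (float g : grads) std::cout << g << " ";  // 1.5 2.5 3.5
  std::cout << "\n";
}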
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/engine.h" +#include "paddle/fluid/imperative/layer.h" + +namespace paddle { +namespace imperative { + +void CreateGradOp(const framework::OpDesc& op_desc, + const std::unordered_set& no_grad_set, + const std::vector& grad_sub_block, + framework::OpDesc** grad_op_desc, + std::unordered_map* grad_to_var) { + std::vector> grad_op_descs = + framework::OpInfoMap::Instance() + .Get(op_desc.Type()) + .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); + PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); + // TODO(panyx0718): Leak? + *grad_op_desc = grad_op_descs[0].release(); +} + +class Tracer { + public: + explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + root_scope_ = new framework::Scope(); + scopes_[root_block_] = root_scope_; + } + + virtual ~Tracer() { delete root_scope_; } + + void Trace(OpBase* op, const std::vector& inputs, + const std::vector& outputs, + framework::BlockDesc* block) { + framework::Scope* scope = GetScope(block); + framework::OpDesc* op_desc = op->op_desc_; + VLOG(3) << "tracer tracing " << op_desc->Type(); + op_desc->InferShape(*block); + op_desc->InferVarType(block); + std::unique_ptr op_base = + framework::OpRegistry::CreateOp(*op_desc); + + *op->input_vars_ = inputs; + for (VarBase* input : inputs) { + const std::string vname = input->var_desc_->Name(); + framework::Variable* var = scope->Var(vname); + input->var_ = var; + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block->FindVar(vname); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + if (input->pre_op_) { + op->pre_ops_->push_back(input->pre_op_); + op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); + } else { + op->pre_ops_->push_back(nullptr); + } + } + + *op->output_vars_ = outputs; + for (size_t i = 0; i < outputs.size(); ++i) { + const std::string vname = outputs[i]->var_desc_->Name(); + framework::Variable* var = scope->Var(vname); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block->FindVar(vname); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + outputs[i]->var_ = var; + outputs[i]->pre_op_ = op; + outputs[i]->pre_op_out_idx_ = i; + } + op_base->Run(*scope, platform::CPUPlace()); + framework::OpDesc* grad_op_desc; + auto grad_to_var = new std::unordered_map(); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); + op->grad_op_desc_ = grad_op_desc; + op->grad_to_var_ = grad_to_var; + op->block_ = block; + } + + framework::Scope* GetScope(framework::BlockDesc* block) { + if (scopes_.find(block) != scopes_.end()) { + return scopes_.at(block); + } + framework::BlockDesc* parent_block = block->ParentBlock(); + PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); + framework::Scope* scope = &scopes_[parent_block]->NewScope(); + scopes_[block] = scope; + return scope; + } + + private: + std::map scopes_; + framework::BlockDesc* root_block_; + framework::Scope* root_scope_; +}; + +} // namespace imperative +} // namespace paddle diff --git 
a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d602613fc82..b8954cb1262 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,7 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) + if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc new file mode 100644 index 00000000000..34e9c897d9e --- /dev/null +++ b/paddle/fluid/pybind/imperative.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/tracer.h" + +namespace paddle { +namespace pybind { + +// Bind Methods +void BindTracer(pybind11::module *m) { + pybind11::class_(*m, "Tracer", "") + .def("__init__", + [](imperative::Tracer &self, framework::BlockDesc *root_block) { + new (&self) imperative::Tracer(root_block); + }) + .def("trace", &imperative::Tracer::Trace) + .def("get_scope", &imperative::Tracer::GetScope, + pybind11::return_value_policy::reference); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h new file mode 100644 index 00000000000..7a9d3a01ea8 --- /dev/null +++ b/paddle/fluid/pybind/imperative.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
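// Aside: Tracer::Trace above builds the backward graph as a side effect of
// executing each forward op -- every output records its producer via
// (pre_op_, pre_op_out_idx_). A minimal sketch of just that linking step
// (toy structs, no execution; illustrative names):
#include <iostream>
#include <string>
#include <vector>

struct Op;

struct Var {
  std::string name;
  Op* pre_op;          // producing op, if any
  int pre_op_out_idx;  // which of its outputs this var is
};

struct Op {
  std::string type;
  std::vector<Var*> inputs, outputs;
};

static void Trace(Op* op, std::vector<Var*> ins, std::vector<Var*> outs) {
  op->inputs = ins;
  op->outputs = outs;
  for (size_t i = 0; i < outs.size(); ++i) {
    outs[i]->pre_op = op;  // link output back to producer for autograd
    outs[i]->pre_op_out_idx = static_cast<int>(i);
  }
}

int main() {
  Var x{"x", nullptr, -1}, y{"y", nullptr, -1};
  Op relu{"relu", {}, {}};
  Trace(&relu, {&x}, {&y});
  std::cout << y.name << " <- " << y.pre_op->type << " out #"
            << y.pre_op_out_idx << "\n";  // y <- relu out #0
}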
*/ +#pragma once + +#include +#include +#include "paddle/fluid/imperative/layer.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +class PyLayer : public imperative::Layer { + public: + using imperative::Layer::Layer; // Inherit constructors + + std::vector Forward( + const std::vector& inputs) override { + PYBIND11_OVERLOAD(std::vector, Layer, Forward, + inputs); // NOLINT + } + + void Backward() override { + PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT + } +}; + +class PyOpBase : public imperative::OpBase { + public: + using imperative::OpBase::OpBase; // Inherit constructors +}; + +class PyVarBase : public imperative::VarBase { + public: + using imperative::VarBase::VarBase; // Inherit constructors +}; + +void BindTracer(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58ef3da0b23..dca0c01ab22 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -45,6 +46,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/recordio.h" @@ -100,6 +102,42 @@ PYBIND11_MODULE(core, m) { BindException(&m); + py::class_(m, "VarBase", R"DOC()DOC") + .def(py::init<>()) + .def("_run_backward", + [](imperative::VarBase &self, framework::Scope *scope) { + self.RunBackward(scope); + }) + .def("_grad", &imperative::VarBase::Grad) + .def_property( + "desc", + [](const imperative::VarBase &self) { return self.var_desc_; }, + [](imperative::VarBase &self, framework::VarDesc *var_desc) { + self.var_desc_ = var_desc; + }, + py::return_value_policy::reference); + + py::class_(m, "OpBase", R"DOC()DOC") + .def(py::init<>()) + .def_property( + "desc", [](const imperative::OpBase &self) { return self.op_desc_; }, + [](imperative::OpBase &self, framework::OpDesc *op_desc) { + if (op_desc) { + self.op_desc_ = op_desc; + } + }, + py::return_value_policy::reference); + + py::class_ layer(m, "Layer"); + layer.def(py::init<>()) + .def("forward", + [](imperative::Layer &self, + const std::vector &inputs) { + return self.Forward(inputs); + }) + .def("backward", &imperative::Layer::Backward); + BindTracer(&m); + py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) @@ -601,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle. 
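// Aside: PyLayer above is the usual pybind11 "trampoline" -- a C++ subclass
// whose virtual methods bounce into a Python override when one exists. A
// minimal sketch of the pattern for a toy base class (builds as a pybind11
// extension module; all names illustrative):
#include <pybind11/pybind11.h>
namespace py = pybind11;

struct Base {
  virtual ~Base() = default;
  virtual int Forward(int x) { return x; }
};

struct PyBase : Base {
  using Base::Base;
  int Forward(int x) override {
    // Dispatch to a Python-side override if present, else Base::Forward.
    PYBIND11_OVERLOAD(int, Base, Forward, x);
  }
};

PYBIND11_MODULE(toy, m) {
  py::class_<Base, PyBase>(m, "Base")
      .def(py::init<>())
      .def("forward", &Base::Forward);
}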
m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); + m.def("get_variable_tensor", framework::GetVariableTensor); m.def("_is_program_version_supported", IsProgramVersionSupported); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2a53519188e..52417a1eaf7 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -34,6 +34,7 @@ from . import io from . import evaluator from . import initializer from . import layers +from . import imperative from . import contrib from . import nets from . import optimizer @@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'initializer', 'layers', 'contrib', + 'imperative', 'transpiler', 'nets', 'optimizer', diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a40826168dc..1511eea68cb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,6 +18,7 @@ import collections import contextlib import re import six +import sys import numpy as np @@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() +_imperative_tracer_ = None + + +def _in_imperative_mode(): + return _imperative_tracer_ is not None + + +def _imperative_tracer(): + return _imperative_tracer_ + class NameScope(object): def __init__(self, name="", parent=None): @@ -202,7 +213,7 @@ def _debug_string_(proto, throw_on_error=True): return proto.__str__() -class Variable(object): +class Variable(core.VarBase): """ In Fluid, every input and output of an operator is a variable. In most cases, variables are used for holding different kinds of data or training @@ -266,6 +277,7 @@ class Variable(object): stop_gradient=False, is_data=False, **kwargs): + core.VarBase.__init__(self) self.block = block self.error_clip = error_clip @@ -346,6 +358,18 @@ class Variable(object): self.stop_gradient = stop_gradient self.is_data = is_data + def _numpy(self): + scope = _imperative_tracer().get_scope(self.block.desc) + tensor = core.get_variable_tensor(scope, self.desc.name()) + return np.array(tensor) + + def _backward(self): + scope = _imperative_tracer().get_scope(self.block.desc) + self._run_backward(scope) + + def _gradient(self): + return np.array(self._grad()) + def __str__(self): return self.to_string(True) @@ -492,7 +516,7 @@ class OpProtoHolder(object): } -class Operator(object): +class Operator(core.OpBase): """ In Fluid, all the operation are represented by Operator, and Operator is regarded as a build in an instruction of a Block. 
Users can use the @@ -548,6 +572,7 @@ class Operator(object): inputs=None, outputs=None, attrs=None): + core.OpBase.__init__(self) self.block = block self.desc = desc # note: not add self.attrs here: @@ -587,6 +612,7 @@ class Operator(object): return True return False + self.inputs = [] if inputs is not None: for in_proto in proto.inputs: found = find_name(inputs, in_proto.name) @@ -613,6 +639,13 @@ class Operator(object): else: self.desc.set_input(in_proto.name, []) + for inp in inputs.values(): + if isinstance(inp, Variable): + self.inputs.append(inp) + elif isinstance(inp, list) or isinstance(inp, tuple): + self.inputs.extend(inp[:]) + + self.outputs = [] if outputs is not None: given = set() need = set() @@ -641,6 +674,12 @@ class Operator(object): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) + for out in outputs.values(): + if isinstance(out, Variable): + self.outputs.append(out) + elif isinstance(out, list) or isinstance(out, tuple): + self.outputs.extend(out[:]) + if op_attrs is not None: if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") @@ -1206,6 +1245,8 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) + if _in_imperative_mode(): + _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc) self.ops.append(op) return op @@ -2210,3 +2251,12 @@ def _get_var(name, program=None): assert isinstance(program, Program) return program.global_block().var(name) + + +@contextlib.contextmanager +def _imperative_guard(tracer): + global _imperative_tracer_ + tmp_trace = _imperative_tracer_ + _imperative_tracer_ = tracer + yield + _imperative_tracer_ = tmp_trace diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py new file mode 100644 index 00000000000..922308b6b18 --- /dev/null +++ b/python/paddle/fluid/imperative/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import base +from .base import * + +from . import layers +from .layers import * + +__all__ = [] +__all__ += layers.__all__ +__all__ += base.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py new file mode 100644 index 00000000000..15d38ddb56c --- /dev/null +++ b/python/paddle/fluid/imperative/base.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import contextlib +import numpy as np + +from paddle.fluid import core +from paddle.fluid import framework + +__all__ = ['enabled', 'guard', 'to_variable'] + + +def enabled(): + return framework._in_imperative_mode() + + +@contextlib.contextmanager +def guard(): + train = framework.Program() + startup = framework.Program() + tracer = core.Tracer(train.current_block().desc) + with framework.program_guard(train, startup): + with framework.unique_name.guard(): + with framework._imperative_guard(tracer): + yield + + +def to_variable(value, block=None): + if isinstance(value, np.ndarray): + if not block: + block = framework.default_main_program().current_block() + py_var = framework.Variable( + block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=value.shape, + dtype=value.dtype) + scope = framework._imperative_tracer().get_scope(block.desc) + var = scope.var(py_var.name) + tensor = var.get_tensor() + tensor.set(value, core.CPUPlace()) + return py_var + elif isinstance(value, framework.Variable): + return value + else: + raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py new file mode 100644 index 00000000000..1a28f7f4ae3 --- /dev/null +++ b/python/paddle/fluid/imperative/layers.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import sys +import numpy as np + +from paddle.fluid import core +from paddle.fluid import framework +from paddle.fluid.imperative import base + +__all__ = ['PyLayer'] + + +class PyLayer(core.Layer): + def __init__(self): + pass + + def __call__(self, inputs): + # TODO(panyx0718): Support declarative mode as well. + assert base.enabled() + if not isinstance(inputs, list) and not isinstance(inputs, tuple): + inputs = [inputs] + + var_inputs = [] + for x in inputs: + py_var = base.to_variable(x) + var_inputs.append(py_var) + outputs = self.forward(var_inputs) + return outputs + + def forward(self, inputs): + return [] diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index dc317de9abb..74b4a977db6 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,10 +17,13 @@ from __future__ import print_function import copy import itertools import six +import sys +import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name from paddle.fluid.initializer import Constant, Xavier +from paddle.fluid.imperative import base from .param_attr import ParamAttr, WeightNormParamAttr from . 
import core from six.moves import zip @@ -46,23 +49,21 @@ class LayerHelper(object): def startup_program(self): return default_startup_program() + def to_variable(self, x): + return base.to_variable(x, self.main_program.current_block()) + def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) def multiple_input(self, input_param_name='input'): inputs = self.kwargs.get(input_param_name, []) - type_error = TypeError( - "Input of {0} layer should be Variable or sequence of Variable". - format(self.layer_type)) - if isinstance(inputs, Variable): - inputs = [inputs] - elif not isinstance(inputs, list) and not isinstance(inputs, tuple): - raise type_error + ret = [] + if isinstance(inputs, list) or isinstance(inputs, tuple): + for inp in inputs: + ret.append(self.to_variable(inp)) else: - for each in inputs: - if not isinstance(each, Variable): - raise type_error - return inputs + ret.append(self.to_variable(inputs)) + return ret def input(self, input_param_name='input'): inputs = self.multiple_input(input_param_name) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4833212d311..fac7538a6ad 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6623,7 +6623,8 @@ def relu(x, name=None): helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) + helper.append_op( + type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py new file mode 100644 index 00000000000..b5b6305155d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
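# Aside: a minimal usage sketch of the pieces added above, assuming this
# patched build -- inside the imperative guard, LayerHelper now coerces a
# plain ndarray through to_variable(), so an eager relu runs directly on it:
import numpy as np
import paddle.fluid as fluid

with fluid.imperative.guard():
    x = np.array([1.0, -1.0, 2.0], dtype=np.float32)
    y = fluid.layers.relu(x)  # ndarray -> Variable via to_variable()
    print(y._numpy())         # eager forward result, e.g. [1. 0. 2.]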
+ +import unittest +import sys +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid import core + + +class MyLayer(fluid.imperative.PyLayer): + def __init__(self): + super(MyLayer, self).__init__() + + def forward(self, inputs): + x = fluid.layers.relu(inputs[0]) + self._x_for_debug = x + return [fluid.layers.elementwise_mul(x, x)] + + +class TestImperative(unittest.TestCase): + def test_layer(self): + with fluid.imperative.guard(): + cl = core.Layer() + cl.forward([]) + l = fluid.imperative.PyLayer() + l.forward([]) + + def test_layer_in_out(self): + with fluid.imperative.guard(): + l = MyLayer() + x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] + self.assertIsNotNone(x) + sys.stderr.write("%s output: %s\n" % (x, x._numpy())) + x._backward() + sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 5aee26b6383..0eb69cdb5c7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -101,6 +101,7 @@ packages=['paddle', 'paddle.dataset', 'paddle.reader', 'paddle.fluid', + 'paddle.fluid.imperative', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 5c5266f904f..7e61dde0a44 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -27,6 +27,8 @@ import pydoc member_dict = collections.OrderedDict() +experimental_namespace = {"paddle.fluid.imperative"} + def visit_member(parent_name, member): cur_name = ".".join([parent_name, member.__name__]) @@ -51,6 +53,8 @@ def visit_member(parent_name, member): def visit_all_module(mod): + if (mod.__name__ in experimental_namespace): + return for member_name in ( name for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) -- GitLab From 68c2025844d8a33fa229d60cad431baf86ee1d91 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 9 Dec 2018 12:35:36 +0800 Subject: [PATCH 0088/2367] fix nn.py&API.spec, test=develop --- paddle/fluid/API.spec | 14 ++++++++++++++ python/paddle/fluid/layers/nn.py | 15 +++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 9a90ad4e934..a61b93af357 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -423,3 +423,17 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) +paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) +paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) +paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) +paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) +paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) +paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) +paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 
'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain'))
+paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n'))
+paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000))
+paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None)
+paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,))
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 5499a0ba83c..9233fe130e8 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1350,6 +1350,21 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
 
 
 def bpr_loss(input, label_pos):
+    """
+    Bayesian Personalized Ranking Loss Operator.
+
+    This operator belongs to pairwise ranking loss. LabelPos is the desired item.
+    The loss at a given point in one session is defined as:
+    $Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j<N_{i},~j\neq Label[i]}\log(\sigma(X[i, Label[i]]-X[i, j]))$
+
+    Learn more details by reading paper <session-based recommendations with
+    recurrent neural networks> (https://arxiv.org/abs/1511.06939)
+
+    Examples:
+        .. code-block:: python
+
+          cost = fluid.layers.bpr_loss(input=predict, label_pos=label)
+    """
     helper = LayerHelper('bpr_loss', **locals())
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
-- 
GitLab

From f6dc09e98b85065594eb5faa9752c13133d73ff7 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Sun, 9 Dec 2018 13:28:14 +0800
Subject: [PATCH 0089/2367] Avoid hurting declarative performance

test=develop
---
 python/paddle/fluid/framework.py | 47 ++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 1511eea68cb..4bf0a456b52 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -213,7 +213,7 @@ def _debug_string_(proto, throw_on_error=True):
     return proto.__str__()
 
 
-class Variable(core.VarBase):
+class Variable(object):
     """
     In Fluid, every input and output of an operator is a variable. In most
     cases, variables are used for holding different kinds of data or training
@@ -277,7 +277,6 @@ class Variable(core.VarBase):
                  stop_gradient=False,
                  is_data=False,
                  **kwargs):
-        core.VarBase.__init__(self)
         self.block = block
         self.error_clip = error_clip
@@ -357,6 +356,9 @@ class Variable(core.VarBase):
         self.op = None
         self.stop_gradient = stop_gradient
         self.is_data = is_data
+        if _in_imperative_mode():
+            self._ivar = core.VarBase()
+            self._ivar.desc = self.desc
 
     def _numpy(self):
         scope = _imperative_tracer().get_scope(self.block.desc)
@@ -365,10 +367,10 @@ class Variable(core.VarBase):
 
     def _backward(self):
         scope = _imperative_tracer().get_scope(self.block.desc)
-        self._run_backward(scope)
+        self._ivar._run_backward(scope)
 
     def _gradient(self):
-        return np.array(self._grad())
+        return np.array(self._ivar._grad())
 
     def __str__(self):
         return self.to_string(True)
@@ -516,7 +518,7 @@ class OpProtoHolder(object):
     }
 
 
-class Operator(core.OpBase):
+class Operator(object):
     """
     In Fluid, all operations are represented by Operator, and an Operator
     is regarded as a built-in instruction of a Block.
Users can use the @@ -572,7 +574,6 @@ class Operator(core.OpBase): inputs=None, outputs=None, attrs=None): - core.OpBase.__init__(self) self.block = block self.desc = desc # note: not add self.attrs here: @@ -612,7 +613,6 @@ class Operator(core.OpBase): return True return False - self.inputs = [] if inputs is not None: for in_proto in proto.inputs: found = find_name(inputs, in_proto.name) @@ -639,13 +639,6 @@ class Operator(core.OpBase): else: self.desc.set_input(in_proto.name, []) - for inp in inputs.values(): - if isinstance(inp, Variable): - self.inputs.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - self.inputs.extend(inp[:]) - - self.outputs = [] if outputs is not None: given = set() need = set() @@ -674,12 +667,6 @@ class Operator(core.OpBase): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) - for out in outputs.values(): - if isinstance(out, Variable): - self.outputs.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - self.outputs.extend(out[:]) - if op_attrs is not None: if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") @@ -694,6 +681,23 @@ class Operator(core.OpBase): if self._has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + if _in_imperative_mode(): + self.iop = core.OpBase() + self.iop.desc = self.desc + self.inputs = [] + if inputs is not None: + for inp in inputs.values(): + if isinstance(inp, Variable): + self.inputs.append(inp) + elif isinstance(inp, list) or isinstance(inp, tuple): + self.inputs.extend(inp[:]) + self.outputs = [] + if outputs is not None: + for out in outputs.values(): + if isinstance(out, Variable): + self.outputs.append(out) + elif isinstance(out, list) or isinstance(out, tuple): + self.outputs.extend(out[:]) def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET @@ -1246,7 +1250,8 @@ class Block(object): op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) if _in_imperative_mode(): - _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc) + _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], + [v._ivar for v in op.outputs], self.desc) self.ops.append(op) return op -- GitLab From ea95f9c335bc8336a840190c2cdea543e87e4f71 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 9 Dec 2018 21:40:04 +0800 Subject: [PATCH 0090/2367] fix style bug, test=develop --- paddle/fluid/operators/bpr_loss_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index 41f2969e6ca..075b1b2c765 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -28,12 +28,12 @@ class BprLossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto label_Pos_dims = ctx->GetInputDim("LabelPos"); + auto label_pos_dims = ctx->GetInputDim("LabelPos"); int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, label_Pos_dims.size(), + PADDLE_ENFORCE_EQ(rank, label_pos_dims.size(), "Input(X) and Input(LabelPos) shall have the same rank."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), - framework::slice_ddim(label_Pos_dims, 0, rank - 1), + framework::slice_ddim(label_pos_dims, 0, rank - 1), "Input(X) and Input(LabelPos) shall have the same shape " "except the last dimension."); -- GitLab From 
a672b291e5bbee4604e74ce2c46991b4709fed1d Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 9 Dec 2018 22:08:44 +0800 Subject: [PATCH 0091/2367] fix code style, test=develop --- paddle/fluid/operators/bpr_loss_op.h | 40 +++++++++++++++------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index ea817bb2391..ab68165942a 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -22,7 +22,9 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; - +/*Todo: + *Find a way to adapt TolerableValue, using blas or eigen. + */ template struct TolerableValue { HOSTDEVICE T operator()(const T& x) const { @@ -86,27 +88,27 @@ class BprLossGradientOpKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); const int step_size = x->dims()[0]; - const int num_classes_ = x->dims()[1]; - T* dx_ = dx->mutable_data(ctx.GetPlace()); - const T* dy_ = dy->data(); - const T* x_ = x->data(); - const int64_t* label_pos_ = label_pos->data(); + const int num_classes = x->dims()[1]; + T* dx_data = dx->mutable_data(ctx.GetPlace()); + const T* dy_data = dy->data(); + const T* x_data = x->data(); + const int64_t* label_pos_data = label_pos->data(); for (size_t sample_id = 0; sample_id < step_size; sample_id++) { - for (size_t x_offset = sample_id * num_classes_; - x_offset < (sample_id + 1) * num_classes_; x_offset++) { - dx_[x_offset] = static_cast(0); + for (size_t x_offset = sample_id * num_classes; + x_offset < (sample_id + 1) * num_classes; x_offset++) { + dx_data[x_offset] = static_cast(0); } - auto p_index = sample_id * num_classes_ + label_pos_[sample_id]; - for (size_t ni = 0; ni < num_classes_; ni++) { - if (label_pos_[sample_id] == ni) continue; - auto n_index = sample_id * num_classes_ + ni; - auto grad_ = - -dy_[sample_id] / - ((num_classes_ - 1) * - (1.0f + TolerableValue()(std::exp(x_[p_index] - x_[n_index])))); - dx_[p_index] += grad_; - dx_[n_index] -= grad_; + auto p_index = sample_id * num_classes + label_pos_data[sample_id]; + for (size_t ni = 0; ni < num_classes; ni++) { + if (label_pos_data[sample_id] == ni) continue; + auto n_index = sample_id * num_classes + ni; + auto grad_ = -dy_data[sample_id] / + ((num_classes - 1) * + (1.0f + TolerableValue()(std::exp(x_data[p_index] - + x_data[n_index])))); + dx_data[p_index] += grad_; + dx_data[n_index] -= grad_; } } } -- GitLab From f0c0bf328d8141fd041d5d69521d0995e58b2625 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 10 Dec 2018 10:47:59 +0800 Subject: [PATCH 0092/2367] Add gperftools supports for PE --- CMakeLists.txt | 8 ++- cmake/FindGperftools.cmake | 63 +++++++++++++++++++++ cmake/generic.cmake | 16 ++++++ paddle/fluid/framework/parallel_executor.cc | 30 +++++++++- 4 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 cmake/FindGperftools.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index efa68c9ba24..1594e798a2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,7 @@ option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) -option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler" OFF) +option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_DOC "Compile PaddlePaddle with 
documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) @@ -254,6 +254,12 @@ elseif() set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE) endif() +if (WITH_PROFILER) + find_package(Gperftools REQUIRED) + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + add_definitions(-DWITH_GPERFTOOLS) +endif() + include(generic) # simplify cmake module include(package) # set paddle packages include(ccache) # set ccache for compilation diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake new file mode 100644 index 00000000000..928f573a4fb --- /dev/null +++ b/cmake/FindGperftools.cmake @@ -0,0 +1,63 @@ +# Tries to find Gperftools. +# +# Usage of this module as follows: +# +# find_package(Gperftools) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Gperftools_ROOT_DIR Set this variable to the root installation of +# Gperftools if the module has problems finding +# the proper installation path. +# +# Variables defined by this module: +# +# GPERFTOOLS_FOUND System has Gperftools libs/headers +# GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler) +# GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers + +find_library(GPERFTOOLS_TCMALLOC + NAMES tcmalloc + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_library(GPERFTOOLS_PROFILER + NAMES profiler + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER + NAMES tcmalloc_and_profiler + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_path(GPERFTOOLS_INCLUDE_DIR + NAMES gperftools/heap-profiler.h + HINTS ${Gperftools_ROOT_DIR}/include) + +set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + Gperftools + DEFAULT_MSG + GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) + +mark_as_advanced( + Gperftools_ROOT_DIR + GPERFTOOLS_TCMALLOC + GPERFTOOLS_PROFILER + GPERFTOOLS_TCMALLOC_AND_PROFILER + GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) + +# create IMPORTED targets +if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc) + add_library(gperftools::tcmalloc UNKNOWN IMPORTED) + set_target_properties(gperftools::tcmalloc PROPERTIES + IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC} + INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") + add_library(gperftools::profiler UNKNOWN IMPORTED) + set_target_properties(gperftools::profiler PROPERTIES + IMPORTED_LOCATION ${GPERFTOOLS_PROFILER} + INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") +endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 312fbaa0b3d..a8b9dcfcf5e 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME) endif() endfunction(find_fluid_modules) + +function(common_link TARGET_NAME) + if (WITH_PROFILER) + target_link_libraries(${TARGET_NAME} gperftools::profiler) + endif() +endfunction() + + # find all third_party modules is used for paddle static library # for reduce the dependency when building the inference libs. 
set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
@@ -274,6 +282,7 @@ function(cc_library TARGET_NAME)
     endif()
     target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+    common_link(${TARGET_NAME})
   endif()
 
   # cpplint code style
@@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME)
   if(cc_binary_DEPS)
     target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
     add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
+    common_link(${TARGET_NAME})
   endif()
 endfunction(cc_binary)
@@ -362,6 +372,7 @@ function(cc_test TARGET_NAME)
       target_link_libraries(${TARGET_NAME} ${win32_deps})
     endif(WIN32)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME)
     if(nv_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
       add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
+      common_link(${TARGET_NAME})
     endif()
   endif()
 endfunction(nv_binary)
@@ -433,6 +445,7 @@ function(nv_test TARGET_NAME)
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
       set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
@@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME)
     if(hip_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
       add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+      common_link(${TARGET_NAME})
     endif()
   endif()
 endfunction(hip_binary)
@@ -518,6 +532,7 @@ function(hip_test TARGET_NAME)
     set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
     target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
     add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
   endif()
 endfunction(hip_test)
@@ -560,6 +575,7 @@ function(go_library TARGET_NAME)
   endif()
   if(go_library_DEPS)
     add_dependencies(${TARGET_NAME} ${go_library_DEPS})
+    common_link(${TARGET_NAME})
   endif(go_library_DEPS)
 
   # The "source file" of the library is `${dummyfile}` which never
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b98408ee772..9355bb572b5 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -30,13 +30,33 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+
+DEFINE_string(PEProfileFName, "",
+              "Profiler filename for PE, which is generated by gperftools. "
+              "Only valid when compiled `WITH_PROFILER=ON`. 
Empty if disable."); + namespace paddle { namespace framework { - +static std::once_flag gProfileOnce; +static bool gProfileStarted = false; class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) - : places_(places) {} + : places_(places) { + if (!FLAGS_PEProfileFName.empty()) { + std::call_once(gProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_PEProfileFName.c_str()); + gProfileStarted = true; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_PEProfileFName will be ignored"; +#endif + }); + } + } ~ParallelExecutorPrivate() { if (own_local_scope_) { @@ -270,6 +290,12 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { +#ifdef WITH_GPERFTOOLS + if (gProfileStarted) { + ProfilerFlush(); + } +#endif + platform::RecordBlock b(0); #ifdef PADDLE_WITH_CUDA if (!gcs_.empty()) { -- GitLab From 6bc0efb489411bb1b3206db0cbb03951811fa988 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 10 Dec 2018 10:53:14 +0800 Subject: [PATCH 0093/2367] refine interface --- python/paddle/fluid/async_executor.py | 42 +++++++++++++------- python/paddle/fluid/distributed/downpour.py | 12 ++++-- python/paddle/fluid/distributed/helper.py | 30 ++++++++------ python/paddle/fluid/distributed/node.py | 44 +++++++++++++++++---- python/paddle/fluid/distributed/ps_pb2.py | 6 +-- 5 files changed, 93 insertions(+), 41 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index f667ff24246..3451d1edb54 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -24,6 +24,7 @@ from paddle.fluid.proto import data_feed_pb2 from google.protobuf import text_format from . 
import io from .data_feed_desc import DataFeedDesc +from .distributed import ps_instance __all__ = ['AsyncExecutor'] @@ -85,6 +86,7 @@ class AsyncExecutor(object): scope = global_scope() self.executor = core.AsyncExecutor(scope, p) + self.instance = ps_instance.PaddlePSInstance("init_param", 1, 2) def run(self, program, data_feed, filelist, thread_num, fetch, debug=False): """ @@ -149,27 +151,39 @@ class AsyncExecutor(object): self.executor.run_from_files(program_desc, data_feed.desc(), filelist, thread_num, fetch_var_names, debug) + self.instance.barrier_all() def config_distributed_nodes(self, dist_opt): + # get total rank # get rank index # get iplists # get hadoop info - return - - - def init_server(self, filename, index): - self.executor.init_server(filename, index) - - def init_worker(self, filename, ips, nodes_cnt, index): - self.executor.init_worker(filename, ips, nodes_cnt, index) + pass + + def get_instance(self): + return self.instance + + def init_server(self, dist_desc): + self.executor.init_server(dist_desc, self.instance._rankid) + ip = self.executor.start_server() + self.instance.set_ip(ip) + self.instance.barrier_all() #wait all server start + ips = self.instance.gather_ips() + self.executor.gather_servers(ips, self.instance.get_node_cnt()) + self.instance.barrier_all() #wait all worker start + self.instance.barrier_all() #wait init model + self.instance.barrier_all() #wait worker do all things + + def init_worker(self, dist_desc): + self.instance.barrier_all() #wait all server start + ips = self.instance.gather_ips() + self.executor.init_worker(dist_desc, ips, self.instance.get_node_cnt(), self.instance._rankid) + self.instance.barrier_all() #wait all worker start + if self.instance.is_first_worker(): + self.executor.init_model() + self.instance.barrier_all() #wait init model - def start_server(self): - return self.executor.start_server() - - def gather_servers(self, ips, nodes_cnt): - self.executor.gather_servers(ips, nodes_cnt) - def init_model(self): self.executor.init_model() diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 3d940b62b01..654fa6fab6f 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -46,14 +46,20 @@ class DownpourSGD(object): sparse_table_index = 0 # currently merge all dense parameters into one dense table dense_table_index = 1 + params = [] + grads = [] + for i in params_grads: + params.append(i[0]) + for i in params_grads: + grads.append(i[1]) server.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) server.add_dense_table(dense_table_index, self.learning_rate_, - params_grads[0], params_grads[1]) + params, grads) worker.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) worker.add_dense_table(dense_table_index, self.learning_rate_, - params_grads[0], params_grads[1]) + params, grads) ps_param = pslib.PSParameter() ps_param.server_param.CopyFrom(server.get_desc()) ps_param.trainer_param.CopyFrom(worker.get_desc()) @@ -61,4 +67,4 @@ class DownpourSGD(object): # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] ps_param_str = text_format.MessageToString(ps_param) - return [ps_param_str, worker_skipped_ops] + return [ps_param, worker_skipped_ops] diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 12e2f7f197a..4cc5eb2a920 100644 --- 
a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -1,4 +1,5 @@ from mpi4py import MPI +import ps_pb2 as pslib class FileSystem(object): def __init__(self, fs_type="afs", @@ -7,20 +8,23 @@ class FileSystem(object): passwd=None, hadoop_bin="", afs_conf=None): - assert user not None - assert passwd not None - assert hadoop_bin not None - fs_client = pslib.FsClientParameter() - if fs_type == "afs": - fs_client.fs_type = pslib.FsApiType.AFS - else: - fs_client.fs_type = pslib.FsApiType.HDFS - fs_client.uri = uri - fs_client.user = user - fs_client.passwd = passwd - fs_client.buffer_size = 0 - fs_client.afs_conf = afs_conf if not afs_conf else "" + assert user != None + assert passwd != None + assert hadoop_bin != None + self.fs_client = pslib.FsClientParameter() + #if fs_type == "afs": + # fs_client.fs_type = pslib.FsApiType.AFS + #else: + # fs_client.fs_type = pslib.FsApiType.HDFS + self.fs_client.uri = uri + self.fs_client.user = user + self.fs_client.passwd = passwd + #self.fs_client.buffer_size = 0 + self.fs_client.hadoop_bin = hadoop_bin + #self.fs_client.afs_conf = afs_conf if not afs_conf else "" + def get_desc(self): + return self.fs_client class MPIHelper(object): def __init__(self): diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index b96a15a32fd..c245dc4db8d 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -13,24 +13,52 @@ class Worker(object): class DownpourServer(Server): def __init__(self): self.server_ = pslib.ServerParameter() + self.server_.downpour_server_param.service_param.start_server_port = 0 + self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer" + self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient" + self.server_.downpour_server_param.service_param.service_class = "DownpourPsService" + self.server_.downpour_server_param.service_param.start_server_port = 0 + self.server_.downpour_server_param.service_param.server_thread_num = 12 def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_var): table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id + table.table_class = "DownpourSparseTable" table.type = pslib.PS_SPARSE_TABLE table.accessor.accessor_class = "DownpourFeatureValueAccessor" - table.accessor.dense_sgd_param.adam.learning_rate = learning_rate - table.accessor.fea_dim = abs(reduce(lambda x, y: x * y, - slot_value_var[0].shape, 1)) + table.accessor.sparse_sgd_param.learning_rate = learning_rate + table.accessor.sparse_sgd_param.initial_g2sum = 3 + table.accessor.sparse_sgd_param.initial_range = 1e-4 + table.accessor.sparse_sgd_param.weight_bounds.extend([-10, 10]) + + table.accessor.embedx_dim = 8 + table.accessor.embedx_threshold = 5 + table.accessor.fea_dim = 11 + #table.accessor.fea_dim = abs(reduce(lambda x, y: x * y, + # slot_value_var[0].shape, 1)) + table.accessor.downpour_accessor_param.nonclk_coeff = 0.1 + table.accessor.downpour_accessor_param.click_coeff = 2 + table.accessor.downpour_accessor_param.base_threshold = 0.2 + table.accessor.downpour_accessor_param.delta_threshold = 0.15 + table.accessor.downpour_accessor_param.delta_keep_days = 31 + table.accessor.downpour_accessor_param.show_click_decay_rate = 0.999 + table.accessor.downpour_accessor_param.delete_threshold = 0.8 def add_dense_table(self, table_id, learning_rate, param_var, grad_var): table = 
self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id + table.table_class = "DownpourDenseTable" table.type = pslib.PS_DENSE_TABLE table.accessor.accessor_class = "DownpourDenseValueAccessor" - table.accessor.sparse_sgd_param.learning_rate = learning_rate + table.accessor.dense_sgd_param.name = "adam" + table.accessor.dense_sgd_param.adam.learning_rate = learning_rate + table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993 + table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999 + table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8 + table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99 + table.accessor.dense_sgd_param.naive.learning_rate = 0.0002 fea_dim = 0 for param in param_var: fea_dim += reduce(lambda x, y: x * y, param.shape, 1) @@ -44,8 +72,8 @@ class DownpourWorker(Worker): def __init__(self, window): self.window = window self.worker_ = pslib.DownpourTrainerParameter() - self.worker_.pull_dense_per_batch = window - self.worker_.push_dense_per_batch = window + #self.worker_.pull_dense_per_batch = window + #self.worker_.push_dense_per_batch = window def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_vars): @@ -62,8 +90,8 @@ class DownpourWorker(Worker): param_vars, grad_vars): table = self.worker_.dense_table.add() table.table_id = table_id - table.dense_variable_name.extend([p.name for p in param_vars]) - table.dense_gradient_variable_name.extend([g.name for g in grad_vars]) + table.dense_variable_name.extend(filter(lambda x: x.find("embedding") == -1, [p.name for p in param_vars])) + table.dense_gradient_variable_name.extend(filter(lambda x: x.find("embedding") == -1, [g.name for g in grad_vars])) def get_desc(self): return self.worker_ diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py index f33ec50f7d2..b82c649e143 100644 --- a/python/paddle/fluid/distributed/ps_pb2.py +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -531,21 +531,21 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( _descriptor.FieldDescriptor( name='server_class', full_name='paddle.ServerServiceParameter.server_class', index=0, number=1, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("AbacusBrpcPsServer").decode('utf-8'), + has_default_value=True, default_value=_b("DownpourBrpcPsServer").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='client_class', full_name='paddle.ServerServiceParameter.client_class', index=1, number=2, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("AbacusBrpcPsClient").decode('utf-8'), + has_default_value=True, default_value=_b("DownpourBrpcPsClient").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='service_class', full_name='paddle.ServerServiceParameter.service_class', index=2, number=3, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("AbacusPsService").decode('utf-8'), + has_default_value=True, default_value=_b("DownpourPsService").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), -- GitLab From 86e1044ab941d627362d0def4ad45a250178a736 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 10 Dec 2018 10:54:25 +0800 Subject: [PATCH 0094/2367] refine interface & add ps_instance 
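
In server_worker_mode 1, the new PaddlePSInstance below interleaves the two
roles across the processes on each node. A minimal sketch of that mapping
(illustrative helper only, not part of the patch; constants follow
ps_instance.py, where SERVER=0, WORKER=1, IDLE=-1):

    # Rank-to-role mapping for server_worker_mode == 1: even local ranks
    # become servers, odd local ranks become workers, and ranks beyond
    # server_num + worker_num stay idle.
    def node_type(rankid, nodes, proc_per_node=2):
        total = nodes * proc_per_node  # == server_num + worker_num
        if rankid >= total:
            return -1  # IDLE
        return 0 if rankid % proc_per_node % 2 == 0 else 1  # SERVER / WORKER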
---
 .../paddle/fluid/distributed/ps_instance.py   | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 python/paddle/fluid/distributed/ps_instance.py

diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py
new file mode 100644
index 00000000000..b4045327e1b
--- /dev/null
+++ b/python/paddle/fluid/distributed/ps_instance.py
@@ -0,0 +1,108 @@
+#import paddle.fluid.distributed.helper as dist_helper
+import helper as dist_helper
+import sys
+#from mpi4py import MPI
+
+
+class PaddlePSInstance(object):
+    def __init__(self, init_param, server_worker_mode, proc_per_node):
+        self.dh = dist_helper.MPIHelper()
+        self._config = init_param
+        self._rankid = self.dh.get_rank()
+        self._server_worker_mode = server_worker_mode
+        self._proc_per_node = proc_per_node
+        self._nodes = self.dh.get_size()
+
+        self._ip = 0
+        self._worker_num = self._nodes * self._proc_per_node / 2
+        self._server_num = self._nodes * self._proc_per_node / 2
+        self._total_server_worker = self._worker_num + self._server_num
+        self._node_type = None  #IDLE=-1, WORKER=1, SERVER=0
+        self._set_nodetype()
+        self._comm = None
+        self._split_comm()
+
+    def _set_nodetype(self):
+        if self._server_worker_mode == 0:
+            if self._rankid < self._server_num:
+                self._node_type = 1
+            elif self._rankid < self._total_server_worker:
+                self._node_type = 0
+            else:
+                self._node_type = -1
+        elif self._server_worker_mode == 1:
+            if self._rankid < self._total_server_worker:
+                if 0 == self._rankid % self._proc_per_node % 2:
+                    self._node_type = 0
+                else:
+                    self._node_type = 1
+            else:
+                self._node_type = -1
+        else:
+            self._node_type = -1
+
+        #if self._rankid == 0:
+        #print "node type: ", self._node_type
+
+    def _split_comm(self):
+        if self.is_server():
+            self._comm = self.dh.comm.Split(self._node_type)
+        elif self.is_worker():
+            self._comm = self.dh.comm.Split(self._node_type)
+        pass
+
+    def get_worker_index(self):
+        if self._server_worker_mode == 0:
+            return self._rankid - self._server_num
+        else:
+            return self._rankid / self._proc_per_node
+
+    def get_server_index(self):
+        if self._server_worker_mode == 0:
+            return self._rankid
+        else:
+            return self._rankid / self._proc_per_node
+
+    def is_worker(self):
+        return self._node_type == 1
+
+    def is_server(self):
+        return self._node_type == 0
+
+    def is_first_worker(self):
+        return self.is_worker() and 0 == self.get_worker_index()
+
+    def set_ip(self, ip):
+        self._ip = ip
+
+    def gather_ips(self):
+        self._ips = self.dh.comm.allgather(self._ip)
+        return self._ips
+
+    def get_node_cnt(self):
+        return self._nodes
+
+    def barrier_all(self):
+        #print self._rankid, "begin"
+        #sys.stdout.flush()
+        self.dh.comm.barrier()
+        #print self._rankid, "end"
+
+    def barrier_worker(self):
+        if self.is_worker():
+            #print "worker: ", self._rankid, "begin"
+            #sys.stdout.flush()
+            self._comm.barrier()
+            #print "worker: ", self._rankid, "end"
+        pass
+
+    def finalize(self):
+        pass
+
+
+if __name__ == "__main__":
+    instance = PaddlePSInstance("init_param", 1, 2)
+    instance.barrier_all()
+    #print "-----"
+    #instance.barrier_worker()
-- 
GitLab

From 66182abda6e1451fe4719d8a825a0dcd33cf0339 Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Mon, 10 Dec 2018 03:16:00 +0000
Subject: [PATCH 0095/2367] add cuda cudnn version check

test=develop
---
 paddle/fluid/platform/device_context.cc | 34 +++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 146a205832f..bd81d4dd1f1 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -220,6 +220,40 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
                           << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
                           << (cudnn_dso_ver % 100) / 10 << ".";
+
+  {
+    // Check CUDA/CUDNN version compatibility
+    auto local_cuda_version = runtime_version_ / 100;
+    auto compile_cuda_version = CUDA_VERSION / 100;
+    if (local_cuda_version < compile_cuda_version) {
+      LOG_FIRST_N(WARNING, 1)
+          << "WARNING: device: " << place_.device
+          << ". The installed Paddle is compiled with CUDA "
+          << compile_cuda_version / 10 << "." << compile_cuda_version % 10
+          << ", but CUDA runtime version in your machine is "
+          << local_cuda_version / 10 << "." << local_cuda_version % 10
+          << ", which may cause serious incompatibility bugs. "
+          << "Please recompile or reinstall Paddle with compatible CUDA "
+             "version.";
+    }
+
+    if (dynload::HasCUDNN()) {
+      auto local_cudnn_version = cudnn_dso_ver / 100;
+      auto compile_cudnn_version = CUDNN_VERSION / 100;
+      if (local_cudnn_version < compile_cudnn_version) {
+        LOG_FIRST_N(WARNING, 1)
+            << "WARNING: device: " << place_.device
+            << ". The installed Paddle is compiled with CUDNN "
+            << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
+            << ", but CUDNN version in your machine is "
+            << local_cudnn_version / 10 << "." << local_cudnn_version % 10
+            << ", which may cause serious incompatibility bugs. "
+            << "Please recompile or reinstall Paddle with compatible CUDNN "
+               "version.";
+      }
+    }
+  }
+
   callback_manager_.reset(new StreamCallbackManager(stream_));
 }
-- 
GitLab

From 57557f677476d75a7b251081e97606499255a0c7 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Mon, 10 Dec 2018 11:33:00 +0800
Subject: [PATCH 0096/2367] fix scope in nce and prefetch

---
 .../operators/distributed/parameter_prefetch.cc | 13 ++++++-------
 paddle/fluid/operators/nce_op.h                 | 13 ++++---------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index f6a2d5bbe52..4cdeae81a10 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -102,8 +102,7 @@ static void MergeMultipleVarsIntoOneBySection(
     const std::string& out_name, const std::vector<std::string>& out_var_names,
     const std::vector<int>& height_section,
     const std::vector<std::vector<int64_t>>& splited_ids,
-    const framework::ExecutionContext& context,
-    const framework::Scope& actual_scope, framework::Scope* scope,
+    const framework::ExecutionContext& context, framework::Scope* scope,
     platform::DeviceContext* actual_ctx) {
   PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), "");
 
@@ -115,9 +114,9 @@
     id_to_offset[ids_vector[i]].push_back(i);
   }
 
-  auto& id_tensor = actual_scope.FindVar(id_name)->Get<framework::LoDTensor>();
+  auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
   auto* out_tensor =
-      actual_scope.FindVar(out_name)->GetMutable<framework::LoDTensor>();
+      scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();
   auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());
 
   bool is_on_cpu_place = true;
@@ -175,7 +174,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
               const std::vector<int>& height_sections,
               const framework::ExecutionContext& context,
               const framework::Scope& scope) {
-  auto& local_scope = context.scope().NewScope();
+  auto& local_scope = scope.NewScope();
 
   platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -247,8 +246,8 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, - context, scope, &local_scope, &actual_ctx); - context.scope().DeleteScope(&local_scope); + context, &local_scope, &actual_ctx); + scope.DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2e51c67401f..862064be182 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -170,7 +170,7 @@ class NCEKernel : public framework::OpKernel { auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - auto *ids = local_scope.Var("Ids"); + auto *ids = local_scope.Var("Ids@Local"); auto *x_tensor = ids->GetMutable(); x_tensor->mutable_data( framework::make_ddim({static_cast(labels.size()), 1}), @@ -179,12 +179,10 @@ class NCEKernel : public framework::OpKernel { std::memcpy(x_tensor->data(), labels.data(), labels.size() * sizeof(int64_t)); - local_scope.Var("Weight@Local") - ->GetMutable() - ->mutable_data(context.GetPlace()); + local_scope.Var("Weight@Local"); #ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch("Ids", "Weight@Local", table_names, + operators::distributed::prefetch("Ids@Local", "Weight@Local", table_names, epmap, height_sections, context, &local_scope); #else @@ -207,10 +205,7 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] += result(0); sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } - - if (context.scope().HasKid(&local_scope)) { - context.scope().DeleteScope(&local_scope); - } + context.scope().DeleteScope(&local_scope); } else { auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); -- GitLab From 3a3b5e50085d5384f97e0112ab0bc0c45702fdc4 Mon Sep 17 00:00:00 2001 From: Cheerego <35982308+shanyi15@users.noreply.github.com> Date: Mon, 10 Dec 2018 11:55:10 +0800 Subject: [PATCH 0097/2367] update Readme to 1.2 test=develop --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 56d6c10c642..c535e9514e1 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. 
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1) +### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` # Linux CPU @@ -27,9 +27,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.1.0.post87 +pip install paddlepaddle-gpu==1.2.0.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.1.0.post85 +pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` @@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.1.0.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation. - [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. -- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html) +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) We appreciate your contributions! 
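
As a quick sanity check after running the install commands above, the
installed release can be printed from Python (a sketch; it assumes the wheel
exposes the generated version module, which may differ across builds):

    import paddle.version
    print(paddle.version.full_version)  # expected to report 1.2.0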
-- GitLab From c9a653820bc2bfaa0a47b67916a445ceaa7abdad Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 10 Dec 2018 12:20:34 +0800 Subject: [PATCH 0098/2367] fix label_pos ,add test_layers.py, test=develop --- paddle/fluid/operators/bpr_loss_op.cc | 35 +++++++++---------- paddle/fluid/operators/bpr_loss_op.h | 18 +++++----- python/paddle/fluid/layers/nn.py | 17 ++++++--- .../fluid/tests/unittests/test_bpr_loss_op.py | 9 +++-- .../fluid/tests/unittests/test_layers.py | 9 +++++ 5 files changed, 51 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index 075b1b2c765..9258d7c7e83 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -23,18 +23,17 @@ class BprLossOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("LabelPos"), - "Input(LabelPos) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto label_pos_dims = ctx->GetInputDim("LabelPos"); + auto label_dims = ctx->GetInputDim("Label"); int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, label_pos_dims.size(), - "Input(X) and Input(LabelPos) shall have the same rank."); + PADDLE_ENFORCE_EQ(rank, label_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), - framework::slice_ddim(label_pos_dims, 0, rank - 1), - "Input(X) and Input(LabelPos) shall have the same shape " + framework::slice_ddim(label_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " "except the last dimension."); auto y_dims = x_dims; @@ -60,25 +59,23 @@ class BprLossGradientOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("LabelPos"), - "Input(LabelPos) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "Input(Y@GRAD) shoudl be not null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@GRAD) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto label_pos_dims = ctx->GetInputDim("LabelPos"); + auto label_dims = ctx->GetInputDim("Label"); auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); int rank = x_dims.size(); PADDLE_ENFORCE_EQ(dy_dims.size(), rank, "Input(Y@Grad) and Input(X) should have the same rank."); - PADDLE_ENFORCE_EQ( - label_pos_dims.size(), rank, - "Input(LabelPos) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(label_dims.size(), rank, + "Input(Label) and Input(X) should have the same rank."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), - framework::slice_ddim(label_pos_dims, 0, rank - 1), - "The Input(X) and Input(LabelPos) should have the same " + framework::slice_ddim(label_dims, 0, rank - 1), + "The Input(X) and Input(Label) should have the same " "shape except the last dimension."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), framework::slice_ddim(dy_dims, 0, rank - 1), @@ -86,8 +83,8 @@ class BprLossGradientOp : public framework::OperatorWithKernel { "shape 
except the last dimension."); PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, "The last dimension of Input(Y@Grad) should be 1."); - PADDLE_ENFORCE_EQ(label_pos_dims[rank - 1], 1, - " the last dimension of Input(LabelPos) should be 1."); + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1, + " the last dimension of Input(Label) should be 1."); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->ShareLoD("X", framework::GradVarName("X")); } @@ -111,7 +108,7 @@ class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { "size is equal to the number of classes. This input is a " "real number."); AddInput( - "LabelPos", + "Label", "(Tensor), the tensor which represents the ground truth. It has the " "same shape with 'X' except the last dimension. the last dimension " "size is 1."); @@ -122,7 +119,7 @@ class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Bayesian Personalized Ranking Loss Operator. -This operator belongs to pairwise ranking loss. LabelPos is the desired item. +This operator belongs to pairwise ranking loss. Label is the desired item. The loss at a given point in one session is defined as: $Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$ diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index ab68165942a..e223be7af82 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -41,17 +41,17 @@ class BprLossOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); - auto* label_pos = ctx.Input("LabelPos"); + auto* label = ctx.Input("Label"); auto* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); int rank = x->dims().size(); Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); - Tensor labels_Pos_2d = framework::ReshapeToMatrix(*label_pos, rank - 1); + Tensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1); Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); const framework::Tensor* logits = &x_2d; - const framework::Tensor* labels_pos = &labels_Pos_2d; + const framework::Tensor* labels = &labels_2d; framework::Tensor* out = &y_2d; const int step_size = logits->dims()[0]; @@ -59,9 +59,9 @@ class BprLossOpKernel : public framework::OpKernel { const T* logits_data = logits->data(); T* loss_data = out->data(); - const int64_t* label_pos_data = labels_pos->data(); + const int64_t* label_data = labels->data(); for (int i = 0; i < step_size; ++i) { - int lbl_pos = label_pos_data[i]; + int lbl_pos = label_data[i]; PADDLE_ENFORCE_GE(lbl_pos, 0); PADDLE_ENFORCE_LT(lbl_pos, class_num); int index_pos = i * class_num + lbl_pos; @@ -84,7 +84,7 @@ class BprLossGradientOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* label_pos = ctx.Input("LabelPos"); + auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); const int step_size = x->dims()[0]; @@ -92,16 +92,16 @@ class BprLossGradientOpKernel : public framework::OpKernel { T* dx_data = dx->mutable_data(ctx.GetPlace()); const T* dy_data = dy->data(); const T* x_data = x->data(); - const int64_t* label_pos_data = label_pos->data(); + const int64_t* label_data = label->data(); for (size_t sample_id = 0; sample_id < step_size; sample_id++) { for (size_t x_offset = sample_id * num_classes; x_offset < (sample_id + 1) * num_classes; 
x_offset++) {
         dx_data[x_offset] = static_cast<T>(0);
       }
-      auto p_index = sample_id * num_classes + label_pos_data[sample_id];
+      auto p_index = sample_id * num_classes + label_data[sample_id];
       for (size_t ni = 0; ni < num_classes; ni++) {
-        if (label_pos_data[sample_id] == ni) continue;
+        if (label_data[sample_id] == ni) continue;
         auto n_index = sample_id * num_classes + ni;
         auto grad_ = -dy_data[sample_id] /
                      ((num_classes - 1) *
                       (1.0f + TolerableValue<T>()(std::exp(x_data[p_index] -
                                                            x_data[n_index]))));
         dx_data[p_index] += grad_;
         dx_data[n_index] -= grad_;
       }
     }
   }
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 9233fe130e8..04582acf6b8 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1349,21 +1349,30 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
     return out
 
 
-def bpr_loss(input, label_pos):
+def bpr_loss(input, label):
     """
     Bayesian Personalized Ranking Loss Operator.
 
-    This operator belongs to pairwise ranking loss. LabelPos is the desired item.
+    This operator belongs to pairwise ranking loss. Label is the desired item.
     The loss at a given point in one session is defined as:
     $Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j<N_{i},~j\neq Label[i]}\log(\sigma(X[i, Label[i]]-X[i, j]))$
 
     Learn more details by reading paper <session-based recommendations with
    recurrent neural networks> (https://arxiv.org/abs/1511.06939)
 
+    Args:
+        input (Variable|list): a 2-D tensor with shape [N x D], where N is the
+            batch size and D is the number of classes.
+            This input is not probability but logits.
+        label (Variable|list): the ground truth which is a 2-D tensor. `label`
+            is a tensor with shape [N x 1].
+
+    Returns:
+        A 2-D tensor with shape [N x 1], the bpr loss.
+
     Examples:
         .. code-block:: python
 
-          cost = fluid.layers.bpr_loss(input=predict, label_pos=label)
+          cost = fluid.layers.bpr_loss(input=predict, label=label)
     """
 
     helper = LayerHelper('bpr_loss', **locals())
@@ -1371,7 +1380,7 @@ def bpr_loss(input, label_pos):
     helper.append_op(
         type='bpr_loss',
         inputs={'X': [input],
-                'LabelPos': [label_pos]},
+                'Label': [label]},
         outputs={'Y': [out]})
     return out
 
diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
index 80916f4a828..c8dc5fbd237 100644
--- a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
@@ -28,18 +28,17 @@ class TestBprLossOp1(OpTest):
         batch_size = 40
         class_num = 5
         X = randomize_probability(batch_size, class_num, dtype='float64')
-        label_pos = np.random.randint(
-            0, class_num, (batch_size, 1), dtype="int64")
+        label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
         bpr_loss_result = []
         for i in range(batch_size):
             sum = 0.0
             for j in range(class_num):
-                if j == label_pos[i][0]:
+                if j == label[i][0]:
                     continue
-                sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label_pos[i][0]])))
+                sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label[i][0]])))
             bpr_loss_result.append(-sum / (class_num - 1))
         bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64")
-        self.inputs = {"X": X, "LabelPos": label_pos}
+        self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": bpr_loss}
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index be51fb06a37..10e8bb5a866 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -846,6 +846,15 @@ class TestBook(unittest.TestCase):
             out = layers.cross_entropy(x, label, False, 4)
             self.assertIsNotNone(out)
 
+    def test_bpr_loss(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[30, 10], dtype="float32")
+            label = layers.data(name="label", shape=[30, 1], dtype="int32")
+            out = layers.bpr_loss(x, label)
+            self.assertIsNotNone(out)
+        print(str(program))
+
     def test_expand(self):
         program = Program()
         with program_guard(program):
-- 
GitLab

From 271c4808220eff0d8d5d4c9456386c922cd5f2d6 Mon Sep 17 00:00:00 2001
From: frankwhzhang
Date: Mon, 10 Dec 2018 12:33:48 +0800
Subject: [PATCH 0099/2367] update API, test=develop

---
 paddle/fluid/API.spec | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index a61b93af357..3a422dcb336 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -66,7 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
-paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label_pos'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
-- 
GitLab

From 019e8bbed2e5045778a95962d76fcd2c39044cc3 Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Mon, 10 Dec 2018 05:47:39 +0000
Subject: [PATCH 0100/2367] fix comments

test=develop
---
 paddle/fluid/inference/tensorrt/convert/pool2d_op.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 768318fb063..1d0d83d1f36 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -109,6 +109,10 @@ class Pool2dOpConverter : public OpConverter {
     }
 
     if (pool_type == "max") {
+      // Under ceil mode, the pre_pad and post_pad are used to
+      // record the padding size. In some ceil mode cases,
+      // we do not need padding, so we initialize the two vars to 0.
+
       nvinfer1::DimsHW pre_pad(0, 0);
       nvinfer1::DimsHW post_pad(0, 0);
       if (ceil_mode) {
-- 
GitLab

From abf140289fbc84edd160851cb0f63de75dcf2965 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Mon, 10 Dec 2018 13:07:35 +0800
Subject: [PATCH 0101/2367] split selected rows op should always init output
 selected rows

test=develop
---
 paddle/fluid/operators/split_selected_rows_op.h | 9 ++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h
index af64607fafc..1fef2b3d378 100644
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
@@ -72,10 +72,11 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
       auto rows_idx = outs_rows_idx[i];
       outs[i]->set_height(height_sections[i]);
+      auto dims = x->GetCompleteDims();
+      dims[0] = rows_idx.size();
+      outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
+      outs[i]->mutable_rows()->clear();
       if (rows_idx.size() > 0) {
-        auto dims = x->GetCompleteDims();
-        dims[0] = rows_idx.size();
-        outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
         for (auto idx : rows_idx) {
           outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
         }
@@ -98,6 +99,8 @@
           }
         }
       }
+      PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(),
+                        "rows should have the same size as tensor dim 0");
     }
   }
 };
-- 
GitLab

From 90c7f9870e76a398f97d1d510c662a75d373881f Mon Sep 17 00:00:00 2001
From: frankwhzhang
Date: Mon, 10 Dec 2018 14:36:52 +0800
Subject: [PATCH 0102/2367] fix 'name', test=develop

---
 paddle/fluid/API.spec            | 2 +-
 python/paddle/fluid/layers/nn.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 3a422dcb336..fd4cf92d85d 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -66,7 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
-paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 04582acf6b8..e25eaaa9fda 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1349,7 +1349,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
     return out
 
 
-def bpr_loss(input, label):
+def
bpr_loss(input, label, name=None): """ Bayesian Personalized Ranking Loss Operator. @@ -1366,6 +1366,8 @@ def bpr_loss(input, label): This input is not probability but logits. label (Variable|list): the ground truth which is a 2-D tensor. `label` is a tensor with shape [N x 1]. + name (str|None): A name for this layer (optional). If set None, the + layer will be named automatically. Default: None. Returns: A 2-D tensor with shape [N x 1], the bpr loss. -- GitLab From 9623b45f40b3d382d4db6ee39daff04f1d9d33ab Mon Sep 17 00:00:00 2001 From: Brian Liu Date: Mon, 10 Dec 2018 14:37:24 +0800 Subject: [PATCH 0103/2367] Remove unnecessary MKLDNN reorder (#14799) When data flows from an MKLDNN OP kernel to a non-MKLDNN OP kernel, a data layout transform (via MKLDNN reorder) will occur even when the two OP kernels share the same layout. Add code to remove this unnecessary reorder. test=develop --- .../fluid/framework/data_layout_transform.cc | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index c9e3a8ac1d1..5467f6d1b23 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -151,19 +151,22 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, auto out_format = platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); - void* in_data = GetDataFromTensor(in, in_type); - // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - - auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); - auto out_memory = - memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + if (in_format != out_format) { + void* in_data = GetDataFromTensor(in, in_type); + auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - platform::Reorder(in_memory, out_memory); + auto in_memory = + memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + platform::Reorder(in_memory, out_memory); + } else { + out->ShareDataWith(in); + } out->set_layout(out_layout); // reset format since the out tensor will be feed to non-MKLDNN OPkernel out->set_format(memory::format::format_undef); -- GitLab From f5434507f081e02eb37f8f9da17165bd2298348a Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 10 Dec 2018 14:36:37 +0800 Subject: [PATCH 0104/2367] fix control_flow ops in outs test=develop --- python/paddle/fluid/framework.py | 29 ++++++++++++++++------ python/paddle/fluid/layers/control_flow.py | 14 ++++++----- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b991187d424..f8e3cd3a320 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1040,19 +1040,15 @@ class Block(object): raise ValueError("var %s not in this block" % name) return v - def _var_recursive(self, name): + def _find_var_recursive(self, name): """ Get a Variable by name from this block recursively. Args: name(str): the Variable's name. - Raises: - ValueError: this block and this parent block doesn't - have a Variable with the giving name. - Returns: - Variable: the Variable with the giving name. + Variable: the Variable with the given name, or None if not found.
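+
+        A minimal usage sketch (the name "x" is hypothetical):
+
+            var = block._find_var_recursive("x")
+            if var is None:
+                # "x" is not declared in this block or any ancestor
+                ...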
""" frontier = list() visited = set() @@ -1078,8 +1074,27 @@ class Block(object): frontier.append(prog.block(cur.forward_block_idx)) visited.add(id(cur)) + return None - raise ValueError("Var {0} is not found recursively".format(name)) + def _var_recursive(self, name): + """ + Get a Variable by name from this block recursively. + + Args: + name(str): the Variable's name. + + Raises: + ValueError: this block and this parent block doesn't + have a Variable with the giving name. + + Returns: + Variable: the Variable with the giving name. + """ + var = self._find_var_recursive(name) + if var: + return var + else: + raise ValueError("Var {0} is not found recursively".format(name)) def all_parameters(self): return list(self.iter_parameters()) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 05138bf9459..b7e39685691 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -717,8 +717,9 @@ class While(object): out_vars = [] for inner_out_name in inner_outputs: - if inner_out_name in parent_block.vars: - out_vars.append(parent_block.var(inner_out_name)) + inner_var = parent_block._find_var_recursive(inner_out_name) + if inner_var: + out_vars.append(inner_var) step_scope = parent_block.create_var( type=core.VarDesc.VarType.STEP_SCOPES) @@ -1264,10 +1265,11 @@ class ConditionalBlock(object): if each_name not in input_set ] - out_list = [ - parent_block.var(var_name) for var_name in parent_block.vars - if var_name in intermediate - ] + out_list = [] + for inner_out_name in intermediate: + inner_var = parent_block._find_var_recursive(inner_out_name) + if inner_var: + out_list.append(inner_var) step_scope = parent_block.create_var( type=core.VarDesc.VarType.STEP_SCOPES) -- GitLab From edd1f5a92b4a96c560e80f477556ee5ef820ac2b Mon Sep 17 00:00:00 2001 From: superjomn Date: Mon, 10 Dec 2018 15:17:10 +0800 Subject: [PATCH 0105/2367] fix visualizer test=develop --- .../inference/analysis/passes/ir_graph_build_pass.cc | 7 ++++--- paddle/fluid/inference/utils/visualizer.cc | 10 +++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index b8a045c18fa..c6e923c0048 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -44,9 +44,10 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { argument->SetMainProgram(program.release()); } else if (argument->model_program_path_valid() && argument->model_params_path_valid()) { - auto program = - LoadModel(argument->model_program_path(), argument->model_params_path(), - argument->scope_ptr(), place, argument->model_from_memory()); + auto program = LoadModel( + argument->model_program_path(), argument->model_params_path(), + argument->scope_ptr(), place, + argument->model_from_memory_valid() && argument->model_from_memory()); argument->SetMainProgram(program.release()); } else { PADDLE_THROW( diff --git a/paddle/fluid/inference/utils/visualizer.cc b/paddle/fluid/inference/utils/visualizer.cc index 040b6476fb4..7c0dd64dea8 100644 --- a/paddle/fluid/inference/utils/visualizer.cc +++ b/paddle/fluid/inference/utils/visualizer.cc @@ -26,9 +26,6 @@ DEFINE_string(model_dir, "", "model directory"); DEFINE_string(model_program_path, "", "model program path"); DEFINE_string(model_params_path, "", "model params path"); -USE_PASS(graph_viz_pass); 
-USE_PASS(graph_to_program_pass); - using paddle::inference::analysis::Argument; namespace paddle { @@ -40,7 +37,6 @@ void Visualizer::SetArgument(Argument *argument) { argument_ = argument; } bool Visualizer::Run() { paddle::framework::InitDevices(false); paddle::inference::analysis::Analyzer().Run(argument_); - return true; } @@ -77,7 +73,7 @@ int main(int argc, char *argv[]) { // Only 1 pass, default filename is 0_ir_origin.dot // For more details, looking for paddle::inference::analysis::IRPassManager - argument.SetIrAnalysisPasses({"graph_viz_pass"}); + argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"}); std::unique_ptr<paddle::framework::Scope> scope{ new paddle::framework::Scope()}; @@ -90,3 +86,7 @@ int main(int argc, char *argv[]) { return 0; } + +USE_PASS(infer_clean_graph_pass); +USE_PASS(graph_viz_pass); +USE_PASS(graph_to_program_pass); -- GitLab From 554bcdbdfcbe68c799dd6de8a01ab0d2337f2975 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Mon, 10 Dec 2018 15:17:49 +0800 Subject: [PATCH 0106/2367] add more log for dist test for ci test=develop (#14813) * add more log for dist test for ci test=develop * increase deadline test=develop --- .../fluid/tests/unittests/test_dist_base.py | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 0a43f536585..26fa20291b5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -378,6 +378,18 @@ class TestDistBase(unittest.TestCase): stderr=tr1_pipe, env=env1) + # Wait until the trainer processes terminate + while True: + stat0 = tr0_proc.poll() + time.sleep(0.1) + if stat0 is not None: + break + while True: + stat1 = tr1_proc.poll() + time.sleep(0.1) + if stat1 is not None: + break + tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() @@ -390,11 +402,21 @@ class TestDistBase(unittest.TestCase): ps0.terminate() ps1.terminate() + # print server log + with open("/tmp/ps0_err.log", "r") as fn: + sys.stderr.write("ps0 stderr: %s\n" % fn.read()) + with open("/tmp/ps1_err.log", "r") as fn: + sys.stderr.write("ps1 stderr: %s\n" % fn.read()) + # print log - sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out)) - sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out)) - sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + if stat0 == 0: + sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out)) + with open("/tmp/tr0_err.log", "r") as fn: + sys.stderr.write('trainer 0 stderr: %s\n' % fn.read()) + if stat1 == 0: + sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out)) + with open("/tmp/tr1_err.log", "r") as fn: + sys.stderr.write('trainer 1 stderr: %s\n' % fn.read()) return pickle.loads(tr0_out), pickle.loads(tr1_out) @@ -474,6 +496,7 @@ class TestDistBase(unittest.TestCase): "PYTHONPATH": os.getenv("PYTHONPATH", ""), "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "FLAGS_cudnn_deterministic": "1", "http_proxy": "", "NCCL_P2P_DISABLE": "1" -- GitLab From 53709e7e619dbcf243a7421777c103e69c5012ee Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 6 Dec 2018 13:01:38 +0000 Subject: [PATCH 0107/2367] refine names --- paddle/fluid/operators/CMakeLists.txt | 4 +- .../{jitkernels => jit}/CMakeLists.txt | 6 +--
.../operators/{jitkernels => jit}/README.md | 2 +- .../jitcode => jit/gen}/CMakeLists.txt | 0 .../{jitkernels/jitcode => jit/gen}/blas.cc | 17 ++++---- .../{jitkernels/jitcode => jit/gen}/blas.h | 10 ++--- .../jitcode => jit/gen}/jitcode.cc | 10 ++--- .../{jitkernels/jitcode => jit/gen}/jitcode.h | 12 +++--- .../jitcode_base.cc => jit/gen_base.cc} | 8 ++-- .../jitcode_base.h => jit/gen_base.h} | 10 ++--- .../{jitkernels => jit}/kernel_base.h | 4 +- .../{jitkernels => jit}/kernel_key.h | 6 +-- .../{jitkernels => jit}/kernel_pool.cc | 6 +-- .../{jitkernels => jit}/kernel_pool.h | 20 ++++----- .../{jitkernels => jit}/more/CMakeLists.txt | 0 .../more/mkl/CMakeLists.txt | 0 .../{jitkernels => jit}/more/mkl/mkl.cc | 10 ++--- .../{jitkernels => jit}/more/mkl/mkl.h | 6 +-- .../operators/{jitkernels => jit}/more/more.h | 0 .../{jitkernels => jit}/refer/CMakeLists.txt | 0 .../{jitkernels => jit}/refer/refer.cc | 6 +-- .../{jitkernels => jit}/refer/refer.h | 6 +-- .../operators/{jitkernels => jit}/registry.h | 42 +++++++++---------- .../operators/{jitkernels => jit}/test.cc | 6 +-- 24 files changed, 96 insertions(+), 95 deletions(-) rename paddle/fluid/operators/{jitkernels => jit}/CMakeLists.txt (79%) rename paddle/fluid/operators/{jitkernels => jit}/README.md (98%) rename paddle/fluid/operators/{jitkernels/jitcode => jit/gen}/CMakeLists.txt (100%) rename paddle/fluid/operators/{jitkernels/jitcode => jit/gen}/blas.cc (90%) rename paddle/fluid/operators/{jitkernels/jitcode => jit/gen}/blas.h (93%) rename paddle/fluid/operators/{jitkernels/jitcode => jit/gen}/jitcode.cc (79%) rename paddle/fluid/operators/{jitkernels/jitcode => jit/gen}/jitcode.h (94%) rename paddle/fluid/operators/{jitkernels/jitcode_base.cc => jit/gen_base.cc} (88%) rename paddle/fluid/operators/{jitkernels/jitcode_base.h => jit/gen_base.h} (90%) rename paddle/fluid/operators/{jitkernels => jit}/kernel_base.h (96%) rename paddle/fluid/operators/{jitkernels => jit}/kernel_key.h (93%) rename paddle/fluid/operators/{jitkernels => jit}/kernel_pool.cc (90%) rename paddle/fluid/operators/{jitkernels => jit}/kernel_pool.h (89%) rename paddle/fluid/operators/{jitkernels => jit}/more/CMakeLists.txt (100%) rename paddle/fluid/operators/{jitkernels => jit}/more/mkl/CMakeLists.txt (100%) rename paddle/fluid/operators/{jitkernels => jit}/more/mkl/mkl.cc (84%) rename paddle/fluid/operators/{jitkernels => jit}/more/mkl/mkl.h (92%) rename paddle/fluid/operators/{jitkernels => jit}/more/more.h (100%) rename paddle/fluid/operators/{jitkernels => jit}/refer/CMakeLists.txt (100%) rename paddle/fluid/operators/{jitkernels => jit}/refer/refer.cc (81%) rename paddle/fluid/operators/{jitkernels => jit}/refer/refer.h (91%) rename paddle/fluid/operators/{jitkernels => jit}/registry.h (86%) rename paddle/fluid/operators/{jitkernels => jit}/test.cc (95%) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 2808133e844..16ef5c9524e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -16,7 +16,7 @@ add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) -add_subdirectory(jitkernels) +add_subdirectory(jit) if(WITH_DISTRIBUTE) add_subdirectory(distributed) @@ -68,7 +68,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_ten if (NOT WIN32) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor 
memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt similarity index 79% rename from paddle/fluid/operators/jitkernels/CMakeLists.txt rename to paddle/fluid/operators/jit/CMakeLists.txt index f6bb3e0712f..77fd27666f2 100644 --- a/paddle/fluid/operators/jitkernels/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -14,8 +14,8 @@ cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) add_subdirectory(refer) add_subdirectory(more) if(WITH_XBYAK) - add_subdirectory(jitcode) + add_subdirectory(gen) endif() -cc_library(jit_kernel SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) -cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel) +cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) +cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) diff --git a/paddle/fluid/operators/jitkernels/README.md b/paddle/fluid/operators/jit/README.md similarity index 98% rename from paddle/fluid/operators/jitkernels/README.md rename to paddle/fluid/operators/jit/README.md index fd6428b43ec..12158bf9d03 100644 --- a/paddle/fluid/operators/jitkernels/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -13,7 +13,7 @@ PaddlePaddle/Paddle/paddle/fluid/ │ ├── .../ └── jit/ ├── ... - ├── jitcode/ + ├── gen/ │ └── ... |── more/ │ ├── ... diff --git a/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt similarity index 100% rename from paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt rename to paddle/fluid/operators/jit/gen/CMakeLists.txt diff --git a/paddle/fluid/operators/jitkernels/jitcode/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc similarity index 90% rename from paddle/fluid/operators/jitkernels/jitcode/blas.cc rename to paddle/fluid/operators/jit/gen/blas.cc index 2691bee0fdf..4a8b4554c8b 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -11,13 +11,14 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "paddle/fluid/operators/jitkernels/jitcode/blas.h" -#include "paddle/fluid/operators/jitkernels/registry.h" + +#include "paddle/fluid/operators/jit/gen/blas.h" +#include "paddle/fluid/operators/jit/registry.h" namespace paddle { namespace operators { -namespace jitkernels { -namespace jitcode { +namespace jit { +namespace gen { void VXXJitCode::genCode() { // do not need push stack, and do not need save avx512reg if do not use avx512 @@ -102,17 +103,17 @@ void VXXJitCode::genCode() { ret(); } -} // namespace jitcode +} // namespace gen template <> -std::unique_ptr CreateJitCode(int attr) { +std::unique_ptr CreateJitCode(int attr) { if (UseJitCode(attr)) { - return make_unique( + return make_unique( attr, CodeSize(attr)); } return nullptr; } -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/blas.h b/paddle/fluid/operators/jit/gen/blas.h similarity index 93% rename from paddle/fluid/operators/jitkernels/jitcode/blas.h rename to paddle/fluid/operators/jit/gen/blas.h index a1aca97723e..edc05f86a03 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -15,12 +15,12 @@ #pragma once #include -#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" namespace paddle { namespace operators { -namespace jitkernels { -namespace jitcode { +namespace jit { +namespace gen { // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) class VXXJitCode : public JitCode { @@ -82,7 +82,7 @@ class VMulJitCode : public VXXJitCode { : VXXJitCode(d, operand_type::mul, 0, false, code_size, code_ptr) {} }; -} // namespace jitcode -} // namespace jitkernels +} // namespace gen +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc b/paddle/fluid/operators/jit/gen/jitcode.cc similarity index 79% rename from paddle/fluid/operators/jitkernels/jitcode/jitcode.cc rename to paddle/fluid/operators/jit/gen/jitcode.cc index 8078ace7a84..93204d340e9 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc +++ b/paddle/fluid/operators/jit/gen/jitcode.cc @@ -12,11 +12,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { template <> size_t GetKey(int d) { @@ -24,15 +24,15 @@ size_t GetKey(int d) { } // template <> -// std::shared_ptr CreateJitCode(int attr) +// std::shared_ptr CreateJitCode(int attr) // { // if (UseJitCode(attr)) { -// return std::make_shared>(attr, +// return std::make_shared>(attr, // CodeSize(attr))); // } // return nullptr; // } -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h similarity index 94% rename from paddle/fluid/operators/jitkernels/jitcode/jitcode.h rename to paddle/fluid/operators/jit/gen/jitcode.h index 03c2100ca05..52b8da9a82a 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -15,7 +15,7 @@ #pragma once #include -#include "paddle/fluid/operators/jitkernels/jitcode_base.h" +#include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/platform/cpu_info.h" #define XBYAK_USE_MMAP_ALLOCATOR @@ -24,8 +24,8 @@ namespace paddle { namespace operators { -namespace jitkernels { -namespace jitcode { +namespace jit { +namespace gen { // Application Binary Interface constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), @@ -67,7 +67,7 @@ typedef enum { #define DECLARE_JIT_CODE(codename) \ const char* name() const override { return #codename; } -class JitCode : public JitBase, public Xbyak::CodeGenerator { +class JitCode : public GenBase, public Xbyak::CodeGenerator { public: explicit JitCode(size_t code_size, void* code_ptr = nullptr) : Xbyak::CodeGenerator(code_size, code_ptr) { @@ -128,7 +128,7 @@ class JitCode : public JitBase, public Xbyak::CodeGenerator { } }; -} // namespace jitcode -} // namespace jitkernels +} // namespace gen +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.cc b/paddle/fluid/operators/jit/gen_base.cc similarity index 88% rename from paddle/fluid/operators/jitkernels/jitcode_base.cc rename to paddle/fluid/operators/jit/gen_base.cc index 1da2af51f41..310da0c76f1 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -12,7 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/jitcode_base.h" +#include "paddle/fluid/operators/jit/gen_base.h" #include #include #include @@ -21,10 +21,10 @@ DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { // refer do not need useme, it would be the last one. 
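+// (Meaning: a refer kernel's UseMe() unconditionally returns true, so
+// callers never need to query it; it always serves as the final fallback.)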
-void JitBase::dumpCode(const unsigned char* code) const { +void GenBase::dumpCode(const unsigned char* code) const { if (code) { static int counter = 0; std::ostringstream filename; @@ -38,6 +38,6 @@ void JitBase::dumpCode(const unsigned char* code) const { } } -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jit/gen_base.h similarity index 90% rename from paddle/fluid/operators/jitkernels/jitcode_base.h rename to paddle/fluid/operators/jit/gen_base.h index de8aaf229fe..4a136534dca 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -16,14 +16,14 @@ #include #include // for shared_ptr -#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/macros.h" DECLARE_bool(dump_jitcode); namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { // TODO(TJ): make these functions as virtual of a class @@ -43,7 +43,7 @@ bool UseJitCode(Attr attr) { template size_t GetKey(Attr attr); -class JitBase : public Kernel { +class GenBase : public Kernel { public: virtual const char* name() const = 0; virtual const unsigned char* getCodeInternal() = 0; @@ -62,8 +62,8 @@ class JitBase : public Kernel { }; template -std::unique_ptr CreateJitCode(Attr attr); +std::unique_ptr CreateJitCode(Attr attr); -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h similarity index 96% rename from paddle/fluid/operators/jitkernels/kernel_base.h rename to paddle/fluid/operators/jit/kernel_base.h index 6fbb0f9f7ea..6a789c52c37 100644 --- a/paddle/fluid/operators/jitkernels/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -17,7 +17,7 @@ namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; @@ -54,6 +54,6 @@ class ReferKernel : public KernelImpl { bool UseMe(Attr attr) const override { return true; } }; -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h similarity index 93% rename from paddle/fluid/operators/jitkernels/kernel_key.h rename to paddle/fluid/operators/jit/kernel_key.h index e06c2b58dae..af9df77337d 100644 --- a/paddle/fluid/operators/jitkernels/kernel_key.h +++ b/paddle/fluid/operators/jit/kernel_key.h @@ -13,12 +13,12 @@ * limitations under the License. 
*/ #pragma once -#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { struct KernelKey { struct Hash { @@ -44,6 +44,6 @@ struct KernelKey { bool operator!=(const KernelKey& o) const { return !(*this == o); } }; -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_pool.cc b/paddle/fluid/operators/jit/kernel_pool.cc similarity index 90% rename from paddle/fluid/operators/jitkernels/kernel_pool.cc rename to paddle/fluid/operators/jit/kernel_pool.cc index 9bb0ba349bc..f300d28a6f0 100644 --- a/paddle/fluid/operators/jitkernels/kernel_pool.cc +++ b/paddle/fluid/operators/jit/kernel_pool.cc @@ -12,14 +12,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/kernel_pool.h" +#include "paddle/fluid/operators/jit/kernel_pool.h" #include // for shared_ptr #include #include namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { KernelPool& KernelPool::Instance() { static KernelPool g_kernel_pool; @@ -31,6 +31,6 @@ ReferKernelPool& ReferKernelPool::Instance() { return g_refer_kernel_pool; } -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h similarity index 89% rename from paddle/fluid/operators/jitkernels/kernel_pool.h rename to paddle/fluid/operators/jit/kernel_pool.h index 901a891cb38..737b7f60e3c 100644 --- a/paddle/fluid/operators/jitkernels/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -18,19 +18,19 @@ #include #include #include -#include "paddle/fluid/operators/jitkernels/jitcode_base.h" -#include "paddle/fluid/operators/jitkernels/kernel_base.h" -#include "paddle/fluid/operators/jitkernels/kernel_key.h" +#include "paddle/fluid/operators/jit/gen_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_key.h" #include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { template class JitCodePool { - typedef std::unique_ptr JitBasePtr; - typedef std::unordered_map JitBaseMap; + typedef std::unique_ptr GenBasePtr; + typedef std::unordered_map JitCodeMap; public: JitCodePool() = default; @@ -39,16 +39,16 @@ class JitCodePool { return g_jit_codes; } - const JitBaseMap& AllKernels() { return codes_; } + const JitCodeMap& AllKernels() { return codes_; } bool Has(size_t key) const { return codes_.find(key) != codes_.end(); } - void Insert(size_t key, JitBasePtr value) { + void Insert(size_t key, GenBasePtr value) { codes_.emplace(key, std::move(value)); } private: - JitBaseMap codes_; + JitCodeMap codes_; DISABLE_COPY_AND_ASSIGN(JitCodePool); }; @@ -146,6 +146,6 @@ const Func Get(Attr attr) { return GetRefer(); } -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/more/CMakeLists.txt b/paddle/fluid/operators/jit/more/CMakeLists.txt similarity index 100% rename from paddle/fluid/operators/jitkernels/more/CMakeLists.txt rename to paddle/fluid/operators/jit/more/CMakeLists.txt diff --git a/paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt 
b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt similarity index 100% rename from paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt rename to paddle/fluid/operators/jit/more/mkl/CMakeLists.txt diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc similarity index 84% rename from paddle/fluid/operators/jitkernels/more/mkl/mkl.cc rename to paddle/fluid/operators/jit/more/mkl/mkl.cc index 88a7d661940..0ffe1d565f1 100644 --- a/paddle/fluid/operators/jitkernels/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -12,13 +12,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/more/mkl/mkl.h" -#include "paddle/fluid/operators/jitkernels/registry.h" +#include "paddle/fluid/operators/jit/more/mkl/mkl.h" +#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/dynload/mklml.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { namespace more { namespace mkl { @@ -34,11 +34,11 @@ void VMul(const double* x, const double* y, double* z, int n) { } // namespace mkl } // namespace more -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle -namespace mkl = paddle::operators::jitkernels::more::mkl; +namespace mkl = paddle::operators::jit::more::mkl; REGISTER_JITKERNEL_MORE(vmul, mkl, mkl::VMulKernel, mkl::VMulKernel); diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h similarity index 92% rename from paddle/fluid/operators/jitkernels/more/mkl/mkl.h rename to paddle/fluid/operators/jit/more/mkl/mkl.h index 9cf032db43f..45cfec1c477 100644 --- a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -15,12 +15,12 @@ #pragma once #include -#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { namespace more { namespace mkl { @@ -43,6 +43,6 @@ class VMulKernel : public KernelImpl::func_type, } // namespace mkl } // namespace more -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/more/more.h b/paddle/fluid/operators/jit/more/more.h similarity index 100% rename from paddle/fluid/operators/jitkernels/more/more.h rename to paddle/fluid/operators/jit/more/more.h diff --git a/paddle/fluid/operators/jitkernels/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt similarity index 100% rename from paddle/fluid/operators/jitkernels/refer/CMakeLists.txt rename to paddle/fluid/operators/jit/refer/CMakeLists.txt diff --git a/paddle/fluid/operators/jitkernels/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc similarity index 81% rename from paddle/fluid/operators/jitkernels/refer/refer.cc rename to paddle/fluid/operators/jit/refer/refer.cc index dbccac896c5..a987b5fca09 100644 --- a/paddle/fluid/operators/jitkernels/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -12,10 +12,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "paddle/fluid/operators/jitkernels/refer/refer.h" -#include "paddle/fluid/operators/jitkernels/registry.h" +#include "paddle/fluid/operators/jit/refer/refer.h" +#include "paddle/fluid/operators/jit/registry.h" -namespace refer = paddle::operators::jitkernels::refer; +namespace refer = paddle::operators::jit::refer; REGISTER_JITKERNEL_REFER(vmul, refer::VMulKernel, refer::VMulKernel); diff --git a/paddle/fluid/operators/jitkernels/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h similarity index 91% rename from paddle/fluid/operators/jitkernels/refer/refer.h rename to paddle/fluid/operators/jit/refer/refer.h index 796f58d4017..76a663633d1 100644 --- a/paddle/fluid/operators/jitkernels/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -13,12 +13,12 @@ * limitations under the License. */ #pragma once -#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { namespace refer { template @@ -36,6 +36,6 @@ class VMulKernel : public ReferKernel::func_type, }; } // namespace refer -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/registry.h b/paddle/fluid/operators/jit/registry.h similarity index 86% rename from paddle/fluid/operators/jitkernels/registry.h rename to paddle/fluid/operators/jit/registry.h index 6d817461bec..c1f02d9cd57 100644 --- a/paddle/fluid/operators/jitkernels/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -17,14 +17,14 @@ #include #include #include -#include "paddle/fluid/operators/jitkernels/kernel_base.h" -#include "paddle/fluid/operators/jitkernels/kernel_pool.h" +#include "paddle/fluid/operators/jit/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_pool.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/variant.h" // for UNUSED namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { // make_unique is supported since c++14 template @@ -76,21 +76,21 @@ class JitKernelRegistrar { msg) // Refer always on CPUPlace -#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ - "REGISTER_KERNEL_REFER must be called in global namespace"); \ - static ::paddle::operators::jitkernels::JitKernelRegistrar< \ - ::paddle::operators::jitkernels::ReferKernelPool, \ - ::paddle::platform::CPUPlace, __VA_ARGS__> \ - __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ - ::paddle::operators::jitkernels::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ - __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ - return 0; \ +#define REGISTER_JITKERNEL_REFER(kernel_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ + "REGISTER_KERNEL_REFER must be called in global namespace"); \ + static ::paddle::operators::jit::JitKernelRegistrar< \ + ::paddle::operators::jit::ReferKernelPool, ::paddle::platform::CPUPlace, \ + __VA_ARGS__> \ + __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ + ::paddle::operators::jit::KernelType::kernel_type); \ + int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ + __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ + return 0; \ } -// kernel_type: should be in paddle::operators::jitkernels::KernelType +// kernel_type: should be in paddle::operators::jit::KernelType // place_type: should be one of CPUPlace and GPUPlace in paddle::platform #define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ @@ -99,11 +99,11 @@ class JitKernelRegistrar { extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static ::paddle::operators::jitkernels::JitKernelRegistrar< \ - ::paddle::operators::jitkernels::KernelPool, \ - ::paddle::platform::place_type, __VA_ARGS__> \ + static ::paddle::operators::jit::JitKernelRegistrar< \ + ::paddle::operators::jit::KernelPool, ::paddle::platform::place_type, \ + __VA_ARGS__> \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ - ::paddle::operators::jitkernels::KernelType::kernel_type); \ + ::paddle::operators::jit::KernelType::kernel_type); \ int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ .Touch(); \ @@ -139,6 +139,6 @@ class JitKernelRegistrar { #define USE_JITKERNEL_MORE(kernel_type, impl_type) \ USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace) -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/test.cc b/paddle/fluid/operators/jit/test.cc similarity index 95% rename from paddle/fluid/operators/jitkernels/test.cc rename to paddle/fluid/operators/jit/test.cc index d27b5d1cbae..836b6eee800 100644 --- a/paddle/fluid/operators/jitkernels/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -19,9 +19,9 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/jitkernels/kernel_pool.h" +#include "paddle/fluid/operators/jit/kernel_pool.h" // TODO(TJ): remove me -#include "paddle/fluid/operators/jitkernels/registry.h" +#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/port.h" @@ -66,7 +66,7 @@ TEST(JitKernel, vmul) { using T = float; using PlaceType = paddle::platform::CPUPlace; - namespace jit = paddle::operators::jitkernels; + namespace jit = paddle::operators::jit; // TODO(TJ): test more vector size for (int d = 1; d < 30; ++d) { auto ref = jit::GetRefer::func_type, -- GitLab From afc51e6f8272aef314e3bd791195fc887687aacf Mon Sep 17 00:00:00 2001 From: nhzlx Date: Mon, 10 Dec 2018 07:37:01 +0000 Subject: [PATCH 0108/2367] add benchmark for trt --- paddle/fluid/inference/tests/api/CMakeLists.txt | 3 ++- paddle/fluid/inference/tests/api/tester_helper.h | 16 ++++++++++++++-- .../inference/tests/api/trt_models_tester.cc | 3 +++ paddle/fluid/inference/utils/benchmark.cc | 2 +- 4 files changed, 20 
insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index a07626a1031..6901aac3c34 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,7 +1,8 @@ set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) if(WITH_GPU AND TENSORRT_FOUND) - set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor) + set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} +ir_pass_manager analysis_predictor benchmark) endif() function(download_model install_dir model_name) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d572ea0177c..8209a049f46 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -30,8 +30,10 @@ #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/inference/utils/benchmark.h" #include "paddle/fluid/platform/profiler.h" +DEFINE_string(model_name, "", "model name"); DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_data, "", "data file"); DEFINE_int32(batch_size, 1, "batch size."); @@ -40,6 +42,8 @@ DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); +DEFINE_bool(record_benchmark, false, + "Record benchmark after profiling the model"); DECLARE_bool(profile); DECLARE_int32(paddle_num_threads); @@ -192,8 +196,16 @@ void TestOneThreadPrediction( predictor->Run(inputs[j], outputs, batch_size); } } - PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times, - inputs.size()); + + double latency = run_timer.toc() / num_times; + PrintTime(batch_size, num_times, 1, 0, latency, inputs.size()); + if (FLAGS_record_benchmark) { + Benchmark benchmark; + benchmark.SetName(FLAGS_model_name); + benchmark.SetBatchSize(batch_size); + benchmark.SetLatency(latency); + benchmark.PersistToFile("benchmark_record.txt"); + } } } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index ef612ce6148..9eb3fb5da10 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -135,6 +135,9 @@ TEST(TensorRT_resnext50, compare) { TEST(TensorRT_resnext50, profile) { std::string model_dir = FLAGS_infer_model + "/resnext50"; + // Set FLAGS_record_benchmark to true to record benchmark to file. 
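+ // (For reference: each record is appended to benchmark_record.txt as
+ // name, batch size, thread count, latency and 1000/latency; see
+ // Benchmark::SerializeToString. Being a gflag, it can also be set on
+ // the command line, e.g. --record_benchmark=true --model_name=resnext50.)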
+ // FLAGS_record_benchmark=true; + FLAGS_model_name = "resnext50"; profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); } diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc index d03aa11b75e..0bd526bcac2 100644 --- a/paddle/fluid/inference/utils/benchmark.cc +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -30,7 +30,7 @@ std::string Benchmark::SerializeToString() const { ss << '\n'; ss << name_ << "\t"; - ss << batch_size_ << "\t"; + ss << batch_size_ << "\t\t"; ss << num_threads_ << "\t"; ss << latency_ << "\t"; ss << 1000.0 / latency_; -- GitLab From b22d638d8fa9168a49dccffef379218eb8c85c92 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 10 Dec 2018 16:18:06 +0800 Subject: [PATCH 0109/2367] Speed up SizeOfType test=develop --- paddle/fluid/framework/data_type.cc | 15 ++-- paddle/fluid/framework/parallel_executor.cc | 10 +-- paddle/fluid/platform/enforce.h | 80 +++++++++++---------- python/paddle/fluid/__init__.py | 3 +- 4 files changed, 57 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 28f3da88fa1..1c29a89bffa 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/data_type.h" #include +#include #include #include @@ -23,10 +24,10 @@ namespace paddle { namespace framework { struct DataTypeMap { - std::unordered_map cpp_to_proto_; + std::map cpp_to_proto_; std::unordered_map proto_to_cpp_; std::unordered_map proto_to_str_; - std::unordered_map cpp_to_size_; + std::map cpp_to_size_; }; static DataTypeMap* InitDataTypeMap(); @@ -43,9 +44,9 @@ static inline void RegisterType(DataTypeMap* map, proto::VarType::Type proto_type, const std::string& name) { map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); - map->cpp_to_proto_.emplace(typeid(T), proto_type); + map->cpp_to_proto_.emplace(typeid(T).name(), proto_type); map->proto_to_str_.emplace(static_cast(proto_type), name); - map->cpp_to_size_.emplace(typeid(T), sizeof(T)); + map->cpp_to_size_.emplace(typeid(T).name(), sizeof(T)); } static DataTypeMap* InitDataTypeMap() { @@ -71,7 +72,7 @@ static DataTypeMap* InitDataTypeMap() { } proto::VarType::Type ToDataType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_proto_.find(type); + auto it = gDataTypeMap().cpp_to_proto_.find(type.name()); if (it != gDataTypeMap().cpp_to_proto_.end()) { return it->second; } @@ -97,8 +98,8 @@ std::string DataTypeToString(const proto::VarType::Type type) { } size_t SizeOfType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_size_.find(type); - if (it != gDataTypeMap().cpp_to_size_.end()) { + auto it = gDataTypeMap().cpp_to_size_.find(type.name()); + if (LIKELY(it != gDataTypeMap().cpp_to_size_.end())) { return it->second; } PADDLE_THROW("Not support %s as tensor type", type.name()); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 9355bb572b5..0636b89048f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -31,9 +31,9 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #ifdef WITH_GPERFTOOLS -#include "google/gperftools.h" +#include "gperftools/profiler.h" #endif -DEFINE_string(PEProfileFName, "", +DEFINE_string(pe_profile_fname, "", "Profiler filename for PE, which generated by gperftools." "Only valid when compiled `WITH_PRIFILER=ON`. 
Empty if disable."); @@ -45,14 +45,14 @@ class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) : places_(places) { - if (!FLAGS_PEProfileFName.empty()) { + if (!FLAGS_pe_profile_fname.empty()) { std::call_once(gProfileOnce, [] { #ifdef WITH_GPERFTOOLS - ProfilerStart(FLAGS_PEProfileFName.c_str()); + ProfilerStart(FLAGS_pe_profile_fname.c_str()); gProfileStarted = true; #else LOG(WARNING) << "Paddle is not compiled with gperftools. " - "FLAGS_PEProfileFName will be ignored"; + "FLAGS_pe_profile_fname will be ignored"; #endif }); } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a85972bdb72..01ee67fd07f 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -62,45 +62,54 @@ inline std::string demangle(std::string name) { return name; } #endif struct EnforceNotMet : public std::exception { - std::exception_ptr exp_; std::string err_str_; - EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) { - static constexpr int TRACE_STACK_LIMIT = 100; + EnforceNotMet(std::exception_ptr e, const char* f, int l) { try { - std::rethrow_exception(exp_); - } catch (const std::exception& exp) { - std::ostringstream sout; + std::rethrow_exception(e); + } catch (std::exception& e) { + Init(e.what(), f, l); + } + } - sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl; - sout << "PaddlePaddle Call Stacks: " << std::endl; + template + EnforceNotMet(const char* f, int l, ARGS... args) { + Init(string::Sprintf(args...), f, l); + } + + const char* what() const noexcept override { return err_str_.c_str(); } + + private: + template + inline void Init(StrType what, const char* f, int l) { + static constexpr int TRACE_STACK_LIMIT = 100; + std::ostringstream sout; + + sout << string::Sprintf("%s at [%s:%d]", what, f, l) << std::endl; + sout << "PaddlePaddle Call Stacks: " << std::endl; #if !defined(_WIN32) - void* call_stack[TRACE_STACK_LIMIT]; - auto size = backtrace(call_stack, TRACE_STACK_LIMIT); - auto symbols = backtrace_symbols(call_stack, size); - - Dl_info info; - for (int i = 0; i < size; ++i) { - if (dladdr(call_stack[i], &info) && info.dli_sname) { - auto demangled = demangle(info.dli_sname); - auto addr_offset = static_cast(call_stack[i]) - - static_cast(info.dli_saddr); - sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, - 2 + sizeof(void*) * 2, call_stack[i], - demangled, addr_offset); - } else { - sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2, - call_stack[i]); - } + void* call_stack[TRACE_STACK_LIMIT]; + auto size = backtrace(call_stack, TRACE_STACK_LIMIT); + auto symbols = backtrace_symbols(call_stack, size); + Dl_info info; + for (int i = 0; i < size; ++i) { + if (dladdr(call_stack[i], &info) && info.dli_sname) { + auto demangled = demangle(info.dli_sname); + auto addr_offset = static_cast(call_stack[i]) - + static_cast(info.dli_saddr); + sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, + 2 + sizeof(void*) * 2, call_stack[i], demangled, + addr_offset); + } else { + sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2, + call_stack[i]); } - free(symbols); + } + free(symbols); #else - sout << "Windows not support stack backtrace yet."; + sout << "Windows not support stack backtrace yet."; #endif - err_str_ = sout.str(); - } + err_str_ = sout.str(); } - - const char* what() const noexcept { return err_str_.c_str(); } }; struct EOFException : public std::exception { @@ -242,13 +251,8 @@ inline void throw_on_error(T e) { 
throw_on_error(e, ""); } -#define PADDLE_THROW(...) \ - do { \ - throw ::paddle::platform::EnforceNotMet( \ - std::make_exception_ptr( \ - std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \ - __FILE__, __LINE__); \ - } while (false) +#define PADDLE_THROW(...) \ + throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) #ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(...) \ diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 52417a1eaf7..a532f94c6dd 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -127,7 +127,8 @@ def __bootstrap__(): 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir' + 'reader_queue_speed_test_mode', 'print_sub_graph_dir', + 'pe_profile_fname' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') -- GitLab From 202b2f1fa71b33b5165e166ecdde0163a9799bdb Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 10 Dec 2018 17:27:20 +0800 Subject: [PATCH 0110/2367] Move the beta pow scale calculation into Adam Op --- paddle/fluid/framework/ir/graph.cc | 98 ++++++++++----------- paddle/fluid/operators/optimizers/adam_op.h | 17 ++++ python/paddle/fluid/optimizer.py | 43 ++++----- 3 files changed, 88 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index fc91564bbae..dfa310a3863 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -28,55 +28,55 @@ namespace { void CheckProgram(const ProgramDesc &program) { #define _INT(role) static_cast(role) - std::map visit; - for (OpDesc *op : program.Block(0).AllOps()) { - // For backward compatibility, some program doesn't have role added. - if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; - int role_id = - boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - visit[role_id] = true; - switch (role_id) { - case _INT(OpRole::kForward): - if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator before forward operator %s." - << op->Type(); - } - break; - case _INT(OpRole::kBackward): - case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kOptimize): - case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - break; - case _INT(OpRole::kLRSched): - case _INT(OpRole::kDist): - case _INT(OpRole::kRPC): - case _INT(OpRole::kNotSpecified): - break; - default: - LOG(FATAL) << "Unknown operator role. 
Don't add new role because " - "you don't know what you are doing."; - } - } +// std::map visit; +// for (OpDesc *op : program.Block(0).AllOps()) { +// // For backward compatibility, some program doesn't have role added. +// if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; +// int role_id = +// boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); +// visit[role_id] = true; +// switch (role_id) { +// case _INT(OpRole::kForward): +// if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { +// LOG(ERROR) +// << "Cannot add backward operator before forward operator %s." +// << op->Type(); +// } +// break; +// case _INT(OpRole::kBackward): +// case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): +// PADDLE_ENFORCE( +// visit.find(_INT(OpRole::kOptimize)) == visit.end(), +// "Cannot add backward operator %s after optimize operator.", +// op->Type()); +// break; +// case _INT(OpRole::kForward) | _INT(OpRole::kLoss): +// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | +// _INT(OpRole::kLoss)) == visit.end(), +// "Cannot add backward|loss operator before " +// "forward|loss operator %s.", +// op->Type()); +// PADDLE_ENFORCE( +// visit.find(_INT(OpRole::kOptimize)) == visit.end(), +// "Cannot add forward|loss operator %s after optimize operator.", +// op->Type()); +// break; +// case _INT(OpRole::kOptimize): +// case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): +// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), +// "Optimize operators %s must follow backward operator.", +// op->Type()); +// break; +// case _INT(OpRole::kLRSched): +// case _INT(OpRole::kDist): +// case _INT(OpRole::kRPC): +// case _INT(OpRole::kNotSpecified): +// break; +// default: +// LOG(FATAL) << "Unknown operator role. Don't add new role because " +// "you don't know what you are doing."; +// } +// } #undef _INT } diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 3455d1ee54e..2205f473f23 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -292,6 +292,23 @@ class AdamOpKernel : public framework::OpKernel { static_cast(ctx.device_context()), param.numel()); for_range(functor); + + auto& dev = + *ctx.template device_context().eigen_device(); + + const LoDTensor* beta1_pow_ptr = ctx.Input("Beta1Pow"); + auto eigen_in_beta1_pow = + framework::EigenVector::Flatten(*beta1_pow_ptr); + auto eigen_out_beta1_pow = framework::EigenVector::Flatten( + *(const_cast(beta1_pow_ptr))); + eigen_out_beta1_pow.device(dev) = beta1 * eigen_in_beta1_pow; + + const LoDTensor* beta2_pow_ptr = ctx.Input("Beta2Pow"); + auto eigen_in_beta2_pow = + framework::EigenVector::Flatten(*beta2_pow_ptr); + auto eigen_out_beta2_pow = framework::EigenVector::Flatten( + *(const_cast(beta2_pow_ptr))); + eigen_out_beta2_pow.device(dev) = beta2 * eigen_in_beta2_pow; } } else if (grad_var->IsType()) { auto& grad = diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index da92826d410..1930ac106b2 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -477,7 +477,7 @@ class LarsMomentumOptimizer(Optimizer): regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. - + Examples: .. 
code-block:: python @@ -739,26 +739,27 @@ class AdamOptimizer(Optimizer): """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - for param, grad in param_and_grads: - if grad is None: - continue - with param.block.program._optimized_guard( - [param, grad]), name_scope("optimizer"): - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param) - main_block.append_op( - type="scale", - inputs={"X": beta1_pow_acc}, - outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}) - - main_block.append_op( - type="scale", - inputs={"X": beta2_pow_acc}, - outputs={"Out": beta2_pow_acc}, - attrs={"scale": self._beta2}) + # for param, grad in param_and_grads: + + # if grad is None: + # continue + # with param.block.program._optimized_guard( + # [param, grad]), name_scope("optimizer"): + # beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + # param) + # beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + # param) + # main_block.append_op( + # type="scale", + # inputs={"X": beta1_pow_acc}, + # outputs={"Out": beta1_pow_acc}, + # attrs={"scale": self._beta1}) + + # main_block.append_op( + # type="scale", + # inputs={"X": beta2_pow_acc}, + # outputs={"Out": beta2_pow_acc}, + # attrs={"scale": self._beta2}) class AdamaxOptimizer(Optimizer): -- GitLab From 79082c94594adaf4765e950151da51c84ec137b8 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 10 Dec 2018 17:31:52 +0800 Subject: [PATCH 0111/2367] fix pyreader failed --- .../scope_buffered_ssa_graph_executor.cc | 27 +++++++++---------- .../scope_buffered_ssa_graph_executor.h | 5 ++-- .../details/threaded_ssa_graph_executor.cc | 1 - paddle/fluid/framework/parallel_executor.cc | 22 +++++++++++---- .../fluid/operators/reader/buffered_reader.cc | 2 -- .../operators/reader/create_py_reader_op.cc | 2 -- .../fluid/operators/reader/open_files_op.cc | 2 -- 7 files changed, 31 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index abc6b9f559e..85898af417e 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -27,34 +27,31 @@ namespace framework { namespace details { ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector> var_infos_list, - std::vector places, + std::vector var_infos, std::vector places, std::unique_ptr &&underlying_executor) : strategy_(std::move(strategy)), underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), - var_infos_list_(std::move(var_infos_list)), + var_infos_(std::move(var_infos)), places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { // Create local scopes. 
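+ // (Scopes are rebuilt only when drop_scope_counter_ is zero, i.e.
+ // after the previous local scopes were dropped; otherwise the ones
+ // from earlier runs are reused.)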
- for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &scope = local_scopes_[i]; + for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { + auto &scope = *it; Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; - for (auto &var_infos : var_infos_list_) { - for (auto &info : var_infos) { - if (scope->FindVar(info.name_) != nullptr) { - continue; - } - if (info.persistable_) { // Persistable - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - InitializeVariable(local_scope.Var(info.name_), info.type_); - } + for (auto &info : var_infos_) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); } } } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 51230d4a42a..5e87e0bf50b 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -38,8 +38,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { public: ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector> var_info_list, - std::vector places, + std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); const ir::Graph& Graph() const override { @@ -54,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy_; std::unique_ptr underlying_executor_; std::vector local_scopes_; - std::vector> var_infos_list_; + std::vector var_infos_; std::vector places_; }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 677a2937945..cebf63364da 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -216,7 +216,6 @@ void ThreadedSSAGraphExecutor::RunOp( if (LIKELY(!strategy_.dry_run_)) { op->Run(strategy_.use_cuda_); } - VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 186f0cb8034..2a9ca3e815b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -141,7 +141,6 @@ ParallelExecutor::ParallelExecutor( std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { - VLOG(1) << "kParallelGraph mode!!"; for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, params, @@ -178,8 +177,8 @@ ParallelExecutor::ParallelExecutor( ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graphs[0] = ref_cnt_pass->Apply(std::move(graphs[i])); - graphs[0]->SetNotOwned("garbage_collector", &gcs_); + graphs[i] = ref_cnt_pass->Apply(std::move(graphs[i])); + 
graphs[i]->SetNotOwned("garbage_collector", &gcs_); } } } @@ -192,6 +191,18 @@ ParallelExecutor::ParallelExecutor( // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars + std::vector var_infos; + for (auto &graph : graphs) { + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } + } + } + /** std::vector> var_infos_list; for (size_t i = 0; i < graphs.size(); ++i) { std::vector var_infos; @@ -203,8 +214,9 @@ ParallelExecutor::ParallelExecutor( var_infos.back().persistable_ = node->Var()->Persistable(); } } - var_infos_list.emplace_back(std::move(var_infos)); + var_infos_list.push_back(std::move(var_infos)); } + **/ // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { @@ -236,7 +248,7 @@ ParallelExecutor::ParallelExecutor( } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, std::move(var_infos_list), + exec_strategy, member_->local_scopes_, std::move(var_infos), member_->places_, std::move(member_->executor_))); } diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index cfa192f8e17..26ff221dfa0 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -58,9 +58,7 @@ void BufferedReader::ReadAsync(size_t i) { TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); for (size_t i = 0; i < cpu.size(); ++i) { - VLOG(1) << "launch tensor copy from cpu to cpu, idx: " << i; framework::TensorCopySync(cpu[i], place_, &gpu[i]); - VLOG(1) << "done " << i; gpu[i].set_lod(cpu[i].lod()); } } diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 093b0e56b3d..901a92ab5b5 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -28,10 +28,8 @@ class PyReader : public framework::FileReader { } void ReadNext(std::vector* out) override { - VLOG(1) << "come in PyReader::ReadNext function, out: " << out; bool success; *out = queue_->Pop(&success); - VLOG(1) << "call PyReader::ReadNext " << success; if (!success) out->clear(); } diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index ae37a187259..38223e06997 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -115,12 +115,10 @@ class PreemptiveReaderContainer : public IReaderContainer { } void ReadNext(std::vector* out) override { - VLOG(1) << "flag"; if (!pending_.empty()) { auto future_it = complete_queue_.Pop(); FutureItem item = future_it->get(); if (item.exception_) { - VLOG(1) << "item has exception!!!"; for (auto it = futures_.begin(); it != futures_.end(); ++it) { if (it != future_it) { it->wait(); // Wait all other threads complete. 
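
Note on the pyreader fix above: ParallelExecutor goes back to collecting a single flat vector of variable infos across all graphs, and ScopeBufferedSSAGraphExecutor takes that vector instead of one list per graph; the leftover VLOG debugging in the reader ops is dropped at the same time. A rough Python sketch of the restored scope setup (method names are illustrative, the logic follows the C++):

    def prepare_local_scopes(local_scopes, var_infos):
        # walk the scopes in reverse, mirroring the rbegin()/rend() loop
        for scope in reversed(local_scopes):
            local_scope = scope.new_scope()
            for info in var_infos:
                if scope.find_var(info.name) is not None:
                    continue                    # created by an earlier Run()
                if info.persistable:
                    scope.var(info.name)        # outlives this iteration
                else:
                    local_scope.var(info.name)  # freed with the local scope
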
-- GitLab From 8e3fe2d7355c09a3dde09bcbf63971ff3bfe169d Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 10 Dec 2018 18:57:57 +0800 Subject: [PATCH 0112/2367] add skip op --- paddle/fluid/framework/async_executor.cc | 8 ++++++-- paddle/fluid/framework/executor_thread_worker.cc | 15 ++++++++++----- paddle/fluid/framework/executor_thread_worker.h | 2 ++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 7685883dd5e..f96ff436da9 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -95,8 +95,12 @@ void AsyncExecutor::InitParamConfig() { } } _param_config.slot_dim = _param_config.fea_dim - 2; //TODO - _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().pull_dense_per_batch()); - _param_config.tmp_push_sparse_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + _param_config.tmp_push_sparse_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); + + for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); ++t) { + _param_config.skip_op.push_back(_pslib_ptr->get_param()->trainer_param().skip_op(t)); + } //sparse for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) { auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index e0ee9c11c90..d8320b422b8 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -340,16 +340,21 @@ void AsyncExecutorThreadWorker::SetPullDenseThread(std::shared_ptrType().find("sgd") != std::string::npos) { continue; } - if (op->Type().find("lookup_table") != std::string::npos || - op->Type().find("lookup_table_grad") != std::string::npos) { - continue; + bool need_skip = false; + for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { + if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*thread_scope_, place_); } - op->Run(*thread_scope_, place_); } UpdateParams(); } diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 4e3255a590c..b3ee9dfaec9 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -39,6 +39,8 @@ struct AsyncWorkerParamConfig { int fea_dim; int32_t tmp_push_dense_wait_times; int32_t tmp_push_sparse_wait_times; + + std::vector skip_op; std::map> dense_variable_name; std::map> dense_gradient_variable_name; -- GitLab From 067ed70f2def19263d33ef792323c439fa482484 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 10 Dec 2018 19:26:50 +0800 Subject: [PATCH 0113/2367] add HasProtoAttr function in op_desc.h, clean node.h test=develop --- paddle/fluid/framework/ir/is_test_pass.cc | 2 +- .../fluid/framework/ir/is_test_pass_tester.cc | 4 ++-- .../framework/ir/mkldnn_placement_pass.cc | 15 +++++++----- paddle/fluid/framework/ir/node.cc | 22 ----------------- paddle/fluid/framework/ir/node.h | 12 ---------- paddle/fluid/framework/op_desc.cc | 24 ++++++++----------- paddle/fluid/framework/op_desc.h | 6 ++++- 
paddle/fluid/pybind/tensor_py.h | 2 +- 8 files changed, 28 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 6d8f020918d..57cc98e2ca0 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -38,7 +38,7 @@ std::unique_ptr IsTestPass::ApplyImpl( for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - if (n->RuntimeHasAttr("is_test")) { + if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) { op->SetAttr("is_test", true); } else if (std::find(begin(op_list), end(op_list), op->Type()) != end(op_list)) { diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index d9a68c7f1dd..9696441a216 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -104,9 +104,9 @@ TEST(IsTestPass, basic) { auto* op = node->Op(); auto op_name = boost::get(op->GetAttr("name")); if (op_name == "conv3") { - ASSERT_FALSE(node->RuntimeHasAttr("is_test")); + ASSERT_FALSE(op->HasAttr("is_test")); } else { - ASSERT_TRUE(node->RuntimeHasAttr("is_test")); + ASSERT_TRUE(op->HasAttr("is_test")); EXPECT_TRUE(boost::get(op->GetAttr("is_test"))); } } diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc index 9a9314161b0..951fcb066ce 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -25,12 +25,15 @@ std::unique_ptr MKLDNNPlacementPass::ApplyImpl( const auto& op_types_list = Get>("mkldnn_enabled_op_types"); for (const Node* n : graph->Nodes()) { - if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) { - if (op_types_list.empty()) { - n->Op()->SetAttr("use_mkldnn", true); - } else if (std::find(op_types_list.begin(), op_types_list.end(), - n->Name()) != op_types_list.end()) { - n->Op()->SetAttr("use_mkldnn", true); + if (n->IsOp()) { + auto* op = n->Op(); + if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) { + if (op_types_list.empty()) { + op->SetAttr("use_mkldnn", true); + } else if (std::find(op_types_list.begin(), op_types_list.end(), + n->Name()) != op_types_list.end()) { + op->SetAttr("use_mkldnn", true); + } } } } diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 7a88cb2b681..eac67108e21 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -30,28 +30,6 @@ std::unique_ptr CreateNodeForTest(const std::string &name, return std::unique_ptr(new Node(name, type)); } -bool Node::RuntimeHasAttr(const std::string &name) const { - if (Op()->HasAttr(name)) { - return true; - } else { - auto &op_info = OpInfoMap::Instance(); - auto op_type = Op()->Type(); - if (op_info.Has(op_type)) { - auto op_info_ptr = op_info.Get(op_type); - if (op_info_ptr.HasOpProtoAndChecker()) { - const proto::OpProto &proto = op_info_ptr.Proto(); - for (int i = 0; i != proto.attrs_size(); ++i) { - const proto::OpProto::Attr &attr = proto.attrs(i); - if (attr.name() == name) { - return true; - } - } - } - } - } - return false; -} - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 1044a96430f..d2a393b3f19 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -108,18 +108,6 @@ class Node { 
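
Note on the HasProtoAttr patch in progress here: Node::RuntimeHasAttr is deleted and its lookup moves onto OpDesc as HasProtoAttr, so HasAttr now only asks whether this program stores the attribute, while HasProtoAttr asks whether the registered operator proto declares it; the latter keeps old serialized programs usable when a newer build introduces attributes such as is_test or use_mkldnn. A schematic Python rendering, where op_info_map stands in for the C++ OpInfoMap singleton:

    def has_attr(op_desc, name):
        return name in op_desc.attrs          # stored with this program

    def has_proto_attr(op_desc, name):
        info = op_info_map.get(op_desc.type)  # global operator registry
        return any(a.name == name for a in info.proto.attrs)

    # the rewritten passes consult both, e.g. is_test_pass:
    #   if has_attr(op, "is_test") or has_proto_attr(op, "is_test"):
    #       op.set_attr("is_test", True)
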
Name().find(ir::Node::kControlDepVarName) != std::string::npos; } - // RuntimeHasAttr is different with HasAttr now. - // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr, - // thus, if stored program_desc_ are old which don't have an attr, a new - // library which adds the attr already will fail on this function. - // Details: - // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087 - // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above - // problem. - // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding - // RuntimeHasAttr. - bool RuntimeHasAttr(const std::string& name) const; - std::vector inputs; std::vector outputs; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index ce7ba967303..dde642764fa 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -237,20 +237,16 @@ void OpDesc::SetOutput(const std::string ¶m_name, this->outputs_[param_name] = args; } -bool OpDesc::HasAttr(const std::string &name) const { - if (attrs_.find(name) != attrs_.end()) { - return true; - } else { - auto &op_info = OpInfoMap::Instance(); - if (op_info.Has(desc_.type())) { - auto op_info_ptr = op_info.Get(desc_.type()); - if (op_info_ptr.HasOpProtoAndChecker()) { - const proto::OpProto &proto = op_info_ptr.Proto(); - for (int i = 0; i != proto.attrs_size(); ++i) { - const proto::OpProto::Attr &attr = proto.attrs(i); - if (attr.name() == name) { - return true; - } +bool OpDesc::HasProtoAttr(const std::string &name) const { + auto &op_info = OpInfoMap::Instance(); + if (op_info.Has(desc_.type())) { + auto op_info_ptr = op_info.Get(desc_.type()); + if (op_info_ptr.HasOpProtoAndChecker()) { + const proto::OpProto &proto = op_info_ptr.Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; } } } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 3da7cdcef39..e8debec7f13 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -61,7 +61,11 @@ class OpDesc { void SetOutput(const std::string ¶m_name, const std::vector &args); - bool HasAttr(const std::string &name) const; + bool HasAttr(const std::string &name) const { + return attrs_.find(name) != attrs_.end(); + } + + bool HasProtoAttr(const std::string &name) const; proto::AttrType GetAttrType(const std::string &name) const; diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 02a75236f6c..f67f40f19f6 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (int i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } -- GitLab From 33a004a779e8c4acb19ab13b641cc16d3827a582 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 10 Dec 2018 20:36:49 +0800 Subject: [PATCH 0114/2367] fix numel nce and prefetch --- .../distributed/parameter_prefetch.cc | 10 +++++++-- paddle/fluid/operators/nce_op.h | 21 ++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 4cdeae81a10..aebf6376d16 100644 --- 
a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -114,9 +114,15 @@ static void MergeMultipleVarsIntoOneBySection( id_to_offset[ids_vector[i]].push_back(i); } - auto& id_tensor = scope.FindVar(id_name)->Get(); + auto& id_tensor = scope->FindVar(id_name)->Get(); auto* out_tensor = - scope.FindVar(out_name)->GetMutable(); + scope->FindVar(out_name)->GetMutable(); + + PADDLE_ENFORCE_GT( + out_tensor->numel(), 0, + "When calling this method, the Tensor's numel must larger than zero. " + "Please check Tensor::Resize has been called first."); + auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); bool is_on_cpu_place = true; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 862064be182..99a3baba920 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -166,11 +166,12 @@ class NCEKernel : public framework::OpKernel { std::set st(labels.begin(), labels.end()); labels.assign(st.begin(), st.end()); - auto &local_scope = context.scope().NewScope(); + framework::Scope &local_scope = context.scope().NewScope(); + auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - auto *ids = local_scope.Var("Ids@Local"); + auto *ids = local_scope.Var("Ids@Prefetch"); auto *x_tensor = ids->GetMutable(); x_tensor->mutable_data( framework::make_ddim({static_cast(labels.size()), 1}), @@ -179,12 +180,18 @@ class NCEKernel : public framework::OpKernel { std::memcpy(x_tensor->data(), labels.data(), labels.size() * sizeof(int64_t)); - local_scope.Var("Weight@Local"); + std::vector w_dims = paddle::framework::vectorize2int( + context.Input("Weight")->dims()); + w_dims[0] = static_cast(labels.size()); + + auto *w_tensor = local_scope.Var("Weight@Prefetch") + ->GetMutable(); + w_tensor->Resize(framework::make_ddim(w_dims)); #ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch("Ids@Local", "Weight@Local", table_names, - epmap, height_sections, context, - &local_scope); + operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch", + table_names, epmap, height_sections, + context, local_scope); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " @@ -192,7 +199,7 @@ class NCEKernel : public framework::OpKernel { #endif auto weight_mat = EigenMatrix::From( - (local_scope.Var("Weight@Local")->Get())); + (local_scope.Var("Weight@Prefetch")->Get())); for (int64_t i = 0; i < sample_labels->numel(); ++i) { std::vector::iterator it = std::find(labels.begin(), labels.end(), sample_labels_data[i]); -- GitLab From 00776b167aa5bdfb8a9888f764ec8fa3a0b6f159 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 10 Dec 2018 20:49:41 +0800 Subject: [PATCH 0115/2367] fix memory opt skip set by name (#14774) * random failed. rerun ci. test=develop * windows failed. rerun ci. 
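
Note on the prefetch hunks above: the NCE kernel deduplicates the sampled label ids, resizes Weight@Prefetch to [num_unique_ids, dim] before the RPC (exactly the precondition the new numel() > 0 enforce in MergeMultipleVarsIntoOneBySection guards), and pulls only those rows from the parameter servers. A hedged sketch of the lookup flow, with pull_rows standing in for the distributed prefetch call:

    def nce_weight_lookup(sample_labels, pull_rows):
        labels = sorted(set(sample_labels))    # dedup, like the std::set pass
        row_of = {label: i for i, label in enumerate(labels)}
        # Weight@Prefetch has been resized to [len(labels), dim] up front
        weight_prefetch = pull_rows(labels)    # [len(labels), dim] ndarray
        return [weight_prefetch[row_of[l]] for l in sample_labels]
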
test=develop --- .../test_memory_optimization_transpiler.py | 37 ++++++++++++++----- .../memory_optimization_transpiler.py | 16 ++++++++ 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index 275e5c49d5c..fa16f082880 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -22,6 +22,15 @@ from paddle.fluid.framework import Program, program_guard from paddle.fluid.transpiler import memory_optimize +def _get_vars(prog): + assert (isinstance(prog, Program)) + all_vars = set() + for op in prog.global_block().ops: + all_vars.update(op.input_arg_names) + all_vars.update(op.output_arg_names) + return all_vars + + class TestControlFlowGraph(unittest.TestCase): def setUp(self): program = Program() @@ -37,11 +46,11 @@ class TestControlFlowGraph(unittest.TestCase): self.program = program def test_control_flow_graph(self): - print("before optimization") - print(str(self.program)) - result_program = memory_optimize(self.program) - print("after optimization") - print(str(result_program)) + result_program = self.program.clone() + memory_optimize(self.program) + old_vars = _get_vars(self.program) + new_vars = _get_vars(result_program) + self.assertTrue(old_vars != new_vars) class TestMemoryTranspiler2(unittest.TestCase): @@ -58,14 +67,22 @@ class TestMemoryTranspiler2(unittest.TestCase): avg_cost = layers.mean(cost) opt = optimizer.SGD(learning_rate=0.001) opt.minimize(avg_cost) + self.skip_set = set([cost.name, fc.name]) self.program = program def test_inplace_ops(self): - print("before optimization") - print(str(self.program)) - result_program = memory_optimize(self.program) - print("after optimization") - print(str(result_program)) + result_program = self.program.clone() + memory_optimize(self.program) + old_vars = _get_vars(self.program) + new_vars = _get_vars(result_program) + self.assertTrue(old_vars != new_vars) + + def test_skip_opt(self): + result_program = self.program.clone() + memory_optimize(self.program, skip_opt_set=self.skip_set) + old_vars = _get_vars(self.program) + new_vars = _get_vars(result_program) + self.assertTrue(old_vars != new_vars) class TestMemoryTranspiler3(unittest.TestCase): diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index c9f1be93477..95aafec0536 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -14,6 +14,7 @@ from __future__ import print_function +import six from collections import defaultdict, MutableSet from .. import core from ... 
import compat as cpt @@ -470,8 +471,21 @@ def memory_optimize(input_program, Returns: None """ + + def to_name_str(var): + if isinstance(var, Variable): + return var.desc.name() + elif isinstance(var, str): + return var + elif isinstance(var, six.string_types): + return str(var) + else: + raise TypeError(str(var) + " should be Variable or str") + if level != 0 and level != 1: raise ValueError("only support opt_level 0 or 1.") + if skip_opt_set is not None and not isinstance(skip_opt_set, set): + raise ValueError("only support skip_opt_set as set.") global PRINT_LOG PRINT_LOG = print_log if skip_grads: @@ -486,6 +500,8 @@ def memory_optimize(input_program, skip_opt_set = grad_set else: skip_opt_set.update(grad_set) + if skip_opt_set is not None: + skip_opt_set = set(map(to_name_str, skip_opt_set)) cfgs = _get_cfgs(input_program) for cfg in cfgs: cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) -- GitLab From 8760d23c7dbcb4ad5a5b941aca5917514467c86d Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 10 Dec 2018 13:09:28 +0000 Subject: [PATCH 0116/2367] featue/py_func --- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/py_func_op.cc | 131 ++++++++++++++++++++++++++ paddle/fluid/operators/py_func_op.h | 25 +++++ paddle/fluid/pybind/pybind.cc | 21 +++++ python/paddle/fluid/layers/nn.py | 112 +++++++++++++++++++++- 5 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/py_func_op.cc create mode 100644 paddle/fluid/operators/py_func_op.h diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 257bfc0a3f9..9379122faf3 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -82,7 +82,7 @@ endif() # op_library(unstack_op DEPS stack_op) # op_library(tensor_array_to_tensor_op DEPS concat_op) -set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) +set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS} python pybind) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(gather_test SRCS gather_test.cc DEPS tensor) diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc new file mode 100644 index 00000000000..86914f30604 --- /dev/null +++ b/paddle/fluid/operators/py_func_op.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
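
Note on the memory_optimize change above: skip_opt_set may now mix Variables and plain names, so entries are normalized to strings before use and a non-set argument is rejected early. A condensed equivalent of the conversion:

    def to_name_str(var):
        if isinstance(var, Variable):          # fluid Variable: use its name
            return var.desc.name()
        if isinstance(var, six.string_types):  # str/unicode pass through
            return str(var)
        raise TypeError(str(var) + " should be Variable or str")

    if skip_opt_set is not None:
        skip_opt_set = set(map(to_name_str, skip_opt_set))
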
+ +#include "paddle/fluid/operators/py_func_op.h" +#include +#include +#include +#include "Python.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +namespace py = pybind11; + +static std::mutex g_py_callables_mtx; +static std::vector g_py_callables; + +size_t AppendPythonCallableObjectAndReturnId(py::object py_obj) { + std::lock_guard guard(g_py_callables_mtx); + g_py_callables.emplace_back(py_obj); + return g_py_callables.size() - 1; +} + +static py::object *GetPythonCallableObject(size_t i) { + std::lock_guard guard(g_py_callables_mtx); + PADDLE_ENFORCE_LT(i, g_py_callables.size()); + return &g_py_callables[i]; +} + +void DoCallPythonFunc(py::object *callable, const std::string &func_token, + const std::vector &ins, + std::vector *out) { + py::gil_scoped_acquire guard{}; + py::tuple in_args(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) { + in_args[i] = py::cast(ins[i]); + } + + auto ret = (*callable)(func_token, *in_args); + auto ret_tuple = py::cast(ret); + PADDLE_ENFORCE_EQ(py::len(ret_tuple), out->size(), "Output number not match"); + for (size_t i = 0; i < out->size(); ++i) { + try { + auto *out_tensor = py::cast(ret_tuple[i]); + PADDLE_ENFORCE_NOT_NULL(out_tensor, + "Output tensor should not be nullptr"); + (*out)[i]->set_lod(out_tensor->lod()); + (*out)[i]->ShareDataWith(*out_tensor); + } catch (py::cast_error &) { + PADDLE_THROW("Output %d is not LoDTensor", i); + } + } +} + +class PyFuncOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs("X"), "Input(X) must exist"); + PADDLE_ENFORCE(ctx->HasOutputs("Out"), "Output(Out) must exist"); + } +}; + +class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Inputs of py_func op.").AsDuplicable(); + AddOutput("Out", "Outputs of py_func op").AsDuplicable(); + AddAttr("token", "function token"); + AddAttr("handle_idx", "handle index").SetDefault(0); + AddComment(R"DOC("PyFunc Op")DOC"); + } +}; + +class PyFuncOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + protected: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &in_arg_names = Inputs("X"); + auto &out_arg_names = Outputs("Out"); + + std::vector inputs(in_arg_names.size()); + for (size_t i = 0; i < in_arg_names.size(); ++i) { + auto &in_tensor = + scope.FindVar(in_arg_names[i])->Get(); + if (platform::is_gpu_place(in_tensor.place())) { + framework::TensorCopySync(in_tensor, platform::CPUPlace(), &inputs[i]); + } else { + inputs[i].ShareDataWith(in_tensor); + } + inputs[i].set_lod(in_tensor.lod()); + } + + std::vector outputs(out_arg_names.size()); + for (size_t i = 0; i < out_arg_names.size(); ++i) { + auto *out_tensor = + scope.FindVar(out_arg_names[i])->GetMutable(); + outputs[i] = out_tensor; + } + + auto &token = Attr("token"); + auto handle_idx = static_cast(Attr("handle_idx")); + auto *py_callable = GetPythonCallableObject(handle_idx); + VLOG(10) << "Call py_func_op with token " << token << ", and handle_idx " + << handle_idx; + DoCallPythonFunc(py_callable, token, inputs, &outputs); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker, + ops::PyFuncOpShapeInference, + paddle::framework::EmptyGradOpMaker); diff --git 
a/paddle/fluid/operators/py_func_op.h b/paddle/fluid/operators/py_func_op.h new file mode 100644 index 00000000000..e85fa6b5bc3 --- /dev/null +++ b/paddle/fluid/operators/py_func_op.h @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace operators { + +size_t AppendPythonCallableObjectAndReturnId(pybind11::object py_obj); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58ef3da0b23..58da2cea347 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,6 +36,7 @@ limitations under the License. */ #include "paddle/fluid/framework/version.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" @@ -100,6 +101,12 @@ PYBIND11_MODULE(core, m) { BindException(&m); + m.def( + "append_python_callable_object_and_return_id", + [](py::object py_obj) -> size_t { + return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); + }); + py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) @@ -525,6 +532,20 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Place") .def(py::init<>()) + .def("is_cpu_place", + [](platform::Place &self) { return platform::is_cpu_place(self); }) + .def("is_gpu_place", + [](platform::Place &self) { return platform::is_gpu_place(self); }) + .def("is_cuda_pinned_place", + [](platform::Place &self) { + return platform::is_cuda_pinned_place(self); + }) + .def("gpu_device_id", + [](platform::Place &self) { + PADDLE_ENFORCE(platform::is_gpu_place(self), + "gpu_device_id() only supports in CUDAPlace"); + return boost::get(self).device; + }) .def("set_place", [](platform::Place &self, const platform::CPUPlace &cpu_place) { self = cpu_place; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4833212d311..92cd53a6c36 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -18,10 +18,12 @@ All layers just related to the neural network. 
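
Note on the py_func plumbing above: the C++ operator keeps no Python state of its own; pybind appends each Python callable to a process-global vector, the op records only an integer handle plus a per-function token, and at run time it re-enters Python under the GIL with CPU LoDTensor inputs. Output shapes cannot be inferred from the callable, so the caller must pre-declare them. Note also that the registry code in the nn.py hunk below calls fluid.default_main_program(), a name the shown imports of layers/nn.py do not provide, so that lookup presumably needs a framework-level helper in later revisions. A hedged usage sketch, in which the create_var spelling is just one plausible way to pre-declare the output and is not part of this patch:

    import numpy as np
    import paddle.fluid as fluid

    def tanh_func(x):
        # x arrives as a CPU LoDTensor; return an ndarray (or LoDTensor)
        # matching the declared output shape
        return np.tanh(np.array(x))

    x = fluid.layers.data(name='x', shape=[32], dtype='float32')
    block = fluid.default_main_program().current_block()
    out = block.create_var(name='py_func_out', dtype='float32', shape=[-1, 32])
    fluid.layers.py_func(func=tanh_func, x=x, out=out)
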
from __future__ import print_function import numpy as np +import six import os +import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant -from ..framework import Variable, OpProtoHolder +from ..framework import Variable, OpProtoHolder, Program from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .tensor import concat @@ -172,6 +174,7 @@ __all__ = [ 'merge_selected_rows', 'get_tensor_from_selected_rows', 'lstm', + 'py_func', ] kIgnoreIndex = -100 @@ -9082,3 +9085,110 @@ def get_tensor_from_selected_rows(x, name=None): outputs={'Out': out}, attrs={}) return out + + +@templatedoc() +def py_func(func, x, out, backward_func=None): + """ + """ + + class PyFuncRegister(object): + _main_program_to_register = dict() + + @classmethod + def get_instance(cls, prog=None): + if prog is None: + prog = fluid.default_main_program() + + if not isinstance(prog, Program): + raise ValueError("prog must be None or type of Program") + + ret = cls._main_program_to_register.get(prog, None) + if ret is None: + ret = PyFuncRegister() + ret._idx = core.append_python_callable_object_and_return_id(ret) + ret._token_func_dict = dict() + ret._func_token_dict = dict() + cls._main_program_to_register[prog] = ret + + return ret + + @property + def handle_idx(self): + return self._idx + + def unique_token(self, func): + return self._register_func(func) + + def _register_func(self, func): + if func is None: + raise ValueError("func cannot be None") + + token = self._func_token_dict.get(func, None) + if token is not None: + return token + + token = unique_name.generate('py_func_op_token') + self._token_func_dict[token] = func + self._func_token_dict[func] = token + return token + + def __call__(self, token, *args): + func = self._token_func_dict.get(token, None) + if func is None: + raise ValueError("func has not been registered") + + arg_list = inspect.getargspec(func) + kwargs = dict() + idx = 0 + for arg in arg_list[0]: + kwargs[arg] = args[idx] + idx += 1 + + args = args[idx:] + ret0 = func(*args, **kwargs) + if ret0 is None: + return None + + if not isinstance(ret0, (list, tuple)): + ret0 = (ret0, ) + + ret = [] + for i in six.moves.range(len(ret0)): + if isinstance(ret0[i], core.LoDTensor): + ret.append(ret0[i]) + continue + + if isinstance(ret0[i], np.ndarray): + r = ret0[i] + else: + r = np.array(ret0[i]) + + t = core.LoDTensor() + t.set(r, core.CPUPlace()) + ret.append(t) + + return tuple(ret) + + helper = LayerHelper('py_func', **locals()) + if isinstance(x, Variable): + x = [x] + + if isinstance(out, Variable): + out = [out] + + for each_out in out: + if len(each_out.shape) == 0: + raise ValueError( + 'users should infer shapes of outputs of py_func op manually') + + py_func_reg = PyFuncRegister.get_instance(helper.main_program) + token = py_func_reg.unique_token(func) + + helper.append_op( + type='py_func', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'handle_idx': py_func_reg.handle_idx, + 'token': token}) + return out -- GitLab From 016a06877578a6c862d5fd7eef3c1c75a71adc81 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 10 Dec 2018 21:34:37 +0800 Subject: [PATCH 0117/2367] stop server --- paddle/fluid/framework/async_executor.cc | 4 ++++ paddle/fluid/framework/async_executor.h | 1 + .../fluid/framework/executor_thread_worker.cc | 18 +++++++++--------- paddle/fluid/pybind/async_executor_py.cc | 1 + python/paddle/fluid/async_executor.py | 9 ++++++++- 5 files changed, 23 insertions(+), 10 
deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index f96ff436da9..45a914b70ea 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -83,6 +83,10 @@ uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); } +void AsyncExecutor::StopServer() { + _pslib_ptr->stop_server(); +} + void AsyncExecutor::GatherServers(std::vector& host_sign_list, int node_num) { _pslib_ptr->gather_servers(host_sign_list.data(), node_num); } diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 90d6b46b2f9..4b461262173 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -67,6 +67,7 @@ class AsyncExecutor { void InitWorker(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index); //void ConfigWorker() {} uint64_t StartServer(); + void StopServer(); void GatherServers(std::vector& host_sign_list, int node_num); void InitModel(); void SaveModel(const std::string& path); diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index d8320b422b8..a0455b26efd 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -569,7 +569,6 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { } void AsyncExecutorThreadWorker::PushSparse(int table_id) { - auto slot_dim = _param_config->slot_dim; //TODO auto fea_dim = _param_config->fea_dim;//_current_train_job.fea_dim();TODO auto& features = _features[table_id]; @@ -592,19 +591,20 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { } Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); LoDTensor* g_tensor = g_var->GetMutable(); - //int count = g_tensor->numel(); - float* g = g_tensor->data(); - /* - if (FLAGS_scale_sparse_gradient_with_batch_size) { - Eigen::Map g_mat(g, 1, tensor->numel()); - g_mat *= _batch_size; + if (g_tensor == NULL) { + LOG(ERROR) << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; + exit(-1); } - */ + float* g = g_tensor->data(); Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); LoDTensor* tensor = var->GetMutable(); + if (tensor == NULL) { + LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; + exit(-1); + } int len = tensor->lod()[0].back(); - //assert(slot_dim * len == count); + assert(slot_dim * len == g_tensor->numel()); int64_t* ids = tensor->data(); for (auto id_idx = 0u; id_idx < len; ++id_idx){ if (ids[id_idx] == 0) { diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index eca46fbad55..8dfba0d2694 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -51,6 +51,7 @@ void BindAsyncExecutor(py::module* m) { .def("init_server", &framework::AsyncExecutor::InitServer) .def("init_worker", &framework::AsyncExecutor::InitWorker) .def("start_server", &framework::AsyncExecutor::StartServer) + .def("stop_server", &framework::AsyncExecutor::StopServer) .def("gather_servers", &framework::AsyncExecutor::GatherServers) .def("init_model", &framework::AsyncExecutor::InitModel) .def("save_model", &framework::AsyncExecutor::SaveModel); diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 3451d1edb54..76fdb5b0e26 100644 --- 
a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -151,7 +151,10 @@ class AsyncExecutor(object): self.executor.run_from_files(program_desc, data_feed.desc(), filelist, thread_num, fetch_var_names, debug) - self.instance.barrier_all() + self.instance.barrier_all() #worker do all things + if self.instance.is_first_worker(): + self.executor.stop_server() + self.instance.barrier_all() #sync def config_distributed_nodes(self, dist_opt): @@ -164,6 +167,9 @@ class AsyncExecutor(object): def get_instance(self): return self.instance + #def stop_server(self): + # self.executor.stop_server() + def init_server(self, dist_desc): self.executor.init_server(dist_desc, self.instance._rankid) ip = self.executor.start_server() @@ -174,6 +180,7 @@ class AsyncExecutor(object): self.instance.barrier_all() #wait all worker start self.instance.barrier_all() #wait init model self.instance.barrier_all() #wait worker do all things + self.instance.barrier_all() #sync def init_worker(self, dist_desc): self.instance.barrier_all() #wait all server start -- GitLab From 1735022a1bdaa5777009bf537dd6c09be17e33fc Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 11 Dec 2018 09:36:07 +0800 Subject: [PATCH 0118/2367] fix clang test=develop --- paddle/fluid/framework/op_registry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 36673e48c20..6d39bb3c524 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -319,7 +319,7 @@ struct OpKernelRegistrarFunctorEx Date: Tue, 11 Dec 2018 10:41:18 +0800 Subject: [PATCH 0119/2367] fix numel nce and prefetch test=develop --- paddle/fluid/operators/nce_op.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 99a3baba920..2c97eef096e 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -49,7 +49,6 @@ void PrepareSamples(const framework::ExecutionContext &context, auto label = context.Input("Label"); const int64_t *label_data = label->data(); auto label_dims = label->dims(); - // int num_total_classes = context.Attr("num_total_classes"); // for unitest std::vector custom_neg_classes = context.Attr>("custom_neg_classes"); -- GitLab From 0a7c7e97afc20d26406de27a22c9cd4b7edad8b0 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 11 Dec 2018 10:45:16 +0800 Subject: [PATCH 0120/2367] test zero output of split_selected_rows_op test=develop --- paddle/fluid/pybind/pybind.cc | 2 ++ .../fluid/tests/unittests/test_split_selected_rows_op.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58ef3da0b23..9d92529e4ee 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -298,6 +298,8 @@ PYBIND11_MODULE(core, m) { .def("get_tensor", [](SelectedRows &self) { return self.mutable_value(); }, py::return_value_policy::reference) + .def("numel", + [](SelectedRows &self) -> int64_t { return self.value().numel(); }) .def("set_height", &SelectedRows::set_height) .def("height", &SelectedRows::height) .def("set_rows", diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py index 50204b8a77c..f8847e1570d 100644 --- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py @@ -63,6 +63,7 @@ class TestSpliteSelectedRows(unittest.TestCase): # expected output selected rows expected_out0_rows = [0, 4] expected_out1_rows = [0, 2] + expected_out2_rows = [] expected_out4_rows = [0] op = Operator( @@ -75,6 +76,7 @@ class TestSpliteSelectedRows(unittest.TestCase): self.assertEqual(outs[0].rows(), expected_out0_rows) self.assertEqual(outs[1].rows(), expected_out1_rows) + self.assertEqual(outs[2].rows(), expected_out2_rows) self.assertEqual(outs[4].rows(), expected_out4_rows) self.assertEqual(outs[0].height(), height_sections[0]) @@ -84,6 +86,9 @@ class TestSpliteSelectedRows(unittest.TestCase): self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 1]) self.assertAlmostEqual(8.0, np.array(outs[4].get_tensor())[0, 1]) + self.assertEqual(outs[2].numel(), 0) + self.assertEqual(outs[3].numel(), 0) + def check_grad_with_place(self, place): scope = core.Scope() height = 10 -- GitLab From 60d71a9e2987941487f7f1e44d1e1850b41a1e3d Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 11 Dec 2018 11:12:49 +0800 Subject: [PATCH 0121/2367] skip op py file --- python/paddle/fluid/distributed/downpour.py | 1 + python/paddle/fluid/distributed/node.py | 2 +- python/paddle/fluid/distributed/ps_pb2.py | 93 +++++++++++---------- 3 files changed, 52 insertions(+), 44 deletions(-) diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 654fa6fab6f..c1762dd7688 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -66,5 +66,6 @@ class DownpourSGD(object): # Todo(guru4elephant): figure out how to support more sparse parameters # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] + ps_param.trainer_param.skip_op.extend(worker_skipped_ops) ps_param_str = text_format.MessageToString(ps_param) return [ps_param, worker_skipped_ops] diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index c245dc4db8d..1f4aeeac738 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -60,7 +60,7 @@ class DownpourServer(Server): table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99 table.accessor.dense_sgd_param.naive.learning_rate = 0.0002 fea_dim = 0 - for param in param_var: + for param in filter(lambda x: x.name.find("embedding") == -1, param_var): fea_dim += reduce(lambda x, y: x * y, param.shape, 1) table.accessor.fea_dim = fea_dim diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py index b82c649e143..978b18d0d5e 100644 --- a/python/paddle/fluid/distributed/ps_pb2.py +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -20,7 +20,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='ps.proto', package='paddle', syntax='proto2', - serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 
\x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd1\x01\n\x16ServerServiceParameter\x12(\n\x0cserver_class\x18\x01 \x01(\t:\x12\x41\x62\x61\x63usBrpcPsServer\x12(\n\x0c\x63lient_class\x18\x02 \x01(\t:\x12\x41\x62\x61\x63usBrpcPsClient\x12&\n\rservice_class\x18\x03 \x01(\t:\x0f\x41\x62\x61\x63usPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 
\x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') + serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 
\x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 
\x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') ) _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -41,8 +41,8 @@ _TABLETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3262, - serialized_end=3314, + serialized_start=3286, + serialized_end=3338, ) _sym_db.RegisterEnumDescriptor(_TABLETYPE) @@ -108,8 +108,8 @@ _PSCMDID = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3317, - serialized_end=3634, + serialized_start=3341, + serialized_end=3658, ) _sym_db.RegisterEnumDescriptor(_PSCMDID) @@ -148,8 +148,8 @@ _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3230, - serialized_end=3260, + serialized_start=3254, + serialized_end=3284, ) _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) @@ -342,7 +342,7 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( - name='pull_dense_per_batch', full_name='paddle.DownpourTrainerParameter.pull_dense_per_batch', index=2, + name='push_sparse_per_batch', full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch', index=2, number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, @@ -355,6 +355,13 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), + _descriptor.FieldDescriptor( + name='skip_op', full_name='paddle.DownpourTrainerParameter.skip_op', index=4, + number=5, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), ], 
extensions=[ ], @@ -368,7 +375,7 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( oneofs=[ ], serialized_start=557, - serialized_end=745, + serialized_end=763, ) @@ -419,8 +426,8 @@ _DENSETABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=747, - serialized_end=870, + serialized_start=765, + serialized_end=888, ) @@ -478,8 +485,8 @@ _SPARSETABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=872, - serialized_end=994, + serialized_start=890, + serialized_end=1012, ) @@ -516,8 +523,8 @@ _DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=997, - serialized_end=1131, + serialized_start=1015, + serialized_end=1149, ) @@ -575,8 +582,8 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1134, - serialized_end=1343, + serialized_start=1152, + serialized_end=1367, ) @@ -641,8 +648,8 @@ _TABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1346, - serialized_end=1537, + serialized_start=1370, + serialized_end=1561, ) @@ -721,8 +728,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1540, - serialized_end=1909, + serialized_start=1564, + serialized_end=1933, ) @@ -794,8 +801,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1912, - serialized_end=2118, + serialized_start=1936, + serialized_end=2142, ) @@ -839,8 +846,8 @@ _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2120, - serialized_end=2203, + serialized_start=2144, + serialized_end=2227, ) @@ -898,8 +905,8 @@ _PSREQUESTMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2205, - serialized_end=2306, + serialized_start=2229, + serialized_end=2330, ) @@ -950,8 +957,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2308, - serialized_end=2427, + serialized_start=2332, + serialized_end=2451, ) @@ -1009,8 +1016,8 @@ _DENSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2430, - serialized_end=2655, + serialized_start=2454, + serialized_end=2679, ) @@ -1068,8 +1075,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2658, - serialized_end=2792, + serialized_start=2682, + serialized_end=2816, ) @@ -1106,8 +1113,8 @@ _NAIVESGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2794, - serialized_end=2860, + serialized_start=2818, + serialized_end=2884, ) @@ -1137,8 +1144,8 @@ _SUMMARYSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2862, - serialized_end=2921, + serialized_start=2886, + serialized_end=2945, ) @@ -1168,8 +1175,8 @@ _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2923, - serialized_end=2969, + serialized_start=2947, + serialized_end=2993, ) @@ -1213,8 +1220,8 @@ _PSRESPONSEMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2971, - serialized_end=3044, + serialized_start=2995, + serialized_end=3068, ) @@ -1287,8 +1294,8 @@ _FSCLIENTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=3047, - serialized_end=3260, + serialized_start=3071, + 
serialized_end=3284, ) _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER -- GitLab From 644c13a3874b565b7bcda9beab9e9bb271e4ba3e Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 11 Dec 2018 03:38:58 +0000 Subject: [PATCH 0122/2367] fix compile error --- paddle/fluid/inference/tests/api/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 6901aac3c34..8a4bc04b678 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,8 +1,7 @@ -set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) +set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor benchmark) if(WITH_GPU AND TENSORRT_FOUND) - set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} -ir_pass_manager analysis_predictor benchmark) + set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor) endif() function(download_model install_dir model_name) -- GitLab From 7604b1ad519b9581995e4d5aed148f2893c70702 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Dec 2018 13:14:50 +0800 Subject: [PATCH 0123/2367] Fix Eigen macro when using GPU The macro should be defined by compiler rather than by source. test=develop --- cmake/configure.cmake | 1 + paddle/fluid/operators/bilinear_tensor_product_op.cu | 1 - paddle/fluid/operators/cos_sim_op.cu | 2 -- paddle/fluid/operators/crop_op.cu | 2 -- paddle/fluid/operators/dropout_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_add_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_div_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_max_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_min_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_mul_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_pow_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_sub_op.cu | 2 -- paddle/fluid/operators/expand_op.cu | 3 --- paddle/fluid/operators/gru_unit_op.cu | 2 -- paddle/fluid/operators/hinge_loss_op.cu | 2 -- paddle/fluid/operators/huber_loss_op.cu | 2 -- paddle/fluid/operators/im2sequence_op.cu | 2 -- paddle/fluid/operators/isfinite_op.cu | 2 -- paddle/fluid/operators/l1_norm_op.cu | 2 -- paddle/fluid/operators/log_loss_op.cu | 2 -- paddle/fluid/operators/math/context_project.cu | 3 --- paddle/fluid/operators/math/math_function.cu | 2 -- paddle/fluid/operators/math/sequence2batch.cu | 2 -- paddle/fluid/operators/math/softmax.cu | 3 --- paddle/fluid/operators/mean_op.cu | 3 --- paddle/fluid/operators/optimizers/adadelta_op.cu | 2 -- paddle/fluid/operators/optimizers/adagrad_op.cu | 2 -- paddle/fluid/operators/optimizers/adam_op.cu | 2 -- paddle/fluid/operators/optimizers/adamax_op.cu | 2 -- paddle/fluid/operators/optimizers/decayed_adagrad_op.cu | 2 -- paddle/fluid/operators/optimizers/ftrl_op.cu | 2 -- paddle/fluid/operators/optimizers/proximal_adagrad_op.cu | 2 -- paddle/fluid/operators/optimizers/proximal_gd_op.cu | 2 -- paddle/fluid/operators/optimizers/rmsprop_op.cu | 2 -- paddle/fluid/operators/pad_constant_like_op.cu | 2 -- paddle/fluid/operators/pad_op.cu | 2 -- paddle/fluid/operators/sequence_ops/sequence_pool_op.cu | 3 --- paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu | 2 -- paddle/fluid/operators/smooth_l1_loss_op.cu | 3 --- 
paddle/fluid/operators/softmax_with_cross_entropy_op.cu | 3 --- paddle/fluid/operators/squared_l2_distance_op.cu | 3 --- paddle/fluid/operators/squared_l2_norm_op.cu | 2 -- paddle/fluid/operators/sum_op.cu | 2 -- paddle/fluid/platform/device_context.h | 1 - paddle/fluid/platform/float16.h | 3 --- 45 files changed, 1 insertion(+), 95 deletions(-) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4e17ddee739..51f7a61631d 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -86,6 +86,7 @@ endif(NOT WITH_GOLANG) if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) + add_definitions(-DEIGEN_USE_GPU) FIND_PACKAGE(CUDA REQUIRED) diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu index 9426ffbe174..c2b4f69e685 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cu +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU #include "paddle/fluid/operators/bilinear_tensor_product_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu index 82205e9c754..3d144ca29d9 100644 --- a/paddle/fluid/operators/cos_sim_op.cu +++ b/paddle/fluid/operators/cos_sim_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/cos_sim_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu index b75678217e3..66cb5c452de 100644 --- a/paddle/fluid/operators/crop_op.cu +++ b/paddle/fluid/operators/crop_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/crop_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index e011f47e086..d65491267de 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include #include #include diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 2fb7eeb4b9e..fed12785f47 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index c5a1a7e08d8..1a149298fd3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index a90dcd3ecf0..5d086a1b29f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index ab77709c28c..cf93e5a97a3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 4d16bc38e1d..833c4072826 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu index 6ee0779f23b..9263dbfebfd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu @@ -8,8 +8,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 8d9bf7c4d81..6f17d3292f3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu index 60363bfc86d..d95c9b61802 100644 --- a/paddle/fluid/operators/expand_op.cu +++ b/paddle/fluid/operators/expand_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/expand_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu index fc92b3d4a7a..37689901ecb 100644 --- a/paddle/fluid/operators/gru_unit_op.cu +++ b/paddle/fluid/operators/gru_unit_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/gru_unit_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/hinge_loss_op.cu b/paddle/fluid/operators/hinge_loss_op.cu index 9c0a85bee6e..b5ea0a702e0 100644 --- a/paddle/fluid/operators/hinge_loss_op.cu +++ b/paddle/fluid/operators/hinge_loss_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/hinge_loss_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu index 659464df9dc..09c743c4275 100644 --- a/paddle/fluid/operators/huber_loss_op.cu +++ b/paddle/fluid/operators/huber_loss_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/huber_loss_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/im2sequence_op.cu b/paddle/fluid/operators/im2sequence_op.cu index e0a5a90c1c3..1c34640618d 100644 --- a/paddle/fluid/operators/im2sequence_op.cu +++ b/paddle/fluid/operators/im2sequence_op.cu @@ -11,8 +11,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/im2sequence_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu index 8d1268b18c6..995969cd42f 100644 --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -11,8 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/isfinite_op.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/l1_norm_op.cu b/paddle/fluid/operators/l1_norm_op.cu index 1b48571dd73..a5c29bbf5de 100644 --- a/paddle/fluid/operators/l1_norm_op.cu +++ b/paddle/fluid/operators/l1_norm_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/l1_norm_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/operators/log_loss_op.cu index e8bf7d8159b..280913c43a2 100644 --- a/paddle/fluid/operators/log_loss_op.cu +++ b/paddle/fluid/operators/log_loss_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/log_loss_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu index 16205c0e145..f04b2d15349 100644 --- a/paddle/fluid/operators/math/context_project.cu +++ b/paddle/fluid/operators/math/context_project.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/math/context_project.h" namespace paddle { diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 79b7538ad05..9372d63f0be 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#define EIGEN_USE_GPU #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/blas.h" diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu index be73adfc0cb..9ab13659c1c 100644 --- a/paddle/fluid/operators/math/sequence2batch.cu +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/math/sequence2batch.h" namespace paddle { diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 2e9669049e3..71d13739826 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 413b8ace67b..921c2e12989 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/mean_op.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cu b/paddle/fluid/operators/optimizers/adadelta_op.cu index 3fbfee5df05..562a157f063 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cu +++ b/paddle/fluid/operators/optimizers/adadelta_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/adadelta_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 4efe56855a4..5043468d4c5 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/optimizers/adagrad_op.h" diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index e8090ebacfe..4eb2db717d4 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/adam_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/adamax_op.cu b/paddle/fluid/operators/optimizers/adamax_op.cu index e54adcb142f..80e0219d441 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cu +++ b/paddle/fluid/operators/optimizers/adamax_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/adamax_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu index 84d65e39329..dc568802a2b 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cu b/paddle/fluid/operators/optimizers/ftrl_op.cu index f836b75df93..acf8e38ca0f 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cu +++ b/paddle/fluid/operators/optimizers/ftrl_op.cu @@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/ftrl_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu index d1c1f747b70..591dead3b12 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu @@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cu b/paddle/fluid/operators/optimizers/proximal_gd_op.cu index 7aa0e101500..d556fa74f19 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu @@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/proximal_gd_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cu b/paddle/fluid/operators/optimizers/rmsprop_op.cu index 69e35a309e0..8b17d6a0204 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.cu +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/rmsprop_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/pad_constant_like_op.cu b/paddle/fluid/operators/pad_constant_like_op.cu index ea695779045..9e62a6dc9d3 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cu +++ b/paddle/fluid/operators/pad_constant_like_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/pad_constant_like_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/pad_op.cu b/paddle/fluid/operators/pad_op.cu index 9cddef9cf1d..95098a8dca3 100644 --- a/paddle/fluid/operators/pad_op.cu +++ b/paddle/fluid/operators/pad_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/pad_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu index 63cd47a38a0..4897474a485 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index 9aadac1a416..a1fbc7e5fab 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu index dfbb5c90588..e5df479090f 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cu +++ b/paddle/fluid/operators/smooth_l1_loss_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/smooth_l1_loss_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 6d48796191d..cee3e87037e 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu index 3e80ae8dd22..c9264da8382 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cu +++ b/paddle/fluid/operators/squared_l2_distance_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/squared_l2_distance_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/squared_l2_norm_op.cu b/paddle/fluid/operators/squared_l2_norm_op.cu index 87830413da3..e31cfeb78ab 100644 --- a/paddle/fluid/operators/squared_l2_norm_op.cu +++ b/paddle/fluid/operators/squared_l2_norm_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/squared_l2_norm_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index db4c2d6c115..6125ed07b6d 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -8,8 +8,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 3edd7279780..ce1494f1702 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/gpu_info.h" -#define EIGEN_USE_GPU #endif #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index 9d48557caf7..98afe843c00 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -71,9 +71,6 @@ struct float16; } // namespace platform } // namespace paddle -// NOTE(): -// Do not move the eigen.h header, otherwise the eigen_vector will failed. -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/hostdevice.h" #include "unsupported/Eigen/CXX11/Tensor" -- GitLab From eab47459658b10ec799a5dccbfd9bf8f45b9771a Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 26 Nov 2018 15:53:33 +0800 Subject: [PATCH 0124/2367] add adaptive mode for pool. --- paddle/fluid/operators/math/pooling.cc | 202 ++++++++++++++++++------- paddle/fluid/operators/math/pooling.cu | 26 ++-- 2 files changed, 161 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 8df43bb6161..68fed9fd4eb 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -19,6 +19,16 @@ namespace paddle { namespace operators { namespace math { +static inline int ADAPT_START_INDEX(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +static inline int ADAPT_END_INDEX(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + /* * All tensors are in NCHW format. * Ksize, strides, paddings are two elements. 
These two elements represent @@ -31,7 +41,7 @@ class Pool2dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -54,13 +64,23 @@ class Pool2dFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { @@ -68,8 +88,9 @@ class Pool2dFunctor { pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } @@ -94,7 +115,7 @@ class Pool2dGradFunctor { const framework::Tensor& output, const framework::Tensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_grad_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -118,15 +139,26 @@ class Pool2dGradFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - int pool_size = exclusive ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -251,7 +283,7 @@ class Pool3dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -279,17 +311,32 @@ class Pool3dFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + } else { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int output_idx = (pd * output_height + ph) * output_width + pw; T ele = pool_process.initial(); for (int d = dstart; d < dend; ++d) { @@ -302,7 +349,7 @@ class Pool3dFunctor { } } int pool_size = - exclusive + (exclusive || adaptive) ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); @@ -330,7 +377,7 @@ class Pool3dGradFunctor { const framework::Tensor& output, const framework::Tensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_grad_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -359,21 +406,35 @@ class Pool3dGradFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + } else { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int pool_size = - exclusive + (exclusive || adaptive) ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; float scale = 1.0 / pool_size; @@ -517,8 +578,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -541,13 +602,23 @@ class MaxPool2dWithIndexFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } T1 ele = static_cast(-FLT_MAX); int index = -1; @@ -666,17 +737,32 @@ class MaxPool3dWithIndexFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + } else { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int output_idx = (pd * output_height + ph) * output_width + pw; T1 ele = static_cast(-FLT_MAX); diff --git a/paddle/fluid/operators/math/pooling.cu 
b/paddle/fluid/operators/math/pooling.cu index cdc79e207aa..06e92665c70 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -29,7 +29,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, T* output_data) { + bool exclusive, bool adaptive, T* output_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -37,13 +37,21 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); - hstart = max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); - wstart = max(wstart, 0); + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int hstart = ph * stride_height - padding_height; + int hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + int wstart = pw * stride_width - padding_width; + int wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + } input_data += (batch_idx * channels + c) * input_height * input_width; T ele = pool_process.initial(); @@ -52,8 +60,8 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size = (exclusive || adaptive) ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); output_data[index] = ele; } -- GitLab From 266c6856c90836296f908afa5fff3e08b3ebb718 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 28 Nov 2018 22:09:23 +0800 Subject: [PATCH 0125/2367] add adaptive pool 2d & 3d. 
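Adaptive mode drops the explicit ksize/stride/padding arithmetic: output cell ph covers the half-open input range [floor(ph * in / out), ceil((ph + 1) * in / out)), so the windows always tile the whole input even when the input size is not divisible by the output size, and the divisor in average pooling is the true window area (which is why pool_size uses the (exclusive || adaptive) branch). A minimal self-contained 1-D sketch of this index math, for illustration only — it mirrors ADAPT_START_INDEX / ADAPT_END_INDEX above but is not the operator code:

#include <cmath>
#include <cstdio>

// Same index math as ADAPT_START_INDEX / ADAPT_END_INDEX in pooling.cc:
// output cell ph covers the input range [floor(ph*in/out), ceil((ph+1)*in/out)).
static int AdaptStart(int ph, int in, int out) {
  return static_cast<int>(std::floor(static_cast<double>(ph * in) / out));
}

static int AdaptEnd(int ph, int in, int out) {
  return static_cast<int>(std::ceil(static_cast<double>((ph + 1) * in) / out));
}

int main() {
  const int in = 5, out = 3;
  const float x[5] = {1, 2, 3, 4, 5};
  for (int ph = 0; ph < out; ++ph) {
    const int s = AdaptStart(ph, in, out);
    const int e = AdaptEnd(ph, in, out);
    float sum = 0.f;
    for (int i = s; i < e; ++i) sum += x[i];
    // Windows [0,2), [1,4), [3,5) -> averages 1.5, 3.0, 4.5; the divisor is
    // always the actual window size (e - s), never ksize.
    std::printf("out[%d] = %.1f (window [%d,%d))\n", ph, sum / (e - s), s, e);
  }
  return 0;
}

For in = 5, out = 3 this yields windows [0,2), [1,4), [3,5); adjacent windows may overlap by one element when out does not evenly divide in, which is the expected adaptive-pooling behavior.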
test=develop --- paddle/fluid/API.spec | 2 + paddle/fluid/operators/math/pooling.cc | 143 +++--- paddle/fluid/operators/math/pooling.cu | 411 +++++++++++------- paddle/fluid/operators/math/pooling.h | 20 +- paddle/fluid/operators/pool_op.cc | 26 +- paddle/fluid/operators/pool_op.h | 16 +- paddle/fluid/operators/pool_with_index_op.cc | 27 +- paddle/fluid/operators/pool_with_index_op.h | 12 +- paddle/fluid/operators/spp_op.h | 6 +- python/paddle/fluid/layers/nn.py | 186 ++++++++ .../fluid/tests/unittests/test_layers.py | 22 + .../fluid/tests/unittests/test_pool2d_op.py | 91 ++-- .../fluid/tests/unittests/test_pool3d_op.py | 121 ++++-- .../fluid/tests/unittests/test_pool_max_op.py | 95 +++- 14 files changed, 860 insertions(+), 318 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index fd4cf92d85d..87ed586aad9 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -77,6 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 68fed9fd4eb..b4ee82add31 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -61,24 +61,26 @@ class Pool2dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, 
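// Non-adaptive path: the window is placed by stride and padding,
// then clipped to the valid input range [0, input_height).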
output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -136,24 +138,26 @@ class Pool2dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } int pool_size = (exclusive || adaptive) @@ -308,33 +312,36 @@ class Pool3dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); } else { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); dstart = std::max(dstart, 0); } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { 
- int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } int output_idx = (pd * output_height + ph) * output_width + pw; @@ -403,33 +410,36 @@ class Pool3dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); } else { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); dstart = std::max(dstart, 0); } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -599,24 +609,26 @@ class MaxPool2dWithIndexFunctor { T1* output_data = output->mutable_data(context.GetPlace()); T2* mask_data = mask->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + 
ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -655,7 +667,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_height = input_grad->dims()[2]; @@ -708,8 +720,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -734,33 +746,36 @@ class MaxPool3dWithIndexFunctor { T1* output_data = output->mutable_data(context.GetPlace()); T2* mask_data = mask->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); } else { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); dstart = std::max(dstart, 0); } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + 
ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -804,7 +819,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_depth = input_grad->dims()[2]; diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 06e92665c70..5f3b82ed553 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -21,6 +21,18 @@ namespace paddle { namespace operators { namespace math { +__device__ __forceinline__ int ADAPT_START_INDEX(int ph, int input_size, + int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +__device__ __forceinline__ int ADAPT_END_INDEX(int ph, int input_size, + int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + template __global__ void KernelPool2D(const int nthreads, const T* input_data, const int channels, const int input_height, @@ -37,19 +49,21 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; + int hstart, hend; + int wstart, wend; if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = min(hstart + ksize_height, input_height); hstart = max(hstart, 0); - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = min(wstart + ksize_width, input_width); wstart = max(wstart, 0); } @@ -74,7 +88,7 @@ __global__ void KernelPool2DGrad( const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, T* input_grad) { + PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int offsetW = index % input_width + padding_width; @@ -82,14 +96,24 @@ __global__ void KernelPool2DGrad( int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; - int phstart = (offsetH < ksize_height) - ? 0 - : (offsetH - ksize_height) / stride_height + 1; - int pwstart = (offsetW < ksize_width) - ? 
0 - : (offsetW - ksize_width) / stride_width + 1; - int phend = min(offsetH / stride_height + 1, output_height); - int pwend = min(offsetW / stride_width + 1, output_width); + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + phstart = (offsetH < ksize_height) + ? 0 + : (offsetH - ksize_height) / stride_height + 1; + pwstart = (offsetW < ksize_width) + ? 0 + : (offsetW - ksize_width) / stride_width + 1; + phend = min(offsetH / stride_height + 1, output_height); + pwend = min(offsetW / stride_width + 1, output_width); + } T gradient = 0; T input = input_data[index]; int output_idx = @@ -98,14 +122,22 @@ __global__ void KernelPool2DGrad( output_grad += output_idx; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size; + if (adaptive) { + pool_size = static_cast(ceil(static_cast(input_height) / + ksize_height)) * + static_cast( + ceil(static_cast(input_width) / ksize_width)); + } else { + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + pool_size = exclusive ? 
(hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; + } int output_sub_idx = ph * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], output_grad[output_sub_idx], @@ -189,7 +221,7 @@ void Pool2dDirectCUDAFunctor::operator()( KernelPool2D<<>>( nthreads, input, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, - padding_height, padding_width, pool_compute, exclusive, output); + padding_height, padding_width, pool_compute, exclusive, false, output); } /* @@ -204,7 +236,7 @@ class Pool2dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -231,7 +263,7 @@ class Pool2dFunctor { nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, exclusive, - output_data); + adaptive, output_data); } }; @@ -250,7 +282,8 @@ class Pool2dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -278,7 +311,7 @@ class Pool2dGradFunctor { nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, - pool_process, exclusive, input_grad_data); + pool_process, exclusive, adaptive, input_grad_data); } }; @@ -367,7 +400,7 @@ __global__ void KernelPool3D( const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, T* output_data) { + PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -376,15 +409,30 @@ __global__ void KernelPool3D( int c = (index / output_width / output_height / output_depth) % channels; int batch_idx = index / output_width / output_height / output_depth / channels; - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); + + int dstart, dend; + int hstart, hend; + int wstart, wend; + if (adaptive) { + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); + + wstart = ADAPT_START_INDEX(pw, 
input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + dstart = pd * stride_depth - padding_depth; + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + dend = min(dstart + ksize_depth, input_depth); + hend = min(hstart + ksize_height, input_height); + wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + } T ele = pool_process.initial(); input_data += (batch_idx * channels + c) * input_depth * input_height * input_width; @@ -396,7 +444,7 @@ __global__ void KernelPool3D( } } } - int pool_size = exclusive + int pool_size = (exclusive || adaptive) ? (dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); @@ -413,7 +461,7 @@ __global__ void KernelPool3DGrad( const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, T* input_grad) { + bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int offsetW = index % input_width + padding_width; @@ -423,18 +471,31 @@ __global__ void KernelPool3DGrad( int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; - int pdstart = (offsetD < ksize_depth) - ? 0 - : (offsetD - ksize_depth) / stride_depth + 1; - int phstart = (offsetH < ksize_height) - ? 0 - : (offsetH - ksize_height) / stride_height + 1; - int pwstart = (offsetW < ksize_width) - ? 0 - : (offsetW - ksize_width) / stride_width + 1; - int pdend = min((offsetD) / stride_depth + 1, output_depth); - int phend = min((offsetH) / stride_height + 1, output_height); - int pwend = min((offsetW) / stride_width + 1, output_width); + int pdstart, pdend; + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + pdstart = offsetD * output_depth / input_depth; + pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + pdstart = (offsetD < ksize_depth) + ? 0 + : (offsetD - ksize_depth) / stride_depth + 1; + phstart = (offsetH < ksize_height) + ? 0 + : (offsetH - ksize_height) / stride_height + 1; + pwstart = (offsetW < ksize_width) + ? 
0 + : (offsetW - ksize_width) / stride_width + 1; + pdend = min((offsetD) / stride_depth + 1, output_depth); + phend = min((offsetH) / stride_height + 1, output_height); + pwend = min((offsetW) / stride_width + 1, output_width); + } T gradient = 0; T input = input_data[index]; @@ -447,18 +508,29 @@ __global__ void KernelPool3DGrad( for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - int pool_size = - exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + int pool_size; + if (adaptive) { + pool_size = + static_cast( + ceil(static_cast(input_depth) / ksize_depth)) * + static_cast( + ceil(static_cast(input_height) / ksize_height)) * + static_cast( + ceil(static_cast(input_width) / ksize_width)); + } else { + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + pool_size = + exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart) + : ksize_depth * ksize_height * ksize_width; + } int output_sub_idx = (pd * output_height + ph) * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], output_grad[output_sub_idx], @@ -533,7 +605,7 @@ class Pool3dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -567,7 +639,7 @@ class Pool3dFunctor { input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, padding_depth, padding_height, padding_width, pool_process, exclusive, - output_data); + adaptive, output_data); } }; @@ -586,7 +658,8 @@ class Pool3dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -622,7 +695,7 @@ class Pool3dGradFunctor { input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, input_grad_data); + padding_width, pool_process, exclusive, adaptive, input_grad_data); } }; @@ -711,7 +784,7 @@ __global__ void KernelMaxPool2dWithIdx( const int input_height, const int input_width, const int output_height, const int 
output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, - const int padding_width, T1* output_data, T2* mask_data) { + const int padding_width, bool adaptive, T1* output_data, T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -719,13 +792,23 @@ __global__ void KernelMaxPool2dWithIdx( int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); - hstart = max(hstart, 0); + int hstart, hend; + int wstart, wend; + if (adaptive) { + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); - wstart = max(wstart, 0); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + hstart = ph * stride_height - padding_height; + hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + wstart = pw * stride_width - padding_width; + wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + } input_data += (batch_idx * channels + c) * input_height * input_width; T1 ele = -FLT_MAX; @@ -750,36 +833,46 @@ __global__ void KernelMaxPool2DWithIdxGrad( const int channels, const int input_height, const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, T1* input_grad) { + const int padding_height, const int padding_width, bool adaptive, + T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset = index % input_width; - int h_offset = (index / input_width) % input_height; - int c_offset = (index / input_width / input_height) % channels; + int offsetW = index % input_width; + int offsetH = (index / input_width) % input_height; + int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; - int ph_start = - (h_offset + padding_height < ksize_height) - ? 0 - : (h_offset + padding_height - ksize_height) / stride_height + 1; - int pw_start = - (w_offset + padding_width < ksize_width) - ? 0 - : (w_offset + padding_width - ksize_width) / stride_width + 1; - int ph_end = - min((h_offset + padding_height) / stride_height + 1, output_height); - int pw_end = - min((w_offset + padding_width) / stride_width + 1, output_width); + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + phstart = + (offsetH + padding_height < ksize_height) + ? 0 + : (offsetH + padding_height - ksize_height) / stride_height + 1; + pwstart = + (offsetW + padding_width < ksize_width) + ? 
0 + : (offsetW + padding_width - ksize_width) / stride_width + 1; + phend = + min((offsetH + padding_height) / stride_height + 1, output_height); + pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + } T1 gradient = 0; - int input_current_featuremap_idx = h_offset * input_width + w_offset; + int input_current_featuremap_idx = offsetH * input_width + offsetW; int output_idx = - (batch_idx * channels + c_offset) * output_height * output_width; + (batch_idx * channels + offsetC) * output_height * output_width; mask_data += output_idx; output_grad += output_idx; - for (int ph = ph_start; ph < ph_end; ++ph) { - for (int pw = pw_start; pw < pw_end; ++pw) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { if (mask_data[ph * output_width + pw] == input_current_featuremap_idx) gradient += output_grad[ph * output_width + pw]; } @@ -799,8 +892,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -827,7 +920,8 @@ class MaxPool2dWithIndexFunctor { KernelMaxPool2dWithIdx<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, output_data, mask_data); + stride_width, padding_height, padding_width, adaptive, output_data, + mask_data); } }; @@ -843,7 +937,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -870,7 +964,7 @@ class MaxPool2dWithIndexGradFunctor { KernelMaxPool2DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, + stride_height, stride_width, padding_height, padding_width, adaptive, input_grad_data); } }; @@ -892,7 +986,7 @@ __global__ void KernelMaxPool3DWithIdx( const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, - T1* output_data, T2* mask_data) { + bool adaptive, T1* output_data, T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -902,15 +996,29 @@ __global__ void KernelMaxPool3DWithIdx( int batch_idx = index / output_width / output_height / output_depth / channels; - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - 
hstart = max(hstart, 0); - wstart = max(wstart, 0); + int dstart, dend; + int hstart, hend; + int wstart, wend; + if (adaptive) { + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); + + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + dstart = pd * stride_depth - padding_depth; + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + dend = min(dstart + ksize_depth, input_depth); + hend = min(hstart + ksize_height, input_height); + wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + } T1 ele = -FLT_MAX; int max_index = -1; @@ -940,46 +1048,56 @@ __global__ void KernelMaxPool3DWithIdxGrad( const int output_width, const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, T1* input_grad) { + const int padding_width, bool adaptive, T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset = index % input_width; - int h_offset = (index / input_width) % input_height; - int d_offset = (index / input_width / input_height) % input_depth; - int c_offset = - (index / input_width / input_height / input_depth) % channels; + int offsetW = index % input_width; + int offsetH = (index / input_width) % input_height; + int offsetD = (index / input_width / input_height) % input_depth; + int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; - int pd_start = - (d_offset + padding_depth < ksize_depth) - ? 0 - : (d_offset + padding_depth - ksize_depth) / stride_depth + 1; - int ph_start = - (h_offset + padding_height < ksize_height) - ? 0 - : (h_offset + padding_height - ksize_height) / stride_height + 1; - int pw_start = - (w_offset + padding_width < ksize_width) - ? 0 - : (w_offset + padding_width - ksize_width) / stride_width + 1; - int pd_end = - min((d_offset + padding_depth) / stride_depth + 1, output_depth); - int ph_end = - min((h_offset + padding_height) / stride_height + 1, output_height); - int pw_end = - min((w_offset + padding_width) / stride_width + 1, output_width); + int pdstart, pdend; + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + pdstart = offsetD * output_depth / input_depth; + pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + pdstart = + (offsetD + padding_depth < ksize_depth) + ? 0 + : (offsetD + padding_depth - ksize_depth) / stride_depth + 1; + phstart = + (offsetH + padding_height < ksize_height) + ? 0 + : (offsetH + padding_height - ksize_height) / stride_height + 1; + pwstart = + (offsetW + padding_width < ksize_width) + ? 
0 + : (offsetW + padding_width - ksize_width) / stride_width + 1; + pdend = min((offsetD + padding_depth) / stride_depth + 1, output_depth); + phend = + min((offsetH + padding_height) / stride_height + 1, output_height); + pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + } T1 gradient = 0; int input_current_feature_map_idx = - (d_offset * input_height + h_offset) * input_width + w_offset; - int output_idx = (batch_idx * channels + c_offset) * output_depth * + (offsetD * input_height + offsetH) * input_width + offsetW; + int output_idx = (batch_idx * channels + offsetC) * output_depth * output_height * output_width; mask += output_idx; output_grad += output_idx; - for (int pd = pd_start; pd < pd_end; ++pd) { - for (int ph = ph_start; ph < ph_end; ++ph) { - for (int pw = pw_start; pw < pw_end; ++pw) { + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { if (mask[(pd * output_height + ph) * output_width + pw] == input_current_feature_map_idx) gradient += @@ -1002,8 +1120,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1037,7 +1155,8 @@ class MaxPool3dWithIndexFunctor { nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, output_data, mask_data); + padding_depth, padding_height, padding_width, adaptive, output_data, + mask_data); } }; @@ -1053,7 +1172,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -1087,7 +1206,7 @@ class MaxPool3dWithIndexGradFunctor { nthreads, output_grad_data, mask_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, - stride_width, padding_depth, padding_height, padding_width, + stride_width, padding_depth, padding_height, padding_width, adaptive, input_grad_data); } }; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 923babd4c24..d123af8924b 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -102,7 +102,7 @@ class Pool2dFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* output); + bool exclusive, bool adaptive, framework::Tensor* output); }; template @@ -114,7 +114,7 @@ class Pool2dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* input_grad); + bool exclusive, bool adaptive, 
framework::Tensor* input_grad); }; template @@ -136,7 +136,7 @@ class Pool3dFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* output); + bool exclusive, bool adaptive, framework::Tensor* output); }; template @@ -148,7 +148,7 @@ class Pool3dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* input_grad); + bool exclusive, bool adaptive, framework::Tensor* input_grad); }; template @@ -176,8 +176,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask); + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask); }; template @@ -187,7 +187,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad); }; @@ -197,8 +197,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask); + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask); }; template @@ -208,7 +208,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad); }; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 52b607df744..11b5c493230 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -52,6 +52,7 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); bool ceil_mode = ctx->Attrs().Get("ceil_mode"); + bool adaptive = ctx->Attrs().Get("adaptive"); PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, "Pooling intput should be 4-D or 5-D tensor."); @@ -72,9 +73,13 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { "Paddings size and pooling size should be the same."); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i], ceil_mode)); + if (adaptive) { + output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); + } else { + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back(PoolOutputSize( + in_x_dims[i + 2], ksize[i], paddings[i], strides[i], ceil_mode)); + } } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->ShareLoD("X", "Out"); @@ -186,6 +191,14 @@ void Pool2dOpMaker::Make() { "averaging calculating, otherwise, include the zero-padding. Note, it " "is only used when pooling_type is avg. 
The default is True.") .SetDefault(true); + AddAttr( + "adaptive", + "(bool, default False) When true, adaptive pooling is performed instead: " + "the output shape in the H and W dimensions equals ksize, the input data " + "is divided evenly into grids specified by ksize, and pooling is performed " + "in each grid area to produce the output value.") + .SetDefault(false); + AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") @@ -325,6 +338,13 @@ void Pool3dOpMaker::Make() { "averaging calculating, otherwise, include the zero-padding. Note, it " "is only used when pooling_type is avg. The default is True.") .SetDefault(true); + AddAttr( + "adaptive", + "(bool, default False) When true, adaptive pooling is performed instead: " + "the output shape in the D, H and W dimensions equals ksize, the input data " + "is divided evenly into grids specified by ksize, and pooling is performed " + "in each grid area to produce the output value.") + .SetDefault(false); AddAttr( "use_cudnn", diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index c0594b7e3cc..6c5900bd0f5 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -70,6 +70,7 @@ class PoolKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -85,7 +86,7 @@ pool2d_forward; paddle::operators::math::MaxPool pool_process; pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - true, out); + true, false, out); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dFunctor< DeviceContext, paddle::operators::math::AvgPool, T> pool2d_forward; paddle::operators::math::AvgPool pool_process; pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - exclusive, out); + exclusive, adaptive, out); } } break; case 3: { paddle::operators::math::Pool3dFunctor< DeviceContext, paddle::operators::math::MaxPool, T> pool3d_forward; paddle::operators::math::MaxPool pool_process; pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - true, out); + true, false, out); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dFunctor< DeviceContext, paddle::operators::math::AvgPool, T> pool3d_forward; paddle::operators::math::AvgPool pool_process; pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - exclusive, out); + exclusive, adaptive, out); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } @@ -133,6 +134,7 @@ class PoolGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { @@ -159,7 +161,8 @@ pool2d_backward; paddle::operators::math::AvgPoolGrad pool_process; pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, pool_process, exclusive, in_x_grad); + paddings, pool_process, exclusive, adaptive, + in_x_grad); } } break; case 3: { @@ -174,7 +177,8 @@ pool3d_backward;
paddle::operators::math::AvgPoolGrad pool_process; pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, pool_process, exclusive, in_x_grad); + paddings, pool_process, exclusive, adaptive, + in_x_grad); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 873706593e4..f9e25277e5c 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -40,6 +40,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); + bool adaptive = ctx->Attrs().Get("adaptive"); PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, "Pooling intput should be 4-D or 5-D tensor."); @@ -60,9 +61,13 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { "Paddings size and pooling size should be the same."); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); + if (adaptive) { + output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); + } else { + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->SetOutputDim("Mask", framework::make_ddim(output_shape)); @@ -133,6 +138,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default:false) Whether to use the global pooling. " "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault(false); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling " + "instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); AddAttr>("strides", "(vector, default {1, 1}), strides(height, " "width) of pooling operator.") @@ -209,6 +222,14 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Whether to use the global pooling. 
" "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault(false); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling " + "instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); AddAttr>("strides", "(vector, default {1,1,1}), strides(depth, " "height, width) of pooling operator.") diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index b55fa76eae3..a6bec121d4f 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -36,6 +36,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + bool adaptive = context.Attr("adaptive"); auto& dev_ctx = context.template device_context(); if (context.Attr("global_pooling")) { @@ -50,13 +51,15 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { paddle::operators::math::MaxPool2dWithIndexFunctor pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, + mask); } break; case 3: { paddle::operators::math::MaxPool3dWithIndexFunctor pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, + mask); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } @@ -75,6 +78,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -93,14 +97,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { T1, T2> pool2d_backward; pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, in_x_grad); + paddings, adaptive, in_x_grad); } break; case 3: { paddle::operators::math::MaxPool3dWithIndexGradFunctor pool3d_backward; pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, in_x_grad); + paddings, adaptive, in_x_grad); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index 35d9737ee01..3c2d51ec911 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -56,13 +56,13 @@ class SppKernel : public framework::OpKernel { math::Pool2dFunctor, T> pool_forward; math::MaxPool max_process; pool_forward(context.template device_context(), *in_x, - kernel_size, strides, paddings, max_process, true, + kernel_size, strides, paddings, max_process, true, false, &out_level); } else if (pooling_type == "avg") { math::Pool2dFunctor, T> pool_forward; math::AvgPool avg_process; pool_forward(context.template device_context(), *in_x, - kernel_size, strides, paddings, avg_process, true, + kernel_size, strides, paddings, avg_process, true, false, &out_level); } // flatten pooling output shape @@ -156,7 +156,7 @@ class SppGradKernel : public framework::OpKernel { math::AvgPoolGrad avg_process; 
pool_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, - paddings, avg_process, true, in_x_grad); + paddings, avg_process, true, false, in_x_grad); } } } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e25eaaa9fda..61794f0d49a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -52,6 +52,8 @@ __all__ = [ 'softmax', 'pool2d', 'pool3d', + 'adaptive_pool2d', + 'adaptive_pool3d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', @@ -2499,6 +2501,190 @@ def pool3d(input, return pool_out +@templatedoc(op_type="pool2d") +def adaptive_pool2d(input, + pool_size, + pool_type="max", + require_index=False, + use_cudnn=True, + name=None): + """ + ${comment} + + Args: + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCHW, where N is batch size, C is + the number of channels, H is the height of the + feature, and W is the width of the feature. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (pool_size_Height, pool_size_Width). + pool_type: ${pooling_type_comment} + require_index (bool): If true, the index of max pooling point along with outputs. + it cannot be set in average pooling type. + use_cudnn (bool): ${use_cudnn_comment} + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The pooling result. + + Raises: + ValueError: 'pool_type' is not 'max' nor 'avg'. + ValueError: 'use_cudnn' is not a bool value. + ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. + ValueError: 'pool_size' should be a list or tuple with length as 2. + + Examples: + + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.pool2d( + input=data, + pool_size=[3, 3], + pool_type='max', + require_index=True) + """ + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + str(pool_type)) + + if pool_type == "avg" and require_index: + raise ValueError( + "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2: + raise ValueError( + "'pool_size' should be a list or tuple with length as 2.") + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False.") + + if pool_type == "max": + l_type = 'max_pool2d_with_index' + else: + l_type = "pool2d" + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + outputs = {"Out": pool_out} + if pool_type == "max": + mask = helper.create_variable_for_type_inference(dtype) + outputs["Mask"] = mask + + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "use_cudnn": use_cudnn, + "adaptive": True, + }) + + return pool_out + + +@templatedoc(op_type="pool3d") +def adaptive_pool3d(input, + pool_size, + pool_type="max", + require_index=False, + use_cudnn=True, + name=None): + """ + ${comment} + + Args: + input (Variable): The input tensor of pooling operator. 
The format of + input tensor is NCDHW, where N is batch size, C is + the number of channels, D is the depth of the + feature, H is the height of the feature, and W is the width of the feature. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain three integers, (Depth, Height, Width). + pool_type: ${pooling_type_comment} + require_index (bool): If true, the index of the max pooling point is returned along + with the output; it cannot be set when pool_type is 'avg'. + use_cudnn (bool): ${use_cudnn_comment} + name (str|None): A name for this layer (optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The pooling result. + + Raises: + ValueError: 'pool_type' is not 'max' nor 'avg'. + ValueError: 'use_cudnn' is not a bool value. + ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. + ValueError: 'pool_size' should be a list or tuple with length as 3. + + Examples: + + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32, 32], dtype='float32') + pool_out = fluid.layers.adaptive_pool3d( + input=data, + pool_size=[3, 3, 3], + pool_type='max', + require_index=True) + """ + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + str(pool_type)) + + if pool_type == "avg" and require_index: + raise ValueError( + "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3: + raise ValueError( + "'pool_size' should be a list or tuple with length as 3.") + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False.") + + if pool_type == "max": + l_type = 'max_pool3d_with_index' + else: + l_type = "pool3d" + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + outputs = {"Out": pool_out} + if pool_type == "max": + mask = helper.create_variable_for_type_inference(dtype) + outputs["Mask"] = mask + + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "use_cudnn": use_cudnn, + "adaptive": True, + }) + + return pool_out + + def batch_norm(input, act=None, is_test=False, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 10e8bb5a866..9785b5063cd 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -233,6 +233,28 @@ class TestBook(unittest.TestCase): pool_stride=[1, 2], pool_padding=(2, 1))) + def test_adaptive_pool2d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') + self.assertIsNotNone( + layers.adaptive_pool2d( + x, [3, 3], require_index=True)) + self.assertIsNotNone( + layers.adaptive_pool2d( + x, [3, 3], pool_type='avg')) + + def test_adaptive_pool3d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 244, 224, 224], dtype='float32') + self.assertIsNotNone( + layers.adaptive_pool3d( + x, [3, 3, 3], require_index=True)) + self.assertIsNotNone( + layers.adaptive_pool3d( + x, [3, 3, 3], pool_type='avg')) + def test_lstm_unit(self): program = Program() with program_guard(program): diff --git
a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 47b2e71a4e5..5ccdf082e8a 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +from __future__ import division import unittest import numpy as np @@ -21,29 +22,47 @@ import paddle.fluid.core as core from op_test import OpTest +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, H, W = x.shape if global_pool == 1: ksize = [H, W] - H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) @@ -56,27 +75,37 @@ def avg_pool2D_forward_naive(x, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, H, W = x.shape if global_pool == 1: ksize = [H, W] - H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = 
np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] - field_size = ((r_end - r_start) * (c_end - c_start)) if exclusive \ - else (ksize[0] * ksize[1]) + field_size = ((r_end - r_start) * (c_end - c_start)) \ + if (exclusive or adaptive) else (ksize[0] * ksize[1]) out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size return out @@ -93,12 +122,13 @@ class TestPool2D_Op(OpTest): self.init_pool_type() self.init_ceil_mode() self.init_exclusive() + self.init_adaptive() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype(self.dtype) output = self.pool2D_forward_naive( input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode, self.exclusive).astype(self.dtype) + self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { @@ -112,7 +142,8 @@ class TestPool2D_Op(OpTest): 'ceil_mode': self.ceil_mode, 'data_format': 'AnyLayout', # TODO(dzhwinter) : should be fix latter - 'exclusive': self.exclusive + 'exclusive': self.exclusive, + 'adaptive': self.adaptive } self.outputs = {'Out': output} @@ -159,6 +190,9 @@ class TestPool2D_Op(OpTest): def init_exclusive(self): self.exclusive = True + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestPool2D_Op): def init_test_case(self): @@ -315,5 +349,10 @@ class TestCUDNNAvgInclude(TestCase2): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index f05f8ccb398..47a5b2d1abe 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -13,6 +13,7 @@ # limitations under the License. 
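The adaptive_start_index/adaptive_end_index helpers introduced in these tests define how each adaptive output cell maps back to a window of input positions. As a standalone illustration (plain Python; the sizes 7 and 3 below are chosen only for demonstration and are not from the patch):

    import numpy as np

    def adaptive_start_index(index, input_size, output_size):
        return int(np.floor(index * input_size / output_size))

    def adaptive_end_index(index, input_size, output_size):
        return int(np.ceil((index + 1) * input_size / output_size))

    # Pooling 7 input positions down to 3 adaptive cells gives the windows
    # [0, 3), [2, 5) and [4, 7): every position is covered, and window sizes
    # differ when output_size does not divide input_size. That is why the
    # adaptive average branch divides by the actual field size, and why the
    # tests import `division` from __future__ (true division on Python 2).
    windows = [(adaptive_start_index(i, 7, 3), adaptive_end_index(i, 7, 3))
               for i in range(3)]
    assert windows == [(0, 3), (2, 5), (4, 7)]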
from __future__ import print_function +from __future__ import division import unittest import numpy as np @@ -21,35 +22,59 @@ import paddle.fluid.core as core from op_test import OpTest +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, D, H, W = x.shape if global_pool == 1: ksize = [D, H, W] - D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) // strides[2] + 1 if ceil_mode else ( - W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 + ) // strides[2] + 1 if ceil_mode else ( + W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) @@ -62,33 +87,49 @@ def avg_pool3D_forward_naive(x, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, D, H, W = x.shape if global_pool == 1: ksize = [D, H, W] - D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) // strides[2] + 1 if ceil_mode else ( - W - ksize[2] + 2 * paddings[2]) // 
strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 + ) // strides[2] + 1 if ceil_mode else ( + W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ - if exclusive else ksize[0] * ksize[1] * ksize[2] + if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2] out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size return out @@ -105,13 +146,14 @@ class TestPool3d_Op(OpTest): self.init_pool_type() self.init_ceil_mode() self.init_exclusive() + self.init_adaptive() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype(self.dtype) output = self.pool3D_forward_naive( input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode, self.exclusive).astype(self.dtype) + self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { @@ -124,7 +166,8 @@ class TestPool3d_Op(OpTest): 'ceil_mode': self.ceil_mode, 'data_format': 'AnyLayout', # TODO(dzhwinter) : should be fix latter - 'exclusive': self.exclusive + 'exclusive': self.exclusive, + 'adaptive': self.adaptive } self.outputs = {'Out': output} @@ -171,6 +214,9 @@ class TestPool3d_Op(OpTest): def init_exclusive(self): self.exclusive = True + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestPool3d_Op): def init_test_case(self): @@ -353,5 +399,10 @@ class TestCUDNNAvgInclude(TestCUDNNCase3): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py index 488ff431d4f..6575c408eea 100644 --- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_pool_max_op.py @@ -13,33 +13,62 @@ # limitations under the License. from __future__ import print_function +from __future__ import division import unittest import numpy as np from op_test import OpTest -def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False): +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def max_pool3D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=False, + adaptive=False): N, C, D, H, W = x.shape if global_pool: ksize = [D, H, W] paddings = [0, 0, 0] - D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) mask = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) @@ -58,23 +87,37 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False): return out, mask -def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False): +def max_pool2D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=False, + adaptive=False): N, C, H, W = x.shape if global_pool: ksize = [H, W] paddings = [0, 0] - H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) mask = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = 
adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) @@ -95,10 +138,12 @@ class TestMaxPoolWithIndex_Op(OpTest): def setUp(self): self.init_test_case() self.init_global() + self.init_adaptive() input = np.random.random(self.shape).astype("float32") output, mask = self.pool_forward_naive(input, self.ksize, self.strides, - self.paddings, self.global_pool) + self.paddings, self.global_pool, + self.adaptive) output = output.astype("float32") mask = mask.astype("int32") @@ -107,6 +152,7 @@ class TestMaxPoolWithIndex_Op(OpTest): 'paddings': self.paddings, 'ksize': self.ksize, 'global_pooling': self.global_pool, + 'adaptive': self.adaptive, } self.inputs = {'X': input} @@ -129,6 +175,9 @@ class TestMaxPoolWithIndex_Op(OpTest): def init_global(self): self.global_pool = False + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestMaxPoolWithIndex_Op): def init_global(self): @@ -190,5 +239,15 @@ class TestCase7(TestCase6): self.global_pool = False +class TestCastAdaptive2d(TestCase6): + def init_adaptive(self): + self.adaptive = True + + +class TestCastAdaptive3d(TestMaxPoolWithIndex_Op): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() -- GitLab From cf06e50f1d2b4ceca197e41c2c17a71783c5bc04 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 3 Dec 2018 20:04:49 +0800 Subject: [PATCH 0126/2367] add doc for adaptive pool. 
test=develop --- paddle/fluid/operators/pool_op.cc | 39 ++++++++++++++++++++ paddle/fluid/operators/pool_with_index_op.cc | 11 ++++++ 2 files changed, 50 insertions(+) diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 11b5c493230..a2f5f811abe 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -277,6 +277,14 @@ Example: Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} $$ + For adaptive = true: + $$ + hstart = floor(i * H_{in} / H_{out}) + hend = ceil((i + 1) * H_{in} / H_{out}) + wstart = floor(j * W_{in} / W_{out}) + wend = ceil((j + 1) * W_{in} / W_{out}) + Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + $$ )DOC"); } @@ -396,6 +404,37 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 $$ + For exclusive = true: + $$ + dstart = i * strides[0] - paddings[0] + dend = dstart + ksize[0] + hstart = j * strides[1] - paddings[1] + hend = hstart + ksize[1] + wstart = k * strides[2] - paddings[2] + wend = wstart + ksize[2] + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} + $$ + For exclusive = false: + $$ + dstart = max(0, i * strides[0] - paddings[0]) + dend = min(D, dstart + ksize[0]) + hstart = max(0, j * strides[1] - paddings[1]) + hend = min(H, hstart + ksize[1]) + wstart = max(0, k * strides[2] - paddings[2]) + wend = min(W, wstart + ksize[2]) + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + $$ + + For adaptive = true: + $$ + dstart = floor(i * D_{in} / D_{out}) + dend = ceil((i + 1) * D_{in} / D_{out}) + hstart = floor(j * H_{in} / H_{out}) + hend = ceil((j + 1) * H_{in} / H_{out}) + wstart = floor(k * W_{in} / W_{out}) + wend = ceil((k + 1) * W_{in} / W_{out}) + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + $$ )DOC"); } diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index f9e25277e5c..5354b485bda 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -182,6 +182,12 @@ Example: H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ + + For adaptive = true: + $$ + H_{out} = ksize[0] W_{out} = ksize[1] + $$ + )DOC"); } @@ -267,6 +273,11 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$ + + For adaptive = true: + $$ + D_{out} = ksize[0] H_{out} = ksize[1] W_{out} = ksize[2] + $$ )DOC"); } -- GitLab From a81fabd3273ae0cba9988da612dd5241aeec823f Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 11 Dec 2018 13:59:56 +0800 Subject: [PATCH 0127/2367] fix doc errors. 
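Worked through in one dimension, the adaptive average formula documented above reduces to the following (a minimal Python sketch; the input list is illustrative only):

    import math

    def adaptive_avg_1d(x, out_size):
        n = len(x)
        out = []
        for i in range(out_size):
            start = math.floor(i * n / out_size)     # hstart in the docs
            end = math.ceil((i + 1) * n / out_size)  # hend in the docs
            out.append(sum(x[start:end]) / (end - start))
        return out

    # Each half of the input is averaged separately:
    assert adaptive_avg_1d([1.0, 2.0, 3.0, 4.0], 2) == [1.5, 3.5]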
test=develop --- paddle/fluid/operators/math/pooling.cc | 70 +++---- paddle/fluid/operators/math/pooling.cu | 182 +++++++++--------- paddle/fluid/operators/math/pooling.h | 12 ++ python/paddle/fluid/layers/nn.py | 26 ++- .../fluid/tests/unittests/test_layers.py | 13 +- 5 files changed, 154 insertions(+), 149 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index b4ee82add31..30873e9f87f 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -19,16 +19,6 @@ namespace paddle { namespace operators { namespace math { -static inline int ADAPT_START_INDEX(int ph, int input_size, int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -static inline int ADAPT_END_INDEX(int ph, int input_size, int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - /* * All tensors are in NCHW format. * Ksize, strides, paddings are two elements. These two elements represent @@ -67,8 +57,8 @@ class Pool2dFunctor { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -76,8 +66,8 @@ class Pool2dFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -144,8 +134,8 @@ class Pool2dGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -153,8 +143,8 @@ class Pool2dGradFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -319,8 +309,8 @@ class Pool3dFunctor { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); } else { dstart = pd * stride_depth - padding_depth; dend = std::min(dstart + ksize_depth, input_depth); @@ -328,8 +318,8 @@ class Pool3dFunctor { } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = 
ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -337,8 +327,8 @@ class Pool3dFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -417,8 +407,8 @@ class Pool3dGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); } else { dstart = pd * stride_depth - padding_depth; dend = std::min(dstart + ksize_depth, input_depth); @@ -426,8 +416,8 @@ class Pool3dGradFunctor { } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -435,8 +425,8 @@ class Pool3dGradFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -615,8 +605,8 @@ class MaxPool2dWithIndexFunctor { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -624,8 +614,8 @@ class MaxPool2dWithIndexFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -753,8 +743,8 @@ class MaxPool3dWithIndexFunctor { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); } else { dstart = pd * 
stride_depth - padding_depth; dend = std::min(dstart + ksize_depth, input_depth); @@ -762,8 +752,8 @@ class MaxPool3dWithIndexFunctor { } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -771,8 +761,8 @@ class MaxPool3dWithIndexFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 5f3b82ed553..efce3f899a4 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -21,18 +21,6 @@ namespace paddle { namespace operators { namespace math { -__device__ __forceinline__ int ADAPT_START_INDEX(int ph, int input_size, - int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -__device__ __forceinline__ int ADAPT_END_INDEX(int ph, int input_size, - int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - template __global__ void KernelPool2D(const int nthreads, const T* input_data, const int channels, const int input_height, @@ -52,11 +40,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, int hstart, hend; int wstart, wend; if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { hstart = ph * stride_height - padding_height; hend = min(hstart + ksize_height, input_height); @@ -91,28 +79,29 @@ __global__ void KernelPool2DGrad( PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width + padding_width; - int offsetH = (index / input_width) % input_height + padding_height; + int w_offset = index % input_width + padding_width; + int h_offset = (index / input_width) % input_height + padding_height; int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; int phstart, phend; int pwstart, pwend; if (adaptive) { - phstart = offsetH * output_height / input_height; + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + 
pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { - phstart = (offsetH < ksize_height) + phstart = (h_offset < ksize_height) ? 0 - : (offsetH - ksize_height) / stride_height + 1; - pwstart = (offsetW < ksize_width) + : (h_offset - ksize_height) / stride_height + 1; + pwstart = (w_offset < ksize_width) ? 0 - : (offsetW - ksize_width) / stride_width + 1; - phend = min(offsetH / stride_height + 1, output_height); - pwend = min(offsetW / stride_width + 1, output_width); + : (w_offset - ksize_width) / stride_width + 1; + phend = min(h_offset / stride_height + 1, output_height); + pwend = min(w_offset / stride_width + 1, output_width); } T gradient = 0; T input = input_data[index]; @@ -414,14 +403,14 @@ __global__ void KernelPool3D( int hstart, hend; int wstart, wend; if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { dstart = pd * stride_depth - padding_depth; hstart = ph * stride_height - padding_height; @@ -464,9 +453,9 @@ __global__ void KernelPool3DGrad( bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width + padding_width; - int offsetH = (index / input_width) % input_height + padding_height; - int offsetD = + int w_offset = index % input_width + padding_width; + int h_offset = (index / input_width) % input_height + padding_height; + int d_offset = (index / input_width / input_height) % input_depth + padding_depth; int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; @@ -475,26 +464,28 @@ __global__ void KernelPool3DGrad( int phstart, phend; int pwstart, pwend; if (adaptive) { - pdstart = offsetD * output_depth / input_depth; - pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); - phstart = offsetH * output_height / input_height; + pdstart = d_offset * output_depth / input_depth; + pdend = + min((d_offset + 1) * output_depth / input_depth + 1, output_depth); + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { - pdstart = (offsetD < ksize_depth) + pdstart = (d_offset < ksize_depth) ? 0 - : (offsetD - ksize_depth) / stride_depth + 1; - phstart = (offsetH < ksize_height) + : (d_offset - ksize_depth) / stride_depth + 1; + phstart = (h_offset < ksize_height) ? 
0 - : (offsetH - ksize_height) / stride_height + 1; - pwstart = (offsetW < ksize_width) + : (h_offset - ksize_height) / stride_height + 1; + pwstart = (w_offset < ksize_width) ? 0 - : (offsetW - ksize_width) / stride_width + 1; - pdend = min((offsetD) / stride_depth + 1, output_depth); - phend = min((offsetH) / stride_height + 1, output_height); - pwend = min((offsetW) / stride_width + 1, output_width); + : (w_offset - ksize_width) / stride_width + 1; + pdend = min((d_offset) / stride_depth + 1, output_depth); + phend = min((h_offset) / stride_height + 1, output_height); + pwend = min((w_offset) / stride_width + 1, output_width); } T gradient = 0; @@ -795,11 +786,11 @@ __global__ void KernelMaxPool2dWithIdx( int hstart, hend; int wstart, wend; if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { hstart = ph * stride_height - padding_height; hend = min(hstart + ksize_height, input_height); @@ -837,35 +828,36 @@ __global__ void KernelMaxPool2DWithIdxGrad( T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width; - int offsetH = (index / input_width) % input_height; + int w_offset = index % input_width; + int h_offset = (index / input_width) % input_height; int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; int phstart, phend; int pwstart, pwend; if (adaptive) { - phstart = offsetH * output_height / input_height; + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { phstart = - (offsetH + padding_height < ksize_height) + (h_offset + padding_height < ksize_height) ? 0 - : (offsetH + padding_height - ksize_height) / stride_height + 1; + : (h_offset + padding_height - ksize_height) / stride_height + 1; pwstart = - (offsetW + padding_width < ksize_width) + (w_offset + padding_width < ksize_width) ? 
0 - : (offsetW + padding_width - ksize_width) / stride_width + 1; + : (w_offset + padding_width - ksize_width) / stride_width + 1; phend = - min((offsetH + padding_height) / stride_height + 1, output_height); - pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + min((h_offset + padding_height) / stride_height + 1, output_height); + pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } T1 gradient = 0; - int input_current_featuremap_idx = offsetH * input_width + offsetW; + int input_current_featuremap_idx = h_offset * input_width + w_offset; int output_idx = (batch_idx * channels + offsetC) * output_height * output_width; @@ -1000,14 +992,14 @@ __global__ void KernelMaxPool3DWithIdx( int hstart, hend; int wstart, wend; if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { dstart = pd * stride_depth - padding_depth; hstart = ph * stride_height - padding_height; @@ -1051,9 +1043,9 @@ __global__ void KernelMaxPool3DWithIdxGrad( const int padding_width, bool adaptive, T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width; - int offsetH = (index / input_width) % input_height; - int offsetD = (index / input_width / input_height) % input_depth; + int w_offset = index % input_width; + int h_offset = (index / input_width) % input_height; + int d_offset = (index / input_width / input_height) % input_depth; int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; @@ -1061,35 +1053,37 @@ __global__ void KernelMaxPool3DWithIdxGrad( int phstart, phend; int pwstart, pwend; if (adaptive) { - pdstart = offsetD * output_depth / input_depth; - pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); - phstart = offsetH * output_height / input_height; + pdstart = d_offset * output_depth / input_depth; + pdend = + min((d_offset + 1) * output_depth / input_depth + 1, output_depth); + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { pdstart = - (offsetD + padding_depth < ksize_depth) + (d_offset + padding_depth < ksize_depth) ? 0 - : (offsetD + padding_depth - ksize_depth) / stride_depth + 1; + : (d_offset + padding_depth - ksize_depth) / stride_depth + 1; phstart = - (offsetH + padding_height < ksize_height) + (h_offset + padding_height < ksize_height) ? 
0 - : (offsetH + padding_height - ksize_height) / stride_height + 1; + : (h_offset + padding_height - ksize_height) / stride_height + 1; pwstart = - (offsetW + padding_width < ksize_width) + (w_offset + padding_width < ksize_width) ? 0 - : (offsetW + padding_width - ksize_width) / stride_width + 1; - pdend = min((offsetD + padding_depth) / stride_depth + 1, output_depth); + : (w_offset + padding_width - ksize_width) / stride_width + 1; + pdend = min((d_offset + padding_depth) / stride_depth + 1, output_depth); phend = - min((offsetH + padding_height) / stride_height + 1, output_height); - pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + min((h_offset + padding_height) / stride_height + 1, output_height); + pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } T1 gradient = 0; int input_current_feature_map_idx = - (offsetD * input_height + offsetH) * input_width + offsetW; + (d_offset * input_height + h_offset) * input_width + w_offset; int output_idx = (batch_idx * channels + offsetC) * output_depth * output_height * output_width; mask += output_idx; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index d123af8924b..e1f8e6df1d1 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -68,6 +68,18 @@ class AvgPoolGrad { } }; +/* used for adaptive pool to calculate start and end index of each divided grid + */ +HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + /* * \brief Getting pooling results, and calculating gradient. * diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 61794f0d49a..07fc4ccc6bc 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2506,7 +2506,7 @@ def adaptive_pool2d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=True, + use_cudnn=False, name=None): """ ${comment} @@ -2521,7 +2521,7 @@ def adaptive_pool2d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool): ${use_cudnn_comment} + use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2531,6 +2531,7 @@ def adaptive_pool2d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. ValueError: 'use_cudnn' is not a bool value. + ValueError: adaptive pool currently not supported in cudnn. ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. ValueError: 'pool_size' should be a list or tuple with length as 2. 
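The adaptive branches of the backward kernels above invert this mapping: for a given input offset they compute phstart = h_offset * output_size / input_size and phend = min((h_offset + 1) * output_size / input_size + 1, output_size), the range of output cells whose window contains that offset. A quick check that this range matches the forward windows, for one choice of sizes (standalone Python; sizes illustrative):

    import math

    def forward_window(p, in_size, out_size):
        # AdaptStartIndex / AdaptEndIndex from pooling.h
        return (math.floor(p * in_size / out_size),
                math.ceil((p + 1) * in_size / out_size))

    def backward_cells(h, in_size, out_size):
        # mirrors phstart / phend in the adaptive grad kernels
        start = h * out_size // in_size
        end = min((h + 1) * out_size // in_size + 1, out_size)
        return set(range(start, end))

    in_size, out_size = 7, 3
    for h in range(in_size):
        covers = set()
        for p in range(out_size):
            lo, hi = forward_window(p, in_size, out_size)
            if lo <= h < hi:
                covers.add(p)
        assert covers == backward_cells(h, in_size, out_size)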
@@ -2540,11 +2541,11 @@ def adaptive_pool2d(input, data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') - conv2d = fluid.layers.pool2d( + pool_out = fluid.layers.adaptive_pool2d( input=data, pool_size=[3, 3], pool_type='max', - require_index=True) + require_index=False) """ if pool_type not in ["max", "avg"]: raise ValueError( @@ -2565,6 +2566,9 @@ def adaptive_pool2d(input, if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False.") + if use_cudnn: + raise ValueError("adaptive pool currently not supported in cudnn.") + if pool_type == "max": l_type = 'max_pool2d_with_index' else: @@ -2590,7 +2594,7 @@ def adaptive_pool2d(input, "adaptive": True, }) - return pool_out + return (pool_out, mask) if require_index else pool_out @templatedoc(op_type="pool3d") @@ -2598,7 +2602,7 @@ def adaptive_pool3d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=True, + use_cudnn=False, name=None): """ ${comment} @@ -2613,7 +2617,7 @@ def adaptive_pool3d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool): ${use_cudnn_comment} + use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2623,6 +2627,7 @@ def adaptive_pool3d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. ValueError: 'use_cudnn' is not a bool value. + ValueError: adaptive pool currently not supported in cudnn. ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. ValueError: 'pool_size' should be a list or tuple with length as 2. @@ -2632,7 +2637,7 @@ def adaptive_pool3d(input, data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') - conv2d = fluid.layers.pool2d( + pool_out, mask = fluid.layers.adaptive_pool3d( input=data, pool_size=[3, 3], pool_type='max', @@ -2657,6 +2662,9 @@ def adaptive_pool3d(input, if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False.") + if use_cudnn: + raise ValueError("adaptive pool currently not supported in cudnn.") + if pool_type == "max": l_type = 'max_pool3d_with_index' else: @@ -2682,7 +2690,7 @@ def adaptive_pool3d(input, "adaptive": True, }) - return pool_out + return (pool_out, mask) if require_index else pool_out def batch_norm(input, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 9785b5063cd..030bf012fa5 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -237,23 +237,24 @@ class TestBook(unittest.TestCase): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') - self.assertIsNotNone( - layers.adaptive_pool2d( - x, [3, 3], require_index=True)) self.assertIsNotNone( layers.adaptive_pool2d( x, [3, 3], pool_type='avg')) + pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_adaptive_pool3d(self): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 244, 224, 224], dtype='float32') - self.assertIsNotNone( - layers.adaptive_pool3d( - x, [3, 3, 3], require_index=True)) self.assertIsNotNone( layers.adaptive_pool3d( x, [3, 3, 3], pool_type='avg')) + pool, mask = 
layers.adaptive_pool3d( + x, [3, 3, 3], require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_lstm_unit(self): program = Program() -- GitLab From 1870262ba9a1f951e5181b7bbee1286d5bfa9dd3 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 11 Dec 2018 14:03:04 +0800 Subject: [PATCH 0128/2367] pserver should crash early whe has problem test=develop --- paddle/fluid/operators/distributed/brpc_client.cc | 2 +- paddle/fluid/operators/distributed/grpc_client.cc | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc index b394c678fb6..350969f74be 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -158,7 +158,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { std::shared_ptr c(new ChannelContext()); if (c->channel.Init(ep.c_str(), &options) != 0) { - LOG(ERROR) << "Fail to initialize channel"; + LOG(FATAL) << "Fail to initialize channel"; return nullptr; } diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 857214aa211..f14dfcdb238 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -390,8 +390,7 @@ void GRPCClient::Proceed() { VLOG(3) << c->GetVarHandlePtr()->String() << " process"; c->Process(); } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { - // FIXME(gongwb): parse error_details? - LOG(ERROR) << c->GetVarHandlePtr()->String() + LOG(FATAL) << c->GetVarHandlePtr()->String() << " meets grpc error, error_code:" << c->status_.error_code() << " error_message:" << c->status_.error_message() << " error_details:" << c->status_.error_details(); -- GitLab From 5e609069966e21e9db89d1dea80bf22b0d766e6a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Dec 2018 14:22:22 +0800 Subject: [PATCH 0129/2367] Fix compile error test=develop --- paddle/fluid/platform/cuda_helper_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index ee45afab93d..466bf90c63c 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -93,7 +93,7 @@ TEST(CudaAtomic, float16) { // unalignment of uint8 void TestUnalign(size_t num, const int shift_bit) { - PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2"); + ASSERT_EQ(num % 2, 0); float16 *in1, *in2, *out; float16 *d_in1, *d_in2; size_t size = sizeof(uint8_t) * (num + shift_bit); -- GitLab From 729684007d70cad38e9d34317748e3fedd477886 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 11 Dec 2018 14:44:02 +0800 Subject: [PATCH 0130/2367] stop server out of run from file --- python/paddle/fluid/async_executor.py | 14 +++++++------- python/paddle/fluid/distributed/ps_instance.py | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 76fdb5b0e26..787a6a6b9e1 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -86,7 +86,7 @@ class AsyncExecutor(object): scope = global_scope() self.executor = core.AsyncExecutor(scope, p) - self.instance = ps_instance.PaddlePSInstance("init_param", 1, 2) + self.instance = 
ps_instance.PaddlePSInstance(1, 2) def run(self, program, data_feed, filelist, thread_num, fetch, debug=False): """ @@ -151,10 +151,7 @@ class AsyncExecutor(object): self.executor.run_from_files(program_desc, data_feed.desc(), filelist, thread_num, fetch_var_names, debug) - self.instance.barrier_all() #worker do all things - if self.instance.is_first_worker(): - self.executor.stop_server() - self.instance.barrier_all() #sync + def config_distributed_nodes(self, dist_opt): @@ -167,8 +164,11 @@ class AsyncExecutor(object): def get_instance(self): return self.instance - #def stop_server(self): - # self.executor.stop_server() + def stop_server(self): + self.instance.barrier_all() #worker do all things + if self.instance.is_first_worker(): + self.executor.stop_server() + self.instance.barrier_all() #sync def init_server(self, dist_desc): self.executor.init_server(dist_desc, self.instance._rankid) diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index b4045327e1b..94e123c2ceb 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -5,9 +5,8 @@ import sys class PaddlePSInstance(object): - def __init__(self, init_param, server_worker_mode, proc_per_node): + def __init__(self, server_worker_mode, proc_per_node): self.dh = dist_helper.MPIHelper() - self._config = init_param self._rankid = self.dh.get_rank() self._server_worker_mode = server_worker_mode self._proc_per_node = proc_per_node -- GitLab From 900c789a35798cf73b67a2bb7b7944f3110c7bda Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 10 Dec 2018 12:00:46 +0000 Subject: [PATCH 0131/2367] use jitcode and use vmul --- paddle/fluid/operators/jit/gen/blas.cc | 26 ++++++++---- paddle/fluid/operators/jit/gen/blas.h | 22 +++++----- paddle/fluid/operators/jit/gen/jitcode.cc | 19 +-------- paddle/fluid/operators/jit/gen/jitcode.h | 10 ++--- paddle/fluid/operators/jit/gen_base.cc | 5 +++ paddle/fluid/operators/jit/gen_base.h | 51 +++++++++++++---------- paddle/fluid/operators/jit/kernel_pool.cc | 5 +++ paddle/fluid/operators/jit/kernel_pool.h | 49 ++++++++++++++++++---- paddle/fluid/operators/jit/registry.h | 25 ++++++++++- paddle/fluid/operators/jit/test.cc | 1 + 10 files changed, 137 insertions(+), 76 deletions(-) diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index 4a8b4554c8b..3e5ce540647 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/blas.h" #include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -103,17 +104,24 @@ void VXXJitCode::genCode() { ret(); } -} // namespace gen - -template <> -std::unique_ptr CreateJitCode(int attr) { - if (UseJitCode(attr)) { - return make_unique( - attr, CodeSize(attr)); +class VMulCreator : public JitCodeCreator { + public: + bool UseMe(const int& attr) const override { + return platform::MayIUse(platform::avx); } - return nullptr; -} + size_t CodeSize(const int& d) const override { + return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; + } + std::unique_ptr CreateJitCode(const int& attr) const override { + return make_unique(attr, CodeSize(attr)); + } +}; +} // namespace gen } // namespace jit } // namespace operators } // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(vmul, gen::VMulCreator); diff --git 
a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index edc05f86a03..60f32805678 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -25,7 +25,18 @@ namespace gen { // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) class VXXJitCode : public JitCode { public: - const char* name() const override { + explicit VXXJitCode(int d, operand_type type, int scalar_index, + bool with_relu, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), + num_(d), + type_(type), + scalar_index_(scalar_index), + with_relu_(with_relu) { + this->genCode(); + } + + virtual const char* name() const { std::string base = "VXXJitCode"; if (scalar_index_ == 1) { base += "_Scalar"; @@ -45,15 +56,6 @@ class VXXJitCode : public JitCode { base += (with_relu_ ? "_Relu" : ""); return base.c_str(); } - explicit VXXJitCode(int d, operand_type type, int scalar_index, - bool with_relu, size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), - num_(d), - type_(type), - scalar_index_(scalar_index), - with_relu_(with_relu) {} - // static bool init(int d, int scalar_index = 0); void genCode() override; private: diff --git a/paddle/fluid/operators/jit/gen/jitcode.cc b/paddle/fluid/operators/jit/gen/jitcode.cc index 93204d340e9..7aaf6a2ff65 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.cc +++ b/paddle/fluid/operators/jit/gen/jitcode.cc @@ -16,23 +16,6 @@ namespace paddle { namespace operators { -namespace jit { - -template <> -size_t GetKey(int d) { - return d; -} - -// template <> -// std::shared_ptr CreateJitCode(int attr) -// { -// if (UseJitCode(attr)) { -// return std::make_shared>(attr, -// CodeSize(attr))); -// } -// return nullptr; -// } - -} // namespace jit +namespace jit {} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 52b8da9a82a..caa3ef9dda7 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -70,9 +70,10 @@ typedef enum { class JitCode : public GenBase, public Xbyak::CodeGenerator { public: explicit JitCode(size_t code_size, void* code_ptr = nullptr) - : Xbyak::CodeGenerator(code_size, code_ptr) { - this->genCode(); - } + : Xbyak::CodeGenerator(code_size, code_ptr) {} + + virtual const char* name() const = 0; + virtual void genCode() = 0; size_t getSize() const override { return CodeGenerator::getSize(); } const unsigned char* getCodeInternal() override { @@ -80,9 +81,6 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { return code; } - virtual const char* name() const = 0; - virtual void genCode() = 0; - protected: Xbyak::Reg64 param1{abi_param1}; const int EVEX_max_8b_offt = 0x200; diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 310da0c76f1..a8bf9029637 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -23,6 +23,11 @@ namespace paddle { namespace operators { namespace jit { +template <> +size_t JitCodeKey(int d) { + return d; +} + // refer do not need useme, it would be the last one. 
void GenBase::dumpCode(const unsigned char* code) const { if (code) { diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 4a136534dca..3b874cf2b01 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -15,9 +15,8 @@ #pragma once #include -#include // for shared_ptr +#include // for unique_ptr #include "paddle/fluid/operators/jit/kernel_base.h" -#include "paddle/fluid/platform/macros.h" DECLARE_bool(dump_jitcode); @@ -25,29 +24,12 @@ namespace paddle { namespace operators { namespace jit { -// TODO(TJ): make these functions as virtual of a class - -// Every JitCode should estimate the code size itself -template -size_t CodeSize(Attr attr) { - return 4096; -} - -// Every JitCode should have a condition when to use this JitCode -template -bool UseJitCode(Attr attr) { - return false; -} - -// Every JitCode should have a method to get the key from attribution -template -size_t GetKey(Attr attr); - class GenBase : public Kernel { public: + virtual ~GenBase() = default; virtual const char* name() const = 0; - virtual const unsigned char* getCodeInternal() = 0; virtual size_t getSize() const = 0; + virtual const unsigned char* getCodeInternal() = 0; template const FUNC getCode() { const unsigned char* code = this->getCodeInternal(); @@ -61,8 +43,31 @@ class GenBase : public Kernel { void dumpCode(const unsigned char* code) const; }; -template -std::unique_ptr CreateJitCode(Attr attr); +// Every JitCode should have a method to get the key from attribution +template +size_t JitCodeKey(Attr attr); + +// Creator is used to creat the jitcode and save in pool. +// Every JitCode should have one creator. +class GenCreator { + public: + virtual ~GenCreator() = default; +}; + +template +class JitCodeCreator : public GenCreator { + public: + virtual ~JitCodeCreator() = default; + + // condition when this jit code can be used. 
+ virtual bool UseMe(const Attr& attr) const = 0; + + // estimate this code size + virtual size_t CodeSize(const Attr& attr) const = 0; + + // create this code + virtual std::unique_ptr CreateJitCode(const Attr& attr) const = 0; +}; } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/kernel_pool.cc b/paddle/fluid/operators/jit/kernel_pool.cc index f300d28a6f0..bc98c644fbe 100644 --- a/paddle/fluid/operators/jit/kernel_pool.cc +++ b/paddle/fluid/operators/jit/kernel_pool.cc @@ -21,6 +21,11 @@ namespace paddle { namespace operators { namespace jit { +JitCodeCreatorPool& JitCodeCreatorPool::Instance() { + static JitCodeCreatorPool g_creator_pool; + return g_creator_pool; +} + KernelPool& KernelPool::Instance() { static KernelPool g_kernel_pool; return g_kernel_pool; diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index 737b7f60e3c..c9e7fc84e51 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -14,7 +14,7 @@ #pragma once -#include // for shared_ptr +#include // for unique_ptr #include #include #include @@ -52,6 +52,28 @@ class JitCodePool { DISABLE_COPY_AND_ASSIGN(JitCodePool); }; +class JitCodeCreatorPool { + typedef std::unique_ptr GenCreatorPtr; + typedef std::unordered_map, + KernelKey::Hash> + GenCreatorPtrMap; + + public: + JitCodeCreatorPool() = default; + static JitCodeCreatorPool& Instance(); + GenCreatorPtrMap& AllCreators() { return creators_; } + void Insert(const KernelKey& key, GenCreatorPtr value) { + if (creators_.find(key) == creators_.end()) { + creators_.emplace(key, std::vector()); + } + creators_.at(key).emplace_back(std::move(value)); + } + + private: + GenCreatorPtrMap creators_; + DISABLE_COPY_AND_ASSIGN(JitCodeCreatorPool); +}; + typedef std::unique_ptr KernelPtr; typedef std::unordered_map, KernelKey::Hash> KernelMap; @@ -113,24 +135,33 @@ inline Func GetRefer() { template const Func Get(Attr attr) { - size_t key = GetKey(attr); + size_t key = JitCodeKey(attr); auto& codes = JitCodePool().Instance(); if (codes.Has(key)) { return codes.AllKernels().at(key)->template getCode(); } + KernelKey kkey(KT, PlaceType()); if (std::is_same::value) { - auto p = CreateJitCode(attr); - if (p) { - auto f = p->template getCode(); - codes.Insert(key, std::move(p)); - return f; + // pool: (KernelKey(type, place), vector) + auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); + auto iter = creator_map.find(kkey); + auto& creators = iter->second; + for (auto& cur : creators) { + auto i = dynamic_cast*>(cur.get()); + if (i && i->UseMe(attr)) { + auto p = i->CreateJitCode(attr); + if (p) { + auto f = p->template getCode(); + codes.Insert(key, std::move(p)); + return f; + } + } } } - // pool: (KernelKey(type, place), vector) + // pool: (KernelKey(type, place), vector) auto& pool = KernelPool().Instance().AllKernels(); - KernelKey kkey(KT, PlaceType()); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index c1f02d9cd57..cb32c487208 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -116,7 +116,30 @@ class JitKernelRegistrar { #define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) -// REGISTER_JITKERNEL_JITCODE(vmul, JitKernelCode); +#define REGISTER_JITKERNEL_GEN(kernel_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ + "REGISTER_JITKERNEL_GEN must be called in global namespace"); \ + extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ + TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static ::paddle::operators::jit::JitKernelRegistrar< \ + ::paddle::operators::jit::JitCodeCreatorPool, \ + ::paddle::platform::CPUPlace, __VA_ARGS__> \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ + ::paddle::operators::jit::KernelType::kernel_type); \ + int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ + return 0; \ + } + +#define USE_JITKERNEL_GEN(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ + "USE_JITKERNEL_GEN must be called in global namespace"); \ + extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ + static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ + TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() #define USE_JITKERNEL_REFER(kernel_type) \ STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 836b6eee800..5af9ed697d6 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -61,6 +61,7 @@ void ExpectEQ(const T* target, const T* refer, int n) { // TODO(TJ): remove me USE_JITKERNEL_MORE(vmul, mkl); USE_JITKERNEL_REFER(vmul); +USE_JITKERNEL_GEN(vmul); TEST(JitKernel, vmul) { using T = float; -- GitLab From d4cab7d94890c3bf43d20243ea9f21722b2738a3 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Dec 2018 07:30:53 +0000 Subject: [PATCH 0132/2367] use jitkernel in one file --- paddle/fluid/operators/jit/CMakeLists.txt | 13 +-- paddle/fluid/operators/jit/gen/CMakeLists.txt | 7 ++ paddle/fluid/operators/jit/gen/jitcode.cc | 21 ---- paddle/fluid/operators/jit/helper.h | 96 +++++++++++++++++++ paddle/fluid/operators/jit/kernel_base.h | 2 +- paddle/fluid/operators/jit/kernel_pool.h | 63 ------------ .../fluid/operators/jit/more/CMakeLists.txt | 3 + .../operators/jit/more/mkl/CMakeLists.txt | 3 + paddle/fluid/operators/jit/more/mkl/mkl.h | 4 +- .../fluid/operators/jit/refer/CMakeLists.txt | 7 ++ paddle/fluid/operators/jit/refer/refer.h | 4 +- paddle/fluid/operators/jit/test.cc | 18 +--- 12 files changed, 133 insertions(+), 108 deletions(-) delete mode 100644 paddle/fluid/operators/jit/gen/jitcode.cc create mode 100644 paddle/fluid/operators/jit/helper.h diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 77fd27666f2..26903e0e44e 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -1,16 +1,17 @@ -# set(use_jit_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h) -# file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") -# file(APPEND ${pass_file} "\#pragma once\n") -# file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") - +set(jit_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h) +file(WRITE ${jit_file} "// Generated by the paddle/fluid/operators/jit/CMakeLists.txt. 
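REGISTER_JITKERNEL_GEN plants a file-scope registrar whose constructor inserts the creator into JitCodeCreatorPool, plus a Touch function that USE_JITKERNEL_GEN references from other translation units so the linker cannot discard the object file holding the registration. The skeleton of that idiom, reduced to hypothetical names:

    #include <functional>
    #include <string>
    #include <unordered_map>

    std::unordered_map<std::string, std::function<void()>>& Registry() {
      static std::unordered_map<std::string, std::function<void()>> r;
      return r;
    }

    struct Registrar {
      Registrar(const std::string& name, std::function<void()> make) {
        Registry().emplace(name, std::move(make));  // runs before main()
      }
      void Touch() {}  // no-op; exists only to anchor the static object
    };

    static Registrar vmul_registrar("vmul", [] { /* construct kernel */ });

    // What a USE_*(vmul) expansion boils down to (normally in another
    // translation unit, with an extern declaration of the Touch function):
    int TouchVMul() { vmul_registrar.Touch(); return 0; }
    static int use_vmul = TouchVMul();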
DO NOT EDIT!\n\n") +file(APPEND ${jit_file} "\#pragma once\n") +file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n") +file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n") set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -list(REMOVE_ITEM jit_kernel_cc_srcs jit_test.cc) +list(REMOVE_ITEM jit_kernel_cc_srcs test.cc) cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) +# refer must go first add_subdirectory(refer) add_subdirectory(more) if(WITH_XBYAK) diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index c678ea33b8e..98d9231faa6 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -3,3 +3,10 @@ file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) + +function(USE_JITKERNEL_GEN TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") +endfunction() + +# use gen jitcode kernel by name +USE_JITKERNEL_GEN(vmul) diff --git a/paddle/fluid/operators/jit/gen/jitcode.cc b/paddle/fluid/operators/jit/gen/jitcode.cc deleted file mode 100644 index 7aaf6a2ff65..00000000000 --- a/paddle/fluid/operators/jit/gen/jitcode.cc +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "paddle/fluid/operators/jit/gen/jitcode.h" - -namespace paddle { -namespace operators { -namespace jit {} // namespace jit -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h new file mode 100644 index 00000000000..c8da960a1e1 --- /dev/null +++ b/paddle/fluid/operators/jit/helper.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
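The CMake functions introduced in this commit (USE_JITKERNEL_GEN here, with USE_JITKERNEL_MORE and USE_JITKERNEL_REFER added in the subdirectories below) each append one line to ${jit_file}, so the build aggregates every registration into a single generated header. Given only the calls visible in this patch series, the emitted kernels.h would plausibly read:

    // Generated by the paddle/fluid/operators/jit/CMakeLists.txt. DO NOT EDIT!

    #pragma once
    #include "paddle/fluid/operators/jit/helper.h"
    #include "paddle/fluid/operators/jit/registry.h"

    USE_JITKERNEL_GEN(vmul);
    USE_JITKERNEL_MORE(vmul, mkl);
    USE_JITKERNEL_REFER(vmul);

Including this one header is what lets test.cc drop its hand-written USE_JITKERNEL_* lines.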
*/ + +#pragma once + +#include // for unique_ptr +#include +#include +#include +#include "paddle/fluid/operators/jit/gen_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_key.h" +#include "paddle/fluid/operators/jit/kernel_pool.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace jit { + +// Refer code do not related with attr, and always on CPUPlace +template +inline Func GetRefer() { + auto& ref_pool = ReferKernelPool().Instance().AllKernels(); + KernelKey kkey(KT, platform::CPUPlace()); + auto ref_iter = ref_pool.find(kkey); + PADDLE_ENFORCE(ref_iter != ref_pool.end(), + "Every Kernel should have reference function."); + auto& ref_impls = ref_iter->second; + for (auto& impl : ref_impls) { + auto i = dynamic_cast*>(impl.get()); + if (i) { + return i->GetFunc(); + } + } + return nullptr; +} + +template +const Func Get(Attr attr) { + size_t key = JitCodeKey(attr); + auto& codes = JitCodePool().Instance(); + if (codes.Has(key)) { + return codes.AllKernels().at(key)->template getCode(); + } + + KernelKey kkey(KT, PlaceType()); + if (std::is_same::value) { + // pool: (KernelKey(type, place), vector) + auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); + auto iter = creator_map.find(kkey); + auto& creators = iter->second; + for (auto& cur : creators) { + auto i = dynamic_cast*>(cur.get()); + if (i && i->UseMe(attr)) { + auto p = i->CreateJitCode(attr); + if (p) { + auto f = p->template getCode(); + codes.Insert(key, std::move(p)); + return f; + } + } + } + } + + // pool: (KernelKey(type, place), vector) + auto& pool = KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast*>(impl.get()); + if (i && i->UseMe(attr)) { + return i->GetFunc(); + } + } + } + + // The last implementation should be reference function on CPUPlace. 
+ return GetRefer(); +} + +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 6a789c52c37..df7be6ab8ec 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -22,7 +22,7 @@ namespace jit { typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; template -struct VMulTypes { +struct VMulTuples { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int); diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index c9e7fc84e51..3e15242af28 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -114,69 +114,6 @@ class ReferKernelPool { DISABLE_COPY_AND_ASSIGN(ReferKernelPool); }; -// Refer code do not related with attr, and always on CPUPlace -template -inline Func GetRefer() { - auto& ref_pool = ReferKernelPool().Instance().AllKernels(); - KernelKey kkey(KT, platform::CPUPlace()); - auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); - auto& ref_impls = ref_iter->second; - for (auto& impl : ref_impls) { - auto i = dynamic_cast*>(impl.get()); - if (i) { - return i->GetFunc(); - } - } - return nullptr; -} - -template -const Func Get(Attr attr) { - size_t key = JitCodeKey(attr); - auto& codes = JitCodePool().Instance(); - if (codes.Has(key)) { - return codes.AllKernels().at(key)->template getCode(); - } - - KernelKey kkey(KT, PlaceType()); - if (std::is_same::value) { - // pool: (KernelKey(type, place), vector) - auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); - auto iter = creator_map.find(kkey); - auto& creators = iter->second; - for (auto& cur : creators) { - auto i = dynamic_cast*>(cur.get()); - if (i && i->UseMe(attr)) { - auto p = i->CreateJitCode(attr); - if (p) { - auto f = p->template getCode(); - codes.Insert(key, std::move(p)); - return f; - } - } - } - } - - // pool: (KernelKey(type, place), vector) - auto& pool = KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - return i->GetFunc(); - } - } - } - - // The last implementation should be reference function on CPUPlace. 
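Taken together, helper.h gives Get<> a fixed resolution order: a JIT code already cached under JitCodeKey(attr), then any registered JitCodeCreator whose UseMe accepts the attribute, then a platform-specific kernel from KernelPool, and finally the reference kernel, which GetRefer insists must exist. A caller-side fragment, assuming the headers from this series are available:

    #include "paddle/fluid/operators/jit/kernels.h"  // generated aggregate header

    namespace jit = paddle::operators::jit;

    void VMulExample() {
      using T = float;
      const int d = 8;
      T x[8] = {1}, y[8] = {2}, z[8] = {0};
      auto vmul = jit::Get<jit::vmul, jit::VMulTuples<T>::func_type,
                           jit::VMulTuples<T>::attr_type,
                           paddle::platform::CPUPlace>(d);
      vmul(x, y, z, d);  // z[i] = x[i] * y[i]
    }

Note VMulTuples is the name after the kernel_base.h rename in this same commit; earlier snapshots spell it VMulTypes.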
- return GetRefer(); -} - } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/more/CMakeLists.txt b/paddle/fluid/operators/jit/more/CMakeLists.txt index 84f1811ced2..5bb78b93045 100644 --- a/paddle/fluid/operators/jit/more/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/CMakeLists.txt @@ -1,4 +1,7 @@ +function(USE_JITKERNEL_MORE TARGET TYPE) + file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n") +endfunction() if(WITH_MKLML) add_subdirectory(mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 94d2487866b..0c15c7060d2 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -1,3 +1,6 @@ cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) + +# use mkl kernels by name and type +USE_JITKERNEL_MORE(vmul, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 45cfec1c477..c0f738ccebe 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -28,8 +28,8 @@ template void VMul(const T* x, const T* y, T* z, int n); template -class VMulKernel : public KernelImpl::func_type, - typename VMulTypes::attr_type> { +class VMulKernel : public KernelImpl::func_type, + typename VMulTuples::attr_type> { public: VMulKernel() { this->func = VMul; } bool UseMe(int d) const override { diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 8c116e42dc6..b6ff80d03df 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -1,3 +1,10 @@ cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) + +function(USE_JITKERNEL_REFER TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") +endfunction() + +# use refer kernel by name +USE_JITKERNEL_REFER(vmul) diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 76a663633d1..97aa5de8fcf 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -29,8 +29,8 @@ void VMul(const T* x, const T* y, T* z, int n) { } template -class VMulKernel : public ReferKernel::func_type, - typename VMulTypes::attr_type> { +class VMulKernel : public ReferKernel::func_type, + typename VMulTuples::attr_type> { public: VMulKernel() { this->func = VMul; } }; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 5af9ed697d6..e531ba1a2c4 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -19,10 +19,7 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/jit/kernel_pool.h" -// TODO(TJ): remove me -#include "paddle/fluid/operators/jit/registry.h" - +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/port.h" @@ -58,11 +55,6 @@ void ExpectEQ(const T* target, const T* refer, int n) { } } -// TODO(TJ): remove me -USE_JITKERNEL_MORE(vmul, mkl); -USE_JITKERNEL_REFER(vmul); -USE_JITKERNEL_GEN(vmul); - TEST(JitKernel, vmul) { using T = float; using PlaceType = paddle::platform::CPUPlace; @@ 
-70,10 +62,10 @@ TEST(JitKernel, vmul) { namespace jit = paddle::operators::jit; // TODO(TJ): test more vector size for (int d = 1; d < 30; ++d) { - auto ref = jit::GetRefer::func_type, - jit::VMulTypes::attr_type>(); - auto tgt = jit::Get::func_type, - jit::VMulTypes::attr_type, PlaceType>(d); + auto ref = jit::GetRefer::func_type, + jit::VMulTuples::attr_type>(); + auto tgt = jit::Get::func_type, + jit::VMulTuples::attr_type, PlaceType>(d); EXPECT_TRUE(ref != nullptr); EXPECT_TRUE(tgt != nullptr); -- GitLab From 9f8d86858ea6156422c1cf2c59fdb516cb3f7868 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Dec 2018 10:43:50 +0800 Subject: [PATCH 0133/2367] Revert data_type test=develop --- paddle/fluid/framework/data_type.cc | 15 +++++++-------- paddle/fluid/framework/parallel_executor.cc | 3 +++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 1c29a89bffa..28f3da88fa1 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/framework/data_type.h" #include -#include #include #include @@ -24,10 +23,10 @@ namespace paddle { namespace framework { struct DataTypeMap { - std::map cpp_to_proto_; + std::unordered_map cpp_to_proto_; std::unordered_map proto_to_cpp_; std::unordered_map proto_to_str_; - std::map cpp_to_size_; + std::unordered_map cpp_to_size_; }; static DataTypeMap* InitDataTypeMap(); @@ -44,9 +43,9 @@ static inline void RegisterType(DataTypeMap* map, proto::VarType::Type proto_type, const std::string& name) { map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); - map->cpp_to_proto_.emplace(typeid(T).name(), proto_type); + map->cpp_to_proto_.emplace(typeid(T), proto_type); map->proto_to_str_.emplace(static_cast(proto_type), name); - map->cpp_to_size_.emplace(typeid(T).name(), sizeof(T)); + map->cpp_to_size_.emplace(typeid(T), sizeof(T)); } static DataTypeMap* InitDataTypeMap() { @@ -72,7 +71,7 @@ static DataTypeMap* InitDataTypeMap() { } proto::VarType::Type ToDataType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_proto_.find(type.name()); + auto it = gDataTypeMap().cpp_to_proto_.find(type); if (it != gDataTypeMap().cpp_to_proto_.end()) { return it->second; } @@ -98,8 +97,8 @@ std::string DataTypeToString(const proto::VarType::Type type) { } size_t SizeOfType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_size_.find(type.name()); - if (LIKELY(it != gDataTypeMap().cpp_to_size_.end())) { + auto it = gDataTypeMap().cpp_to_size_.find(type); + if (it != gDataTypeMap().cpp_to_size_.end()) { return it->second; } PADDLE_THROW("Not support %s as tensor type", type.name()); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 0636b89048f..28a4b14b27b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -39,8 +39,11 @@ DEFINE_string(pe_profile_fname, "", namespace paddle { namespace framework { + static std::once_flag gProfileOnce; +#ifdef WITH_GPERFTOOLS static bool gProfileStarted = false; +#endif class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) -- GitLab From 92daace55ca25aa9091d402560c90654c0cf183d Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Mon, 10 Dec 2018 15:03:04 +0100 Subject: [PATCH 0134/2367] MKL-DNN Concat: Fix segfault related to referencing deleter memory primitive test=develop --- 
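The data_type.cc revert restores std::type_index itself as the lookup key (the standard library provides std::hash<std::type_index>), rather than the typeid(T).name() strings in ordered maps that the debugging commit had introduced. In isolation:

    #include <cassert>
    #include <cstddef>
    #include <typeindex>
    #include <typeinfo>
    #include <unordered_map>

    int main() {
      // Mirrors the shape of cpp_to_size_ above.
      std::unordered_map<std::type_index, std::size_t> size_of;
      size_of.emplace(std::type_index(typeid(float)), sizeof(float));
      size_of.emplace(std::type_index(typeid(double)), sizeof(double));
      assert(size_of.at(std::type_index(typeid(float))) == sizeof(float));
    }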
paddle/fluid/operators/concat_mkldnn_op.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/concat_mkldnn_op.cc index b8456aac9da..7ad674056f0 100644 --- a/paddle/fluid/operators/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/concat_mkldnn_op.cc @@ -80,8 +80,8 @@ class ConcatPrimitiveFactory { concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd, Tensor* output, platform::CPUPlace place) { CreateSourcePrimitiveAts(); - auto dst_mem = CreateDstMemory(concat_pd, output, place); - return concat(concat_pd, inputs, dst_mem); + dst_mem = CreateDstMemory(concat_pd, output, place); + return concat(concat_pd, inputs, dst_mem.get()); } private: @@ -118,7 +118,8 @@ class ConcatPrimitiveFactory { std::vector srcs_pd; std::vector srcs; std::vector inputs; -}; + boost::optional dst_mem; // TODO(mgallus): change to std::optional +}; // upon introduction of C++17 to paddle template class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { -- GitLab From 57ac412b98990ac1d946ad32de30b07a15d0a18f Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 11 Dec 2018 17:48:25 +0800 Subject: [PATCH 0135/2367] download data --- python/paddle/fluid/async_executor.py | 22 ++++++++++++++++++- python/paddle/fluid/contrib/utils/__init__.py | 4 ++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 787a6a6b9e1..cce7ec5cca1 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -25,6 +25,7 @@ from google.protobuf import text_format from . import io from .data_feed_desc import DataFeedDesc from .distributed import ps_instance +from .contrib.utils import hdfs_utils as hdfs __all__ = ['AsyncExecutor'] @@ -152,6 +153,22 @@ class AsyncExecutor(object): data_feed.desc(), filelist, thread_num, fetch_var_names, debug) + def download_data(self, afs_path, local_path, fs_default_name, ugi, process_num=12): + hadoop_home = "$HADOOP_HOME" + + configs = { + "fs.default.name": fs_default_name, + "hadoop.job.ugi": ugi + } + + client = hdfs.HDFSClient(hadoop_home, configs) + downloads = hdfs.multi_download( + client, + afs_path, + local_path, + self.instance.get_worker_index(), + self.instance.get_node_cnt() / 2, + multi_processes=process_num) def config_distributed_nodes(self, dist_opt): @@ -179,10 +196,11 @@ class AsyncExecutor(object): self.executor.gather_servers(ips, self.instance.get_node_cnt()) self.instance.barrier_all() #wait all worker start self.instance.barrier_all() #wait init model + self.instance.barrier_all() #wait for download_data self.instance.barrier_all() #wait worker do all things self.instance.barrier_all() #sync - def init_worker(self, dist_desc): + def init_worker(self, dist_desc, afs_path, local_path, fs_default_name, ugi): self.instance.barrier_all() #wait all server start ips = self.instance.gather_ips() self.executor.init_worker(dist_desc, ips, self.instance.get_node_cnt(), self.instance._rankid) @@ -190,6 +208,8 @@ class AsyncExecutor(object): if self.instance.is_first_worker(): self.executor.init_model() self.instance.barrier_all() #wait init model + self.download_data(afs_path, local_path, fs_default_name, ugi, process_num=12) + self.instance.barrier_all() #wait for download_data def init_model(self): self.executor.init_model() diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py index 
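The segfault fixed in the MKL-DNN concat patch above came from CreateConcatPrimitive handing the concat primitive a reference to a local dst memory object that was destroyed when the factory method returned; storing it in a boost::optional member (std::optional once C++17 is allowed) keeps it alive as long as the factory. The shape of the bug and the fix, reduced to plain C++ with placeholder types:

    #include <boost/optional.hpp>  // assumed available, as in the patch

    struct Memory { /* stands in for mkldnn::memory */ };

    struct Primitive {
      explicit Primitive(const Memory& dst) : dst_(&dst) {}
      const Memory* dst_;  // non-owning: dst must outlive the primitive
    };

    class Factory {
     public:
      Primitive CreateBroken() {
        Memory dst;             // local object...
        return Primitive(dst);  // ...so the returned primitive dangles
      }
      Primitive CreateFixed() {
        dst_mem_ = Memory();               // kept alive by the factory
        return Primitive(dst_mem_.get());  // boost::optional::get() -> Memory&
      }

     private:
      boost::optional<Memory> dst_mem_;
    };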
6e479bdc2b9..2fe9f702f3d 100644 --- a/python/paddle/fluid/contrib/utils/__init__.py +++ b/python/paddle/fluid/contrib/utils/__init__.py @@ -13,8 +13,8 @@ # limitations under the License. from __future__ import print_function -from . import lookup_table_utils -from .lookup_table_utils import * +#from . import lookup_table_utils +#from .lookup_table_utils import * from . import hdfs_utils from .hdfs_utils import * -- GitLab From 570338699b2038b802e9d49ea80efc916416477a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 11 Dec 2018 18:29:16 +0800 Subject: [PATCH 0136/2367] Add debug info --- .../details/computation_op_handle.cc | 45 ++++- .../fast_threaded_ssa_graph_executor.cc | 1 + .../fluid/framework/details/op_handle_base.cc | 2 +- paddle/fluid/framework/operator.cc | 160 +++++++++++------- paddle/fluid/framework/scope.cc | 37 ++-- .../operators/elementwise/elementwise_op.h | 69 ++++---- paddle/fluid/operators/optimizers/adam_op.cc | 79 ++++----- python/paddle/fluid/profiler.py | 3 +- 8 files changed, 239 insertions(+), 157 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c600..90030334383 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -26,17 +26,46 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, scope_(scope), place_(place) {} +struct RecordTime { + RecordTime(const std::string &name, const std::string &type) + : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} + + ~RecordTime() { + if (type_ == "elementsize_add") { + end_ = std::chrono::system_clock::now(); + std::chrono::duration diff = end_ - start_; + VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); + } + } + + std::string name_; + std::string type_; + std::chrono::system_clock::time_point start_; + std::chrono::system_clock::time_point end_; +}; + void ComputationOpHandle::RunImpl() { - WaitInputVarGenerated(place_); + { + RecordTime rt("ComputationOpHandle::RunImpl", "Wait"); + WaitInputVarGenerated(place_); + } + + Scope *scope = nullptr; + { + RecordTime rt("ComputationOpHandle::RunImpl", "PrepareScope"); + scope = scope_->FindVar(kLocalExecScopeName)->Get(); + } + + { + RecordTime rt("ComputationOpHandle::RunImpl", "ReallyRun " + op_->Type()); - auto run_func = [this]() { - op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); - }; + auto run_func = [this, scope]() { op_->Run(*scope, place_); }; - if (is_lock_and_record_event_free_) { - run_func(); - } else { - this->RunAndRecordEvent(run_func); + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); + } } } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 949510e0370..872bc5d654c 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ClearFetchOp(graph_.get(), &fetch_ops); return fetches; } + void FastThreadedSSAGraphExecutor::RunOpAsync( std::unordered_map> *op_deps, OpHandleBase *op, diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4822627ac3b..5997f12ffab 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ 
b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,7 +41,7 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_cuda) { + if (events_.empty() && use_cuda && !dev_ctxes_.empty()) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f7..b8adce4edf1 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -701,85 +701,125 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, this->InferShape(&infer_shape_ctx); } +struct RecordTime { + RecordTime(const std::string& name, const std::string& type) + : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} + + void inline stop() { + end_ = std::chrono::system_clock::now(); + std::chrono::duration diff = end_ - start_; + VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); + } + + ~RecordTime() { + if (type_ == "elementwise_add") { + stop(); + } + // stop(); + } + + std::string name_; + std::string type_; + std::chrono::system_clock::time_point start_; + std::chrono::system_clock::time_point end_; +}; + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - - // check if op[type] has kernel registered. - auto& all_op_kernels = AllOpKernels(); - auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", type_); + RecordTime rt("OperatorWithKernel::All", type_); + { + RecordTime rt("OperatorWithKernel::InferShape", type_); + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); } - OpKernelMap& kernels = kernels_iter->second; + { + RecordTime* rt_1 = new RecordTime("OperatorWithKernel::Compute1", type_); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); - // TODO(dzhwinter) : kernel fallback mechanism will be added when all the - // transform functions are ready. + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + type_); + } - // for (auto& candidate : kKernelPriority) { - // Do selection - // } + OpKernelMap& kernels = kernels_iter->second; - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. 
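RecordTime, used throughout this debugging commit, is an RAII stopwatch: construction samples the clock and the destructor (or an explicit stop) logs the elapsed wall time when the guarded scope ends, here filtered to the elementwise-add type under investigation. The core pattern in standard C++ (using steady_clock, the usual choice for intervals, where the patch uses system_clock):

    #include <chrono>
    #include <iostream>
    #include <string>

    struct ScopedTimer {  // hypothetical name; the patch calls it RecordTime
      explicit ScopedTimer(std::string label)
          : label_(std::move(label)),
            start_(std::chrono::steady_clock::now()) {}
      ~ScopedTimer() {
        std::chrono::duration<double> diff =
            std::chrono::steady_clock::now() - start_;
        std::cout << label_ << " took " << diff.count() << "s\n";
      }
      std::string label_;
      std::chrono::steady_clock::time_point start_;
    };

    void Work() {
      ScopedTimer t("Work");  // logs automatically when Work() returns
      // ... timed region ...
    }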
- auto kernel_iter = kernels.find(expected_kernel_key); + // for (auto& candidate : kKernelPriority) { + // Do selection + // } + + auto expected_kernel_key = + this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN - // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set - if (kernel_iter == kernels.end() && - expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; - expected_kernel_key.library_type_ = LibraryType::kPlain; - expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; - kernel_iter = kernels.find(expected_kernel_key); - } + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } #endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", type_, - KernelTypeToString(expected_kernel_key)); - } + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } - // do data transformScope &transfer_scope; - std::vector transfered_inplace_vars; - auto* transfer_scope = - TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + // do data transformScope &transfer_scope; + std::vector transfered_inplace_vars; + Scope* transfer_scope = nullptr; + // auto* transfer_scope = + // TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); - // exec scope is the scope that kernel actually executed on. - const Scope& exec_scope = - (transfer_scope == nullptr ? scope : *transfer_scope); + // exec scope is the scope that kernel actually executed on. + const Scope& exec_scope = scope; + // const Scope& exec_scope = + // (transfer_scope == nullptr ? scope : *transfer_scope); - if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { - dev_ctx = pool.Get(expected_kernel_key.place_); - } + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); + } + delete rt_1; - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + RecordTime* rt_2 = new RecordTime("OperatorWithKernel::Compute2", type_); + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + delete rt_2; - if (!transfered_inplace_vars.empty()) { - // there is inplace variable has been transfered. - TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); - } + RecordTime* rt_3 = new RecordTime("OperatorWithKernel::Compute3", type_); + if (!transfered_inplace_vars.empty()) { + // there is inplace variable has been transfered. 
+ TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); + } - /*For profiling/benchmark only*/ - if (FLAGS_benchmark) { - dev_ctx->Wait(); - } + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + dev_ctx->Wait(); + } - if (FLAGS_check_nan_inf) { - for (auto& vname : OutputVars(true)) { - auto* var = exec_scope.FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get()); - } else if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get().value()); + if (FLAGS_check_nan_inf) { + for (auto& vname : OutputVars(true)) { + auto* var = exec_scope.FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(vname, + var->Get().value()); + } } } + delete rt_3; } } void OperatorWithKernel::TransferInplaceVarsBack( diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7ccc..61416676d63 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -43,9 +43,16 @@ DEFINE_double( // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. #ifdef PADDLE_ON_INFERENCE -#define SCOPE_LOCK_GUARD +#define SCOPE_READER_LOCK +#define SCOPE_WRITER_LOCK #else -#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); +// TODO(minqiyang): use reader lock and writer lock in all platforms +#define SCOPE_READER_LOCK +#define SCOPE_WRITER_LOCK +// #define SCOPE_READER_LOCK boost::shared_lock +// lock(mutex_); +// #define SCOPE_WRITER_LOCK boost::unique_lock +// lock(mutex_); #endif namespace paddle { @@ -61,18 +68,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK return VarInternal(name); } Variable* Scope::Var(std::string* name) { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -81,34 +88,34 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -118,7 +125,7 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -132,7 +139,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - 
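The commented-out macro bodies in the scope.cc hunk below show where this is headed: shared (reader) locks for lookups such as FindVar and exclusive (writer) locks for mutations such as Var and DropKids, instead of one std::lock_guard serializing everything. With boost::shared_mutex, or std::shared_mutex from C++17, the split looks like:

    #include <mutex>
    #include <shared_mutex>  // C++17; boost::shared_mutex is the pre-17 spelling
    #include <string>
    #include <unordered_map>

    class Scope {
     public:
      int* Find(const std::string& name) {  // readers may overlap freely
        std::shared_lock<std::shared_mutex> lock(mutex_);
        auto it = vars_.find(name);
        return it == vars_.end() ? nullptr : &it->second;
      }
      void Insert(const std::string& name, int v) {  // writers are exclusive
        std::unique_lock<std::shared_mutex> lock(mutex_);
        vars_[name] = v;
      }

     private:
      std::shared_mutex mutex_;
      std::unordered_map<std::string, int> vars_;
    };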
SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -145,12 +152,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 87bf7c6b156..181baac870a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -33,34 +33,37 @@ class ElementwiseOp : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), - "Input(Y) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of elementwise op should not be null."); - - PADDLE_ENFORCE( - ctx->GetInputsVarType("Y").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s [%s]", - ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front()); - - if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::LOD_TENSOR) { - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input."); - } else if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::SELECTED_ROWS) { - PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && - (ctx->GetInputDim("Y")[0] == 1), - "For elementwise_op, if X is Sparse, " - "Y must be scalar."); - } else { - PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - ctx->GetInputsVarType("X").front()); + if (!ctx->IsRuntime()) { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of elementwise op should not be null."); + + PADDLE_ENFORCE(ctx->GetInputsVarType("Y").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the " + "received is %s [%s]", + ctx->GetInputsVarType("Y").front(), + ctx->Inputs("Y").front()); + + if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::LOD_TENSOR) { + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + } else if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS) { + PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && + (ctx->GetInputDim("Y")[0] == 1), + "For elementwise_op, if X is Sparse, " + "Y must be scalar."); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + ctx->GetInputsVarType("X").front()); + } } ctx->ShareDim("X", /*->*/ "Out"); @@ -125,7 +128,7 @@ The equation is: $$%s$$ -- $X$: a 
tensor of any dimension. +- $X$: a tensor of any dimension. - $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. There are two cases for this operator: @@ -135,10 +138,10 @@ There are two cases for this operator: For case 2: -1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index - for broadcasting $Y$ onto $X$. +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. 2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. -3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of subsequence, such as shape(Y) = (2, 1) => (2). For example: @@ -152,7 +155,7 @@ For example: shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 -The inputs $X$ and $Y$ can carry the different LoD information. +The inputs $X$ and $Y$ can carry the different LoD information. But the output only shares the LoD information with the input $X$. )DOC", diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 5710cda39ac..bc1b20321f1 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -23,56 +23,57 @@ class AdamOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment1"), - "Input(Moment1) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment2"), - "Input(Moment2) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), - "Input(Beta1Pow) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), - "Input(Beta2Pow) of AdamOp should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), - "Output(Moment1Out) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), - "Output(Moment2Out) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Param"), + // "Input(Param) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Grad"), + // "Input(Grad) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Moment1"), + // "Input(Moment1) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Moment2"), + // "Input(Moment2) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + // "Input(LearningRate) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + // "Input(Beta1Pow) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + // "Input(Beta2Pow) of AdamOp should not be null."); + + // PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + // "Output(ParamOut) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + // "Output(Moment1Out) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + // "Output(Moment2Out) of AdamOp 
should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning rate should have 1 dimension"); + // PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + // "Learning rate should have 1 dimension"); auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, - "Beta1 power accumulator should have 1 dimension"); + // PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + // "Beta1 power accumulator should have 1 dimension"); auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); - PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, - "Beta2 power accumulator should have 1 dimension"); + // PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + // "Beta2 power accumulator should have 1 dimension"); auto param_dims = ctx->GetInputDim("Param"); - if (ctx->GetInputsVarType("Grad")[0] == - framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of AdamOp should have same dimension"); - } - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment1"), - "Param and Moment1 input of AdamOp should have same dimension"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment2"), - "Param and Moment2 input of AdamOp should have same dimension"); + // if (ctx->GetInputsVarType("Grad")[0] == + // framework::proto::VarType::LOD_TENSOR) { + // PADDLE_ENFORCE_EQ( + // param_dims, ctx->GetInputDim("Grad"), + // "Param and Grad input of AdamOp should have same dimension"); + // } + // PADDLE_ENFORCE_EQ( + // param_dims, ctx->GetInputDim("Moment1"), + // "Param and Moment1 input of AdamOp should have same dimension"); + // PADDLE_ENFORCE_EQ( + // param_dims, ctx->GetInputDim("Moment2"), + // "Param and Moment2 input of AdamOp should have same dimension"); ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("Moment1Out", param_dims); ctx->SetOutputDim("Moment2Out", param_dims); } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index e05885f5f5b..8df2e01b037 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -92,7 +92,8 @@ def cuda_profiler(output_file, output_mode=None, config=None): config_file = 'nvprof_config_file' with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) - core.nvprof_init(output_file, output_mode, config_file) + #Comment this for nvprof + #core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. 
core.nvprof_start() yield -- GitLab From 82726402be966ede1e15486d88f9a17c1d1b52b9 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 11 Dec 2018 19:27:49 +0800 Subject: [PATCH 0137/2367] exception safe --- .../details/parallel_ssa_graph_executor.cc | 51 +++++++++++++++---- .../details/parallel_ssa_graph_executor.h | 1 + paddle/fluid/framework/parallel_executor.cc | 15 ------ paddle/fluid/framework/threadpool.h | 1 - 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index dfb40721d88..f1a07edf088 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -34,32 +34,63 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( executors_.emplace_back(new details::ThreadedSSAGraphExecutor( strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); } - VLOG(1) << "pool size: " << places_.size(); } FeedFetchList ParallelSSAGraphExecutor::Run( const std::vector &fetch_tensors) { - std::vector> run_futures; - FeedFetchList fetch_data; + std::vector> run_futures; + + std::vector fetch_datas; + FeedFetchList ret; + + fetch_datas.reserve(places_.size()); + ret.reserve(fetch_tensors.size()); + exception_holder_.Clear(); for (size_t i = 0; i < places_.size(); ++i) { - auto call = [this, i] { - // FIXME(Yancey1989): need to fix fetch data failed. - std::vector empty; - executors_[i]->Run(empty); + auto call = [this, i, &fetch_tensors]() -> FeedFetchList { + return executors_[i]->Run(fetch_tensors); }; + if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - call(); + try { + fetch_datas.emplace_back(std::move(call())); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + break; + } } } + if (pool_) { for (auto &f : run_futures) { - f.wait(); + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + try { + fetch_datas.emplace_back(std::move(f.get())); + } catch (...) 
{ + exception_holder_.Catch(std::current_exception()); + } + } + } + } + if (exception_holder_.IsCaught()) { + exception_holder_.ReThrow(); + } + + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.reserve(local_scopes_.size()); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + lodtensor_ptrs.push_back(&fetch_datas.at(scope_idx).at(fetch_idx)); } + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); } - return fetch_data; + return ret; } } // namespace details diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index 37784775f03..bd777e41f85 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -44,6 +44,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { std::vector> graphs_; std::vector> executors_; + ExceptionHolder exception_holder_; }; } // namespace details diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2a9ca3e815b..82a7bd21859 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -202,21 +202,6 @@ ParallelExecutor::ParallelExecutor( } } } - /** - std::vector> var_infos_list; - for (size_t i = 0; i < graphs.size(); ++i) { - std::vector var_infos; - for (auto &node : graphs[i]->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - } - } - var_infos_list.push_back(std::move(var_infos)); - } - **/ // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 5177b7ee029..8fd834be9ac 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -14,7 +14,6 @@ limitations under the License. 
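The rewritten Run above is careful to drain every outstanding future even after a failure: the first exception is parked in the ExceptionHolder and rethrown only once all sub-executors have finished, so no worker is abandoned mid-write. A standalone reduction of that pattern with std::exception_ptr:

    #include <exception>
    #include <future>
    #include <iostream>
    #include <stdexcept>
    #include <vector>

    int main() {
      std::vector<std::future<int>> futures;
      for (int i = 0; i < 4; ++i) {
        futures.emplace_back(std::async(std::launch::async, [i] {
          if (i == 2) throw std::runtime_error("worker failed");
          return i;
        }));
      }
      std::exception_ptr first_error;  // plays the role of ExceptionHolder
      for (auto& f : futures) {
        try {
          f.get();  // always join, even after an earlier error
        } catch (...) {
          if (!first_error) first_error = std::current_exception();
        }
      }
      try {
        if (first_error) std::rethrow_exception(first_error);
      } catch (const std::exception& e) {
        std::cout << "first failure: " << e.what() << "\n";
      }
    }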
*/ #pragma once -#include #include // NOLINT #include #include // NOLINT -- GitLab From 5cc83f79bfa9516ef9c5f7f688f665deb47e0d07 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 11 Dec 2018 21:45:45 +0800 Subject: [PATCH 0138/2367] update by comment --- paddle/fluid/framework/parallel_executor.cc | 29 +++++++++++++-------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 82a7bd21859..b0cd1e8e908 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -110,23 +110,30 @@ ParallelExecutor::ParallelExecutor( // Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - std::unique_ptr nccl_id = nullptr; + ncclUniqueId *nccl_id = nullptr; bool need_group_call = true; - if (nccl_id_var != nullptr) { - nccl_id.reset(nccl_id_var->GetMutable()); - } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { - nccl_id.reset(new ncclUniqueId()); - PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id.get())); - *member_->global_scope_->Var(NCCL_ID_VARNAME) - ->GetMutable() = *nccl_id.get(); + if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + // parallel graph mode should initialize nccl by ncclCommInitRank since + // it call nccl operator per device per thread. + if (nccl_id_var == nullptr) { + nccl_id = new ncclUniqueId(); + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id)); + *member_->global_scope_->Var(NCCL_ID_VARNAME) + ->GetMutable() = *nccl_id; + } else { + nccl_id = nccl_id_var->GetMutable(); + } need_group_call = false; + } else if (nccl_id_var != nullptr) { // the other executor type. + // the distributed training with nccl mode would initialize the nccl id in + // startup_program. + nccl_id = nccl_id_var->GetMutable(); } else { - // init nccl_id in NCCLContextMap + // initlize NCCL by ncclCommInitAll, do not need nccl_id. 
} member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id.get(), num_trainers, trainer_id, - need_group_call)); + member_->places_, nccl_id, num_trainers, trainer_id, need_group_call)); #else PADDLE_THROW("Not compiled with CUDA"); #endif -- GitLab From 10ed9e0a6e3ab06e0b42172126bb8872828cbe60 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 11 Dec 2018 22:03:33 +0800 Subject: [PATCH 0139/2367] download & run & instance --- paddle/fluid/framework/async_executor.cc | 38 ++++++++++++++---------- paddle/fluid/framework/async_executor.h | 3 +- python/paddle/fluid/async_executor.py | 23 ++++++++------ 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 45a914b70ea..f0ca375f950 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -191,18 +191,19 @@ void AsyncExecutor::SaveModel(const std::string& path) { } } -void AsyncExecutor::PrepareDenseThread() { - DensePullThreadParam param; - param.ps_client = _pslib_ptr->_worker_ptr;; - param.threshold = 1;//GlobalConfig::instance().pull_dense_per_batch; //TODO - param.training_thread_num = actual_thread_num; - param.root_scope = root_scope_; - //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO - param.dense_params = &_param_config.dense_variable_name; - - _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); - _pull_dense_thread->start(); - +void AsyncExecutor::PrepareDenseThread(const std::string& mode) { + if (mode == "mpi") { + DensePullThreadParam param; + param.ps_client = _pslib_ptr->_worker_ptr;; + param.threshold = 1;//GlobalConfig::instance().pull_dense_per_batch; //TODO + param.training_thread_num = actual_thread_num; + param.root_scope = root_scope_; + //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO + param.dense_params = &_param_config.dense_variable_name; + + _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); + _pull_dense_thread->start(); + } } void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, @@ -210,6 +211,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::vector& filelist, const int thread_num, const std::vector& fetch_var_names, + const std::string& mode, const bool debug) { std::vector threads; @@ -251,11 +253,15 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // todo: should be factory method for creating datafeed std::vector> readers; PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); - PrepareDenseThread(); + PrepareDenseThread(mode); std::vector> workers; workers.resize(actual_thread_num); for (auto& worker : workers) { - worker.reset(new AsyncExecutorThreadWorker); + if (mode == "mpi") { + worker.reset(new AsyncExecutorThreadWorker); + } else { + worker.reset(new ExecutorThreadWorker); + } } // prepare thread resource here @@ -274,7 +280,9 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& th : threads) { th.join(); } - _pull_dense_thread->stop(); + if (mode == "mpi") { + _pull_dense_thread->stop(); + } root_scope_->DropKids(); return; diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 4b461262173..93010f8a9b0 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -61,6 +61,7 @@ class AsyncExecutor { const std::vector& filelist, const int thread_num, const 
std::vector& fetch_names,
+                   const std::string& mode,
                    const bool debug = false);
   //void ConfigPslib(const char* dist_desc, uint64_t* host_sign_list, int node_num, int index);
   void InitServer(const std::string& dist_desc, int index);
@@ -79,7 +80,7 @@ class AsyncExecutor {
                      const std::vector& fetch_var_names,
                      Scope* root_scope, const int thread_index,
                      const bool debug);
-  void PrepareDenseThread();
+  void PrepareDenseThread(const std::string& mode);
 public:
   std::shared_ptr _pslib_ptr;
   std::shared_ptr _pull_dense_thread;
diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py
index cce7ec5cca1..e760d58fd22 100644
--- a/python/paddle/fluid/async_executor.py
+++ b/python/paddle/fluid/async_executor.py
@@ -87,9 +87,8 @@ class AsyncExecutor(object):
         scope = global_scope()
         self.executor = core.AsyncExecutor(scope, p)
-        self.instance = ps_instance.PaddlePSInstance(1, 2)
-    def run(self, program, data_feed, filelist, thread_num, fetch, debug=False):
+    def run(self, program, data_feed, filelist, thread_num, fetch, mode="", debug=False):
         """
         Run program by this AsyncExecutor. Training dataset will be in filelist.
         Users can also inspect certain variables by naming them in parameter
@@ -151,10 +150,11 @@ class AsyncExecutor(object):
         self.executor.run_from_files(program_desc,
                                      data_feed.desc(), filelist, thread_num,
-                                     fetch_var_names, debug)
+                                     fetch_var_names, mode, debug)
     def download_data(self, afs_path, local_path, fs_default_name, ugi, process_num=12):
-        hadoop_home = "$HADOOP_HOME"
+        #hadoop_home = "$HADOOP_HOME"
+        hadoop_home = "~/tools/hadoop-xingtian/hadoop/"
         configs = {
             "fs.default.name": fs_default_name,
@@ -169,8 +169,11 @@ class AsyncExecutor(object):
             self.instance.get_worker_index(),
             self.instance.get_node_cnt() / 2,
             multi_processes=process_num)
+        self.instance.barrier_all() #wait for download_data #TODO only barrier worker
-    def config_distributed_nodes(self, dist_opt):
+    def config_distributed_nodes(self):
+        self.instance = ps_instance.PaddlePSInstance(1, 2)
+        return self.instance
         # get total rank
         # get rank index
@@ -196,11 +199,15 @@ class AsyncExecutor(object):
         self.executor.gather_servers(ips, self.instance.get_node_cnt())
         self.instance.barrier_all() #wait all worker start
         self.instance.barrier_all() #wait init model
-        self.instance.barrier_all() #wait for download_data
+        self.instance.barrier_all() #wait for download_data #TODO remove this after only barrier worker
         self.instance.barrier_all() #wait worker do all things
         self.instance.barrier_all() #sync
-    def init_worker(self, dist_desc, afs_path, local_path, fs_default_name, ugi):
+    def init_worker(self, dist_desc, startup_program):
+        place = core.CPUPlace()
+        executor = Executor(place)
+        executor.run(startup_program)
+
         self.instance.barrier_all() #wait all server start
         ips = self.instance.gather_ips()
         self.executor.init_worker(dist_desc, ips, self.instance.get_node_cnt(), self.instance._rankid)
@@ -208,8 +215,6 @@ class AsyncExecutor(object):
         if self.instance.is_first_worker():
             self.executor.init_model()
         self.instance.barrier_all() #wait init model
-        self.download_data(afs_path, local_path, fs_default_name, ugi, process_num=12)
-        self.instance.barrier_all() #wait for download_data
     def init_model(self):
         self.executor.init_model()
-- GitLab

From acc6ae49b18cb55db4dd84cd09069ebe01a1b54a Mon Sep 17 00:00:00 2001
From: Yihua Xu
Date: Wed, 12 Dec 2018 00:31:59 +0800
Subject: [PATCH 0140/2367] Fix the issue to run on AVX2 and AVX512F machines
 (#14851)

test=develop
---
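Patch 0139 above threads a mode string from the Python API down to AsyncExecutor::RunFromFile: "mpi" selects the pslib-backed AsyncExecutorThreadWorker plus a dense-pull thread, while any other value selects the plain ExecutorThreadWorker and skips the dense-pull machinery. A compilable sketch of that dispatch follows; both worker classes are stubs invented for the sketch, not the real Paddle types:

#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Stand-ins for the two worker types; only the virtual name differs here.
struct ExecutorThreadWorker {
  virtual ~ExecutorThreadWorker() {}
  virtual const char* Name() const { return "ExecutorThreadWorker"; }
};
struct AsyncExecutorThreadWorker : ExecutorThreadWorker {
  const char* Name() const override { return "AsyncExecutorThreadWorker"; }
};

std::vector<std::unique_ptr<ExecutorThreadWorker>> MakeWorkers(
    const std::string& mode, int thread_num) {
  std::vector<std::unique_ptr<ExecutorThreadWorker>> workers(thread_num);
  for (auto& w : workers) {
    if (mode == "mpi") {
      w.reset(new AsyncExecutorThreadWorker());  // pslib path
    } else {
      w.reset(new ExecutorThreadWorker());       // local path, no dense pull
    }
  }
  return workers;
}

int main() {
  for (auto& w : MakeWorkers("mpi", 2)) std::cout << w->Name() << "\n";
  for (auto& w : MakeWorkers("", 1)) std::cout << w->Name() << "\n";
  return 0;
}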
.../fluid/operators/math/jit_kernel_layer_norm.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc index fead13ebadc..cb49e66488b 100644 --- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc +++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc @@ -79,16 +79,16 @@ class LayerNormKernelImpl : public LayerNormKernel { } }; -#define INTRIAVX_FLOAT(isa, block) \ +#define INTRIAVX_FLOAT(isa, jit_block) \ template <> \ - LayerNormKernelImpl::LayerNormKernelImpl(int right) \ + LayerNormKernelImpl::LayerNormKernelImpl(int right) \ : LayerNormKernel() { \ this->num_ = right; \ this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ this->end_ = this->num_ - this->rest_; \ } \ template <> \ - void LayerNormKernelImpl::Compute( \ + void LayerNormKernelImpl::Compute( \ float* x, float* out, float* mean, float* var, const float* scale, \ const float* bias, int height, const float epsilon) const { \ __m256 sum; \ @@ -97,6 +97,7 @@ class LayerNormKernelImpl : public LayerNormKernel { __m256 tmp; \ size_t offset; \ size_t j; \ + size_t block = YMM_FLOAT_BLOCK; \ __m256 reverse_num_vec = \ _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(this->num_)); \ __m256 epsilon_vec = _mm256_set1_ps(epsilon); \ @@ -221,12 +222,14 @@ INTRIAVX_FLOAT(platform::avx, kEQ8); INTRIAVX_FLOAT(platform::avx, kGT8LT16); INTRIAVX_FLOAT(platform::avx, kEQ16); INTRIAVX_FLOAT(platform::avx, kGT16); -#endif -#ifdef __AVX2__ INTRIAVX_FLOAT(platform::avx2, kEQ8); INTRIAVX_FLOAT(platform::avx2, kGT8LT16); INTRIAVX_FLOAT(platform::avx2, kEQ16); INTRIAVX_FLOAT(platform::avx2, kGT16); +INTRIAVX_FLOAT(platform::avx512f, kEQ8); +INTRIAVX_FLOAT(platform::avx512f, kGT8LT16); +INTRIAVX_FLOAT(platform::avx512f, kEQ16); +INTRIAVX_FLOAT(platform::avx512f, kGT16); #endif #undef INTRIAVX_FLOAT -- GitLab From 28eb7d840c9baaf49303cada7fcc71f557abb78a Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Dec 2018 09:29:15 +0000 Subject: [PATCH 0141/2367] test all impls and all inplace cases --- paddle/fluid/operators/jit/helper.h | 53 ++++++++----- paddle/fluid/operators/jit/test.cc | 117 +++++++++++++++++++++------- 2 files changed, 121 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index c8da960a1e1..09a6bc3d9d7 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -28,33 +28,16 @@ namespace paddle { namespace operators { namespace jit { -// Refer code do not related with attr, and always on CPUPlace -template -inline Func GetRefer() { - auto& ref_pool = ReferKernelPool().Instance().AllKernels(); - KernelKey kkey(KT, platform::CPUPlace()); - auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); - auto& ref_impls = ref_iter->second; - for (auto& impl : ref_impls) { - auto i = dynamic_cast*>(impl.get()); - if (i) { - return i->GetFunc(); - } - } - return nullptr; -} - template -const Func Get(Attr attr) { + typename PlaceType> +inline const Func GetJitCode(Attr attr) { size_t key = JitCodeKey(attr); auto& codes = JitCodePool().Instance(); if (codes.Has(key)) { return codes.AllKernels().at(key)->template getCode(); } + // creator is not related with attr, so can use KernelKey as key KernelKey kkey(KT, PlaceType()); if (std::is_same::value) { // pool: (KernelKey(type, place), vector) @@ -73,8 +56,38 @@ const Func Get(Attr 
attr) { } } } + return nullptr; +} + +// Refer code do not related with attr, which is just for cast +// Refer is always on CPUPlace +template +inline Func GetRefer() { + auto& ref_pool = ReferKernelPool().Instance().AllKernels(); + KernelKey kkey(KT, platform::CPUPlace()); + auto ref_iter = ref_pool.find(kkey); + PADDLE_ENFORCE(ref_iter != ref_pool.end(), + "Every Kernel should have reference function."); + auto& ref_impls = ref_iter->second; + for (auto& impl : ref_impls) { + auto i = dynamic_cast*>(impl.get()); + if (i) { + return i->GetFunc(); + } + } + return nullptr; +} + +template +const Func Get(Attr attr) { + auto jitfunc = GetJitCode(attr); + if (jitfunc) { + return jitfunc; + } // pool: (KernelKey(type, place), vector) + KernelKey kkey(KT, PlaceType()); auto& pool = KernelPool().Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index e531ba1a2c4..e523089101f 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -55,46 +55,105 @@ void ExpectEQ(const T* target, const T* refer, int n) { } } +std::vector TestSizes() { + std::vector s; + for (int i = 1; i < 30; ++i) { + s.push_back(i); + } + // test some large size + s.push_back(100); + s.push_back(1000); + return s; +} + +template +void TestTartgetFunc(const Func tgt, const std::vector& x, + const std::vector& y, const std::vector& zref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(zref.size(), x.size()); + EXPECT_EQ(zref.size(), y.size()); + const T* x_data = x.data(); + const T* y_data = y.data(); + const T* zref_data = zref.data(); + const int d = zref.size(); + + std::vector ztgt(d); + T* ztgt_data = ztgt.data(); + // test normal + tgt(x_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ztgt.begin()); + tgt(ztgt_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace y + std::copy(y.begin(), y.end(), ztgt.begin()); + tgt(x_data, ztgt_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); +} + TEST(JitKernel, vmul) { using T = float; using PlaceType = paddle::platform::CPUPlace; - namespace jit = paddle::operators::jit; - // TODO(TJ): test more vector size - for (int d = 1; d < 30; ++d) { - auto ref = jit::GetRefer::func_type, + const auto KT = jit::vmul; + for (int d : TestSizes()) { + auto ref = jit::GetRefer::func_type, jit::VMulTuples::attr_type>(); - auto tgt = jit::Get::func_type, - jit::VMulTuples::attr_type, PlaceType>(d); EXPECT_TRUE(ref != nullptr); - EXPECT_TRUE(tgt != nullptr); - std::vector x(d), y(d); - std::vector zref(d), ztgt(d); + std::vector x(d), y(d), zref(d); RandomVec(d, x.data()); RandomVec(d, y.data()); - const float* x_data = x.data(); - const float* y_data = y.data(); - float* ztgt_data = ztgt.data(); - float* zref_data = zref.data(); - tgt(x_data, y_data, ztgt_data, d); + std::vector xinp(d), yinp(d); // inplace test + std::copy(x.begin(), x.end(), xinp.begin()); + std::copy(y.begin(), y.end(), yinp.begin()); + + const T* x_data = x.data(); + const T* y_data = y.data(); + T* zref_data = zref.data(); + T* xinp_data = xinp.data(); + T* yinp_data = yinp.data(); + + // test refer code inplace ref(x_data, y_data, zref_data, d); - ExpectEQ(ztgt_data, zref_data, d); - - // test inplace x - std::copy(x.begin(), x.end(), zref.begin()); - std::copy(x.begin(), x.end(), ztgt.begin()); - tgt(ztgt_data, y_data, ztgt_data, d); - ref(zref_data, y_data, zref_data, d); - 
ExpectEQ(ztgt_data, zref_data, d); - - // test inplace y - std::copy(y.begin(), y.end(), zref.begin()); - std::copy(y.begin(), y.end(), ztgt.begin()); - tgt(x_data, ztgt_data, ztgt_data, d); - ref(x_data, zref_data, zref_data, d); - ExpectEQ(ztgt_data, zref_data, d); + ref(x_data, yinp_data, yinp_data, d); + ref(xinp_data, y_data, xinp_data, d); + ExpectEQ(xinp_data, zref_data, d); + ExpectEQ(yinp_data, zref_data, d); + + // test jitcode + auto jitcode = jit::GetJitCode::func_type, + jit::VMulTuples::attr_type, PlaceType>(d); + if (jitcode) { + VLOG(10) << "Test jitcode, size: " << d; + TestTartgetFunc::func_type>(jitcode, x, y, zref); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = + dynamic_cast::func_type, + jit::VMulTuples::attr_type>*>( + impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel, size: " << d; + TestTartgetFunc::func_type>(more, x, y, zref); + } + } + } + // Test result from Get function + VLOG(10) << "Test Get function, size: " << d; + auto tgt = jit::Get::func_type, + jit::VMulTuples::attr_type, PlaceType>(d); + TestTartgetFunc::func_type>(tgt, x, y, zref); } } -- GitLab From d538513fce27c9442553b39a1cff9e482c922196 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Dec 2018 18:07:07 +0000 Subject: [PATCH 0142/2367] fix the compile error on mac --- paddle/fluid/operators/jit/gen_base.h | 6 +++--- paddle/fluid/operators/jit/helper.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 3b874cf2b01..586f4389c04 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -30,13 +30,13 @@ class GenBase : public Kernel { virtual const char* name() const = 0; virtual size_t getSize() const = 0; virtual const unsigned char* getCodeInternal() = 0; - template - const FUNC getCode() { + template + Func getCode() { const unsigned char* code = this->getCodeInternal(); if (FLAGS_dump_jitcode) { this->dumpCode(code); } - return reinterpret_cast(code); + return reinterpret_cast(const_cast(code)); } protected: diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 09a6bc3d9d7..b7580f6efb4 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -30,7 +30,7 @@ namespace jit { template -inline const Func GetJitCode(Attr attr) { +inline Func GetJitCode(Attr attr) { size_t key = JitCodeKey(attr); auto& codes = JitCodePool().Instance(); if (codes.Has(key)) { @@ -80,7 +80,7 @@ inline Func GetRefer() { template -const Func Get(Attr attr) { +Func Get(Attr attr) { auto jitfunc = GetJitCode(attr); if (jitfunc) { return jitfunc; -- GitLab From 1500c8e621d4d356d27c1483f7068839f8fb66f6 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 12 Dec 2018 09:58:04 +0800 Subject: [PATCH 0143/2367] is instance is None --- python/paddle/fluid/async_executor.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index e760d58fd22..2a6a11805e4 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -87,6 +87,7 @@ class AsyncExecutor(object): scope = global_scope() self.executor = core.AsyncExecutor(scope, p) 
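Stepping back to the jit::Get changes in patches 0141/0142 above: the lookup is now layered as generated jit code first, then any registered optimized ("more") kernel whose UseMe(attr) accepts the attribute, and finally the always-present reference kernel, which is exactly the order the new vmul test exercises. A simplified, runnable sketch of that fallback chain; MoreKernel and the pool here are invented stand-ins for the real registries:

#include <functional>
#include <iostream>
#include <vector>

using Func = std::function<void(const float*, const float*, float*, int)>;

// Reference implementation: always available, never the fastest.
static void RefVMul(const float* x, const float* y, float* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] * y[i];
}

struct MoreKernel {
  std::function<bool(int)> use_me;  // plays the role of KernelMore::UseMe
  Func func;
};

// Lookup order as in helper.h: jit code -> "more" kernels -> refer.
Func Get(int d, const std::vector<MoreKernel>& pool) {
  Func jit;  // pretend no jit code was generated for this attribute
  if (jit) return jit;
  for (const auto& k : pool)
    if (k.use_me(d)) return k.func;
  return RefVMul;
}

int main() {
  std::vector<MoreKernel> pool;
  MoreKernel vec8;  // e.g. a vectorized kernel that only handles 8x sizes
  vec8.use_me = [](int d) { return d % 8 == 0; };
  vec8.func = RefVMul;  // the sketch reuses the scalar body
  pool.push_back(vec8);

  float x[8] = {1, 2, 3, 4, 5, 6, 7, 8}, y[8], z[8];
  for (int i = 0; i < 8; ++i) y[i] = 2.0f;
  Get(8, pool)(x, y, z, 8);
  std::cout << z[0] << " " << z[7] << "\n";  // prints: 2 16
  return 0;
}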
+ self.instance = None def run(self, program, data_feed, filelist, thread_num, fetch, mode="", debug=False): """ @@ -154,6 +155,9 @@ class AsyncExecutor(object): def download_data(self, afs_path, local_path, fs_default_name, ugi, process_num=12): #hadoop_home = "$HADOOP_HOME" + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') + hadoop_home = "~/tools/hadoop-xingtian/hadoop/" configs = { @@ -182,15 +186,21 @@ class AsyncExecutor(object): pass def get_instance(self): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') return self.instance def stop_server(self): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') self.instance.barrier_all() #worker do all things if self.instance.is_first_worker(): self.executor.stop_server() self.instance.barrier_all() #sync def init_server(self, dist_desc): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') self.executor.init_server(dist_desc, self.instance._rankid) ip = self.executor.start_server() self.instance.set_ip(ip) @@ -204,6 +214,8 @@ class AsyncExecutor(object): self.instance.barrier_all() #sync def init_worker(self, dist_desc, startup_program): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') place = core.CPUPlace() executor = Executor(place) executor.run(startup_program) @@ -217,8 +229,12 @@ class AsyncExecutor(object): self.instance.barrier_all() #wait init model def init_model(self): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') self.executor.init_model() def save_model(self, save_path): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') self.executor.save_model(save_path) -- GitLab From 7ec3264b513270fa7a70c2b5fec2166630568a2c Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 12 Dec 2018 10:56:00 +0800 Subject: [PATCH 0144/2367] fix API spec. 
test=develop --- paddle/fluid/API.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 87ed586aad9..845abe7d5b8 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -77,8 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) -paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) -- GitLab From 9bd70a1e0433b5a930c43b1d7d2af67bc72d38a6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Dec 2018 16:32:42 +0800 Subject: [PATCH 0145/2367] Change tensor uses proto::VarType::type test=develop --- .../fluid/framework/data_layout_transform.cc | 6 +- .../fluid/framework/data_layout_transform.h | 16 ++-- paddle/fluid/framework/data_type.cc | 24 ++---- paddle/fluid/framework/data_type.h | 77 +++++++++++-------- paddle/fluid/framework/data_type_test.cc | 6 +- .../framework/details/all_reduce_op_handle.cc | 2 +- .../framework/details/fuse_vars_op_handle.h | 4 +- .../framework/details/reduce_op_handle.cc | 4 +- paddle/fluid/framework/dlpack_tensor.cc | 37 ++++----- paddle/fluid/framework/dlpack_tensor_test.cc | 20 +---- .../fluid/framework/executor_thread_worker.cc | 46 ++++------- paddle/fluid/framework/lod_tensor.cc | 6 +- paddle/fluid/framework/operator.cc | 14 ++-- 
paddle/fluid/framework/selected_rows.cc | 4 +- paddle/fluid/framework/tensor.cc | 4 +- paddle/fluid/framework/tensor.h | 9 ++- paddle/fluid/framework/tensor_impl.h | 12 ++- paddle/fluid/framework/tensor_util.cc | 10 +-- .../fluid/inference/api/analysis_predictor.cc | 4 +- paddle/fluid/inference/api/api_impl.cc | 4 +- paddle/fluid/inference/api/api_impl_tester.cc | 4 +- paddle/fluid/operators/affine_grid_op.cc | 8 +- paddle/fluid/operators/arg_max_op.cc | 1 - paddle/fluid/operators/arg_max_op.cu | 2 - paddle/fluid/operators/arg_min_op.cc | 1 - paddle/fluid/operators/arg_min_op.cu | 2 - .../fluid/operators/array_to_lod_tensor_op.cc | 4 +- paddle/fluid/operators/attention_lstm_op.cc | 5 +- .../fluid/operators/average_accumulates_op.cc | 5 +- paddle/fluid/operators/batch_norm_op.cc | 20 ++--- .../fluid/operators/beam_search_decode_op.cc | 2 +- paddle/fluid/operators/beam_search_op.cc | 3 +- paddle/fluid/operators/bpr_loss_op.cc | 10 +-- .../controlflow/conditional_block_op.cc | 13 ++-- .../fluid/operators/controlflow/while_op.cc | 2 +- paddle/fluid/operators/conv_op.cc | 12 ++- paddle/fluid/operators/conv_transpose_op.cc | 10 +-- paddle/fluid/operators/crf_decoding_op.cc | 5 +- paddle/fluid/operators/crop_op.cc | 9 +-- paddle/fluid/operators/cross_entropy_op.cc | 10 +-- paddle/fluid/operators/ctc_align_op.cc | 5 +- .../detection/anchor_generator_op.cc | 3 +- .../operators/detection/bipartite_match_op.cc | 5 +- .../detection/density_prior_box_op.cc | 3 +- .../detection/generate_proposals_op.cc | 5 +- .../detection/mine_hard_examples_op.cc | 3 +- .../operators/detection/multiclass_nms_op.cc | 3 +- .../fluid/operators/detection/prior_box_op.cc | 3 +- .../detection/roi_perspective_transform_op.cc | 10 +-- .../detection/rpn_target_assign_op.cc | 3 +- .../operators/detection/target_assign_op.cc | 5 +- paddle/fluid/operators/detection_map_op.cc | 3 +- .../operators/elementwise/elementwise_op.h | 4 +- paddle/fluid/operators/fake_quantize_op.cc | 10 +-- paddle/fluid/operators/fc_op.cc | 10 +-- paddle/fluid/operators/fill_constant_op.cc | 4 +- paddle/fluid/operators/fill_op.cc | 4 +- .../fused/fused_elemwise_activation_op.cc | 10 +-- .../fused/fused_embedding_fc_lstm_op.cc | 3 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 5 +- .../fluid/operators/fused/fusion_lstm_op.cc | 5 +- .../fused/fusion_seqconv_eltadd_relu_op.cc | 5 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 5 +- paddle/fluid/operators/gather_op.cc | 10 +-- paddle/fluid/operators/grid_sampler_op.cc | 12 +-- paddle/fluid/operators/group_norm_op.cc | 3 +- .../operators/hierarchical_sigmoid_op.cc | 10 +-- paddle/fluid/operators/interpolate_op.cc | 8 +- paddle/fluid/operators/is_empty_op.cc | 3 +- paddle/fluid/operators/isfinite_op.cc | 5 +- paddle/fluid/operators/layer_norm_op.cc | 3 +- paddle/fluid/operators/linear_chain_crf_op.cc | 9 +-- paddle/fluid/operators/load_combine_op.cc | 2 +- paddle/fluid/operators/load_op.cc | 2 +- paddle/fluid/operators/lod_reset_op.cc | 10 +-- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- .../fluid/operators/lookup_sparse_table_op.cc | 3 +- paddle/fluid/operators/lrn_op.cc | 5 +- paddle/fluid/operators/lstm_op.cc | 6 +- paddle/fluid/operators/lstmp_op.cc | 6 +- paddle/fluid/operators/math/math_function.cc | 6 +- paddle/fluid/operators/math/math_function.cu | 2 +- paddle/fluid/operators/mean_iou_op.cc | 5 +- paddle/fluid/operators/mean_op.cc | 4 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 4 +- paddle/fluid/operators/metrics/accuracy_op.cc | 5 +- paddle/fluid/operators/metrics/auc_op.cc | 5 +- 
.../operators/metrics/precision_recall_op.cc | 5 +- paddle/fluid/operators/multiplex_op.cc | 10 +-- paddle/fluid/operators/nce_op.cc | 10 +-- .../fluid/operators/optimizers/adadelta_op.cc | 5 +- .../fluid/operators/optimizers/adagrad_op.cc | 5 +- paddle/fluid/operators/optimizers/adam_op.cc | 3 +- .../fluid/operators/optimizers/adamax_op.cc | 5 +- .../optimizers/decayed_adagrad_op.cc | 5 +- paddle/fluid/operators/optimizers/ftrl_op.cc | 3 +- .../optimizers/proximal_adagrad_op.cc | 5 +- .../operators/optimizers/proximal_gd_op.cc | 5 +- paddle/fluid/operators/pad2d_op.cc | 8 +- .../fluid/operators/pad_constant_like_op.cc | 10 +-- paddle/fluid/operators/pool_op.cc | 7 +- paddle/fluid/operators/pool_with_index_op.cc | 10 +-- .../operators/positive_negative_pair_op.cc | 5 +- paddle/fluid/operators/prelu_op.cc | 10 +-- paddle/fluid/operators/print_op.cc | 2 +- paddle/fluid/operators/random_crop_op.cc | 5 +- .../reader/create_batch_reader_op.cc | 4 +- paddle/fluid/operators/recurrent_op.cc | 2 +- paddle/fluid/operators/reshape_op.cc | 14 ++-- .../fluid/operators/rnn_memory_helper_op.cc | 2 +- paddle/fluid/operators/roi_align_op.cc | 10 +-- paddle/fluid/operators/roi_pool_op.cc | 10 +-- paddle/fluid/operators/save_combine_op.cc | 2 +- paddle/fluid/operators/save_op.cc | 2 +- paddle/fluid/operators/scatter_op.cc | 10 +-- .../sequence_ops/sequence_pool_op.cc | 5 +- .../sequence_ops/sequence_scatter_op.cc | 10 +-- .../sequence_ops/sequence_slice_op.cc | 10 +-- .../sequence_ops/sequence_softmax_op.cc | 4 +- paddle/fluid/operators/similarity_focus_op.cc | 5 +- paddle/fluid/operators/slice_op.cc | 5 +- paddle/fluid/operators/softmax_op.cc | 7 +- .../softmax_with_cross_entropy_op.cc | 8 +- paddle/fluid/operators/sum_op.cc | 13 ++-- .../operators/tensorrt/tensorrt_engine_op.h | 5 +- paddle/fluid/operators/transpose_op.cc | 9 +-- paddle/fluid/operators/unpool_op.cc | 10 +-- paddle/fluid/operators/warpctc_op.cc | 10 +-- paddle/fluid/operators/yolov3_loss_op.cc | 10 +-- paddle/fluid/platform/nccl_helper.h | 11 +-- paddle/fluid/pybind/pybind.cc | 2 +- paddle/fluid/pybind/tensor_py.h | 2 +- 132 files changed, 407 insertions(+), 576 deletions(-) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 5467f6d1b23..72c50518af0 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -85,7 +85,7 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var, out->mutable_data(expected_kernel_type.place_, in.type()); framework::VisitDataType( - framework::ToDataType(in.type()), + in.type(), CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out)); out->set_layout(expected_kernel_type.data_layout_); @@ -101,7 +101,7 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { case mkldnn::memory::data_type::f32: return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s8: - return platform::to_void_cast(tensor.data()); + return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::u8: return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s16: @@ -144,7 +144,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, memory::data_type in_type = ToMKLDNNDataType(in.type()); PADDLE_ENFORCE(in_type != memory::data_type::data_undef, - "Input tensor type is not supported: ", in.type().name()); + "Input tensor type is not supported: %s", in.type()); memory::data_type out_type = in_type; 
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 90bb206ec6b..2479de4fd46 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -50,14 +50,14 @@ inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { } } -inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { - static const std::map dict{ - {std::type_index(typeid(float)), MKLDNNDataType::f32}, // NOLINT - {std::type_index(typeid(char)), MKLDNNDataType::s8}, // NOLINT - {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8}, - {std::type_index(typeid(int16_t)), MKLDNNDataType::s16}, - {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}}; - auto iter = dict.find(type); +inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { + static std::unordered_map dict{ + {DataTypeTrait::DataType, MKLDNNDataType::f32}, + {DataTypeTrait::DataType, MKLDNNDataType::s8}, + {DataTypeTrait::DataType, MKLDNNDataType::u8}, + {DataTypeTrait::DataType, MKLDNNDataType::s16}, + {DataTypeTrait::DataType, MKLDNNDataType::s32}}; + auto iter = dict.find(static_cast(type)); if (iter != dict.end()) return iter->second; return MKLDNNDataType::data_undef; } diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 28f3da88fa1..a0248cf3c75 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -26,7 +26,7 @@ struct DataTypeMap { std::unordered_map cpp_to_proto_; std::unordered_map proto_to_cpp_; std::unordered_map proto_to_str_; - std::unordered_map cpp_to_size_; + std::unordered_map proto_to_size_; }; static DataTypeMap* InitDataTypeMap(); @@ -45,7 +45,7 @@ static inline void RegisterType(DataTypeMap* map, map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); map->cpp_to_proto_.emplace(typeid(T), proto_type); map->proto_to_str_.emplace(static_cast(proto_type), name); - map->cpp_to_size_.emplace(typeid(T), sizeof(T)); + map->proto_to_size_.emplace(static_cast(proto_type), sizeof(T)); } static DataTypeMap* InitDataTypeMap() { @@ -54,17 +54,7 @@ static DataTypeMap* InitDataTypeMap() { #define RegType(cc_type, proto_type) \ RegisterType(retv, proto_type, #cc_type) - // NOTE: Add your customize type here. 
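The ToMKLDNNDataType rewrite above swaps a std::type_index-keyed map for one keyed on the proto::VarType enum value, with lookup misses mapping to data_undef. A self-contained sketch of the same pattern; both enums below are stand-ins with made-up values, not the real proto or MKL-DNN constants:

#include <iostream>
#include <unordered_map>

enum class ProtoType : int { FP32 = 0, INT8 = 1, UINT8 = 2, INT32 = 3, INT64 = 4 };
enum class MKLDNNType { data_undef, f32, s8, u8, s32 };

MKLDNNType ToMKLDNNDataType(ProtoType type) {
  // Keyed on the enum's integer value, mirroring the rewritten dict.
  static const std::unordered_map<int, MKLDNNType> dict{
      {static_cast<int>(ProtoType::FP32), MKLDNNType::f32},
      {static_cast<int>(ProtoType::INT8), MKLDNNType::s8},
      {static_cast<int>(ProtoType::UINT8), MKLDNNType::u8},
      {static_cast<int>(ProtoType::INT32), MKLDNNType::s32}};
  auto it = dict.find(static_cast<int>(type));
  return it == dict.end() ? MKLDNNType::data_undef : it->second;
}

int main() {
  std::cout << (ToMKLDNNDataType(ProtoType::FP32) == MKLDNNType::f32)
            << (ToMKLDNNDataType(ProtoType::INT64) == MKLDNNType::data_undef)
            << "\n";  // prints: 11
  return 0;
}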
- RegType(float16, proto::VarType::FP16); - RegType(float, proto::VarType::FP32); - RegType(double, proto::VarType::FP64); - RegType(int, proto::VarType::INT32); - RegType(int64_t, proto::VarType::INT64); - RegType(bool, proto::VarType::BOOL); - RegType(size_t, proto::VarType::SIZE_T); - RegType(int16_t, proto::VarType::INT16); - RegType(uint8_t, proto::VarType::UINT8); - RegType(int8_t, proto::VarType::INT8); + _ForEachDataType_(RegType); #undef RegType return retv; @@ -96,12 +86,12 @@ std::string DataTypeToString(const proto::VarType::Type type) { static_cast(type)); } -size_t SizeOfType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_size_.find(type); - if (it != gDataTypeMap().cpp_to_size_.end()) { +size_t SizeOfType(proto::VarType::Type type) { + auto it = gDataTypeMap().proto_to_size_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_size_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", type.name()); + PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type)); } } // namespace framework diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index d5be43b33ed..76df78ea5e1 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -22,46 +22,59 @@ limitations under the License. */ namespace paddle { namespace framework { +template +struct DataTypeTrait {}; + +// Stub handle for void +template <> +struct DataTypeTrait { + constexpr static auto DataType = proto::VarType::RAW; +}; + +#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ + callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); + +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8) + +#define DefineDataTypeTrait(cpp_type, proto_type) \ + template <> \ + struct DataTypeTrait { \ + constexpr static auto DataType = proto_type; \ + } + +_ForEachDataType_(DefineDataTypeTrait); + +#undef DefineDataTypeTrait + extern proto::VarType::Type ToDataType(std::type_index type); extern std::type_index ToTypeIndex(proto::VarType::Type type); template inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { - switch (type) { - case proto::VarType::FP16: - visitor.template apply(); - break; - case proto::VarType::FP32: - visitor.template apply(); - break; - case proto::VarType::FP64: - visitor.template apply(); - break; - case proto::VarType::INT32: - visitor.template apply(); - break; - case proto::VarType::INT64: - visitor.template apply(); - break; - case proto::VarType::BOOL: - visitor.template apply(); - break; - case proto::VarType::UINT8: - visitor.template apply(); - break; - case proto::VarType::INT16: - visitor.template apply(); - break; - case proto::VarType::INT8: - visitor.template apply(); - break; - default: - PADDLE_THROW("Not supported %d", type); - } +#define VisitDataTypeCallback(cpp_type, proto_type) \ + do { \ + if (type == proto_type) { \ + visitor.template apply(); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(VisitDataTypeCallback); +#undef VisitDataTypeCallback + 
PADDLE_THROW("Not supported %d", type); } extern std::string DataTypeToString(const proto::VarType::Type type); -extern size_t SizeOfType(std::type_index type); +extern size_t SizeOfType(proto::VarType::Type type); inline std::ostream& operator<<(std::ostream& out, const proto::VarType::Type& type) { out << DataTypeToString(type); diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 54c41c55ba6..92639dfc611 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -26,13 +26,13 @@ TEST(DataType, float16) { Tensor tensor; CPUPlace cpu; - tensor.mutable_data(cpu, f::ToTypeIndex(dtype)); + tensor.mutable_data(cpu, dtype); // test fp16 tensor - EXPECT_EQ(tensor.type(), std::type_index(typeid(float16))); + EXPECT_EQ(tensor.type(), f::ToDataType(typeid(float16))); // test fp16 size - EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u); + EXPECT_EQ(f::SizeOfType(dtype), 2u); // test debug info std::string type = "float16"; diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index e8bf53e160e..9eaff1f5601 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() { // Reduce All Tensor to trg in CPU ReduceLoDTensor func(lod_tensors, &trg); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); for (size_t i = 1; i < local_scopes_.size(); ++i) { auto &scope = diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h index 3f360c510a4..b40b01df364 100644 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.h +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h @@ -33,7 +33,7 @@ struct FuseVarsOpHandle : public OpHandleBase { FuseVarsOpHandle(ir::Node *node, Scope *local_scope, const platform::Place &place, const std::unordered_map &inputs_numel, - const std::type_index &var_type) + const proto::VarType::Type var_type) : OpHandleBase(node), local_scope_(local_scope), place_(place), @@ -57,7 +57,7 @@ struct FuseVarsOpHandle : public OpHandleBase { Scope *local_scope_; const platform::Place place_; const std::unordered_map inputs_numel_; - const std::type_index type_; + const proto::VarType::Type type_; int64_t total_numel_; }; } // namespace details diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index cb864848b93..85d8abc9100 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -246,7 +246,7 @@ void ReduceOpHandle::RunImpl() { if (!FLAGS_cpu_deterministic) { ReduceLoDTensor func(lod_tensors, out_var->GetMutable()); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); } else { // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0 // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0. 
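The data_type.h machinery above is a classic X-macro: a single _ForEachDataType_ list of (C++ type, proto enum) pairs expands into both the DataTypeTrait specializations and the if-chain inside VisitDataType, so supporting a new type means touching one line. A compilable miniature of the idea; the enum, trait and visitor below are simplified stand-ins, not the Paddle definitions:

#include <cstdint>
#include <iostream>

enum class DType { FP32, FP64, INT32, INT64 };  // simplified proto stand-in

template <typename T>
struct DataTypeTrait;  // only the specializations below exist

// The single source of truth: every (C++ type, enum) pair lives here.
#define FOR_EACH_DTYPE(cb)   \
  cb(float, DType::FP32);    \
  cb(double, DType::FP64);   \
  cb(int32_t, DType::INT32); \
  cb(int64_t, DType::INT64)

#define DEFINE_TRAIT(cpp_type, dtype)     \
  template <>                             \
  struct DataTypeTrait<cpp_type> {        \
    static constexpr DType kType = dtype; \
  }
FOR_EACH_DTYPE(DEFINE_TRAIT);
#undef DEFINE_TRAIT

template <typename Visitor>
void VisitDataType(DType type, Visitor visitor) {
#define VISIT_CASE(cpp_type, dtype)       \
  do {                                    \
    if (type == dtype) {                  \
      visitor.template apply<cpp_type>(); \
      return;                             \
    }                                     \
  } while (0)
  FOR_EACH_DTYPE(VISIT_CASE);
#undef VISIT_CASE
}

struct SizePrinter {
  template <typename T>
  void apply() { std::cout << sizeof(T) << "\n"; }
};

int main() {
  static_assert(DataTypeTrait<float>::kType == DType::FP32, "trait works");
  VisitDataType(DType::INT64, SizePrinter());  // prints: 8
  return 0;
}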
@@ -256,7 +256,7 @@ void ReduceOpHandle::RunImpl() { ->FindVar(out_var_handle->name_) ->GetMutable(); ReduceLoDTensor func(lod_tensors, &reduce_sum_trg); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); auto trg = out_var->GetMutable(); if (reduce_sum_trg.data() != trg->data()) { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 04e3f78afe4..eaef093ed3b 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/dlpack_tensor.h" - +#include "paddle/fluid/framework/data_type.h" namespace paddle { namespace framework { @@ -36,26 +36,23 @@ static ::DLDataType GetDLDataTypeCode() { return dtype; } -static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { -#define REG_DL_DATA_TYPE(type) \ - { std::type_index(typeid(type)), GetDLDataTypeCode() } - static const std::unordered_map - type_to_dtype_map({ - REG_DL_DATA_TYPE(platform::float16), // NOLINT - REG_DL_DATA_TYPE(float), // NOLINT - REG_DL_DATA_TYPE(double), // NOLINT - REG_DL_DATA_TYPE(int), // NOLINT - REG_DL_DATA_TYPE(int64_t), // NOLINT - REG_DL_DATA_TYPE(bool), // NOLINT - REG_DL_DATA_TYPE(size_t), // NOLINT - REG_DL_DATA_TYPE(int16_t), // NOLINT - REG_DL_DATA_TYPE(uint8_t), // NOLINT - REG_DL_DATA_TYPE(int8_t) // NOLINT - }); +static std::unordered_map CreateDLDataTypeMap() { + static std::unordered_map result; + +#define REG_DL_DATA_TYPE(cpp_type, proto_type) \ + result[static_cast(proto_type)] = GetDLDataTypeCode() + + _ForEachDataType_(REG_DL_DATA_TYPE); +#undef REG_DL_DATA_TYPE + return result; +} + +static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { + static auto type_to_dtype_map = CreateDLDataTypeMap(); static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); - auto it = type_to_dtype_map.find(type); - PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s", - type.name()); + auto it = type_to_dtype_map.find(static_cast(type)); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %d", + type); return it->second; #undef REG_DL_DATA_TYPE } diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 938b0563500..c0a8e1bcdfa 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -91,23 +91,11 @@ void TestMainLoop() { } } } +TEST(dlpack, test_all) { +#define TestCallback(cpp_type, proto_type) TestMainLoop() -#define PADDLE_DLPACK_TEST(type) \ - TEST(dlpack, test_##type) { TestMainLoop(); } - -using float16 = platform::float16; -PADDLE_DLPACK_TEST(float16); -PADDLE_DLPACK_TEST(float); -PADDLE_DLPACK_TEST(double); -PADDLE_DLPACK_TEST(int); -PADDLE_DLPACK_TEST(int64_t); -PADDLE_DLPACK_TEST(bool); -PADDLE_DLPACK_TEST(size_t); -PADDLE_DLPACK_TEST(int16_t); -PADDLE_DLPACK_TEST(uint8_t); -PADDLE_DLPACK_TEST(int8_t); - -#undef PADDLE_DLPACK_TEST + _ForEachDataType_(TestCallback); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 3d535116155..f03f39dfc6d 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -138,39 +138,19 @@ void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) { std::cout << sstream.str() << std::endl; } 
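In dlpack_tensor.cc above, the per-type DLDataType records are likewise derived generically: GetDLDataTypeCode infers the code/bits/lanes triple from the C++ type, and the proto-enum-keyed map is assembled by the same _ForEachDataType_ macro. A sketch of that derivation using standard type traits; DLType and the code constants here are stubs, not the real dlpack ABI:

#include <cstdint>
#include <iostream>
#include <type_traits>

// Stubs shaped like dlpack's DLDataType, not the real header.
enum DLCode : uint8_t { kDLInt = 0, kDLUInt = 1, kDLFloat = 2 };
struct DLType { uint8_t code; uint8_t bits; uint16_t lanes; };

template <typename T>
DLType GetDLDataTypeCode() {
  DLType t;
  if (std::is_floating_point<T>::value) {
    t.code = kDLFloat;
  } else if (std::is_unsigned<T>::value) {
    t.code = kDLUInt;
  } else {
    t.code = kDLInt;  // remaining integral types are treated as signed
  }
  t.bits = static_cast<uint8_t>(8 * sizeof(T));
  t.lanes = 1;
  return t;
}

int main() {
  DLType f = GetDLDataTypeCode<float>();
  DLType u = GetDLDataTypeCode<uint8_t>();
  std::cout << int(f.code) << "/" << int(f.bits) << " "
            << int(u.code) << "/" << int(u.bits) << "\n";  // prints: 2/32 1/8
  return 0;
}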
-void print_fetch_var(Scope* scope, std::string var_name) { - const LoDTensor& tensor = scope->FindVar(var_name)->Get(); - - if (std::type_index(tensor.type()) == - std::type_index(typeid(platform::float16))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(double))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int64_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(uint8_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int16_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int8_t))) { - print_lod_tensor(var_name, tensor); - } else { - VLOG(1) << "print_fetch_var: unrecognized data type:" - << tensor.type().name(); - } - - return; +static void print_fetch_var(Scope* scope, const std::string& var_name) { + auto& tensor = scope->FindVar(var_name)->Get(); + +#define PrintLoDTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor.type() == proto_type) { \ + print_lod_tensor(var_name, tensor); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(PrintLoDTensorCallback); + VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); } void ExecutorThreadWorker::TrainFiles() { diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 9b2eeaf59a5..6c8bec32de2 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -70,9 +70,9 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? 
t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - if (IsType(t.type())) { + if (t.type() == proto::VarType::FP32) { os << t.data()[i] << " "; - } else if (IsType(t.type())) { + } else if (t.type() == proto::VarType::INT64) { os << t.data()[i] << " "; } else { PADDLE_THROW("LoDTensor data type not in [float, int64_t]"); @@ -387,7 +387,7 @@ void LoDTensor::MergeLoDTensor( PADDLE_ENFORCE(!lod_tensors.empty()); framework::DDim new_dim = lod_tensors[0]->dims(); - std::type_index new_type = lod_tensors[0]->type(); + auto new_type = lod_tensors[0]->type(); framework::DataLayout new_layout = lod_tensors[0]->layout(); LoD new_lod = lod_tensors[0]->lod(); for (size_t i = 1; i < lod_tensors.size(); ++i) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f7..05ab48412a8 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -43,10 +43,9 @@ std::vector> kKernelPriority = { proto::VarType::Type GetDataTypeOfVar(const Variable* var) { if (var->IsType()) { - return framework::ToDataType(var->Get().type()); + return var->Get().type(); } else if (var->IsType()) { - return framework::ToDataType( - var->Get().value().type()); + return var->Get().value().type(); } else { PADDLE_THROW("Var should be LoDTensor or SelectedRows"); } @@ -93,13 +92,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) { if (UNLIKELY(!tensor.IsInitialized())) { return ""; } - return DataTypeToString(ToDataType(tensor.type())); + return DataTypeToString(tensor.type()); } else if (var->IsType()) { auto tensor = var->Get().value(); if (UNLIKELY(!tensor.IsInitialized())) { return "uninited"; } else { - return DataTypeToString(ToDataType(tensor.type())); + return DataTypeToString(tensor.type()); } } else { return ""; @@ -686,7 +685,8 @@ static void CheckTensorNANOrInf(const std::string& name, if (tensor.memory_size() == 0) { return; } - if (!IsType(tensor.type()) && !IsType(tensor.type())) { + if (tensor.type() != proto::VarType::FP32 && + tensor.type() != proto::VarType::FP64) { return; } PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), @@ -879,7 +879,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - int tmp = static_cast(ToDataType(t->type())); + int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 62a30815d4f..54a818250b4 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -218,11 +218,11 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, if (index < 0) { VLOG(5) << "id " << id << " not in the table, return 0"; framework::VisitDataType( - framework::ToDataType(value_->type()), + value_->type(), TensorFillVisitor(value, i * value_width, value_width, 0.0)); } else { framework::VisitDataType( - framework::ToDataType(value_->type()), + value_->type(), TensorCopyVisitor(value, i * value_width, *value_.get(), index * value_width, value_width)); } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 41566800e57..57335847a19 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -extern size_t SizeOfType(std::type_index type); +extern size_t SizeOfType(proto::VarType::Type type); void Tensor::check_memory_size() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); @@ -31,7 +31,7 @@ size_t Tensor::memory_size() const { return holder_ == nullptr ? 0UL : holder_->size() - offset_; } -void* Tensor::mutable_data(platform::Place place, std::type_index type, +void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type, memory::Allocator::Attr attr, size_t requested_size) { type_ = type; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 71e8badd4b6..057fe1f98cd 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -67,7 +68,7 @@ class Tensor { friend struct EigenVector; public: - Tensor() : type_(typeid(float)), offset_(0) {} + Tensor() : type_(proto::VarType::FP32), offset_(0) {} /*! Return a pointer to mutable memory block. */ template @@ -88,7 +89,7 @@ class Tensor { memory::Allocator::Attr attr = memory::Allocator::kDefault, size_t requested_size = 0); - void* mutable_data(platform::Place place, std::type_index type, + void* mutable_data(platform::Place place, proto::VarType::Type type, memory::Allocator::Attr attr = memory::Allocator::kDefault, size_t requested_size = 0); @@ -138,7 +139,7 @@ class Tensor { return holder_->place(); } - std::type_index type() const { + proto::VarType::Type type() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor not initialized yet when Tensor::type() is called."); return type_; @@ -161,7 +162,7 @@ class Tensor { private: /*! holds the memory block if allocated. */ std::shared_ptr holder_; - std::type_index type_; + proto::VarType::Type type_; /** * @brief points to elements dimensions. 
* diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 0c9c0d782fc..ce3ad18b1fb 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -24,9 +24,8 @@ template inline const T* Tensor::data() const { check_memory_size(); bool valid = - std::is_same::value || type_ == std::type_index(typeid(T)); - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", - type_.name()); + std::is_same::value || type_ == DataTypeTrait::DataType; + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); @@ -38,9 +37,8 @@ template inline T* Tensor::data() { check_memory_size(); bool valid = - std::is_same::value || type_ == std::type_index(typeid(T)); - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", - type_.name()); + std::is_same::value || type_ == DataTypeTrait::DataType; + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", type_); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } @@ -60,7 +58,7 @@ inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); return reinterpret_cast( - mutable_data(place, typeid(T), attr, requested_size)); + mutable_data(place, DataTypeTrait::DataType, attr, requested_size)); } inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index ca1e01c89f0..85d15c5d3fa 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -186,8 +186,8 @@ struct AnyDTypeVisitor { template inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, const DevCtx& ctx, framework::Tensor* out) { - VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( - predicate, tensor, ctx, out)); + VisitDataType(tensor.type(), AnyDTypeVisitor( + predicate, tensor, ctx, out)); } template @@ -379,7 +379,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, // int32_t size // void* protobuf message proto::VarType::TensorDesc desc; - desc.set_data_type(framework::ToDataType(tensor.type())); + desc.set_data_type(tensor.type()); auto dims = framework::vectorize(tensor.dims()); auto* pb_dims = desc.mutable_dims(); pb_dims->Resize(static_cast(dims.size()), 0); @@ -461,9 +461,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, tensor->Resize(framework::make_ddim(dims)); void* buf; auto ctx = platform::CPUDeviceContext(); - size_t size = - tensor->numel() * - framework::SizeOfType(framework::ToTypeIndex(desc.data_type())); + size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA Tensor cpu_tensor; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index be51e7fc1f0..c751e851582 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -289,10 +289,10 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, auto type = fetch.type(); auto output = &(outputs->at(i)); output->name = fetchs_[idx]->Input("X")[0]; - if (type == typeid(float)) { + if (type == framework::proto::VarType::FP32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; - } else if (type == typeid(int64_t)) { + } else if (type == 
framework::proto::VarType::INT64) {
      GetFetchOne(fetch, output);
      output->dtype = PaddleDType::INT64;
    } else {
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 4c5b412a2c1..3d121e04600 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -266,10 +266,10 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs,
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetchs_[idx]->Input("X")[0];
-    if (type == typeid(float)) {
+    if (type == framework::DataTypeTrait::DataType) {
       GetFetchOne(fetch, output);
       output->dtype = PaddleDType::FLOAT32;
-    } else if (type == typeid(int64_t)) {
+    } else if (type == framework::DataTypeTrait::DataType) {
       GetFetchOne(fetch, output);
       output->dtype = PaddleDType::INT64;
     } else {
diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
index 014bdc6a379..191225493c3 100644
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -36,10 +36,10 @@ namespace paddle {
 PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
   PaddleTensor pt;
-  if (t->type() == typeid(int64_t)) {
+  if (t->type() == framework::proto::VarType::INT64) {
     pt.data.Reset(t->data(), t->numel() * sizeof(int64_t));
     pt.dtype = PaddleDType::INT64;
-  } else if (t->type() == typeid(float)) {
+  } else if (t->type() == framework::proto::VarType::FP32) {
     pt.data.Reset(t->data(), t->numel() * sizeof(float));
     pt.dtype = PaddleDType::FLOAT32;
   } else {
diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc
index 6f7da445fc8..1de59a5165c 100644
--- a/paddle/fluid/operators/affine_grid_op.cc
+++ b/paddle/fluid/operators/affine_grid_op.cc
@@ -78,7 +78,7 @@ class AffineGridOp : public framework::OperatorWithKernel {
       library = framework::LibraryType::kCUDNN;
     }
 #endif
-    auto data_type = framework::ToDataType(ctx.Input("Theta")->type());
+    auto data_type = ctx.Input("Theta")->type();
     return framework::OpKernelType(data_type, ctx.GetPlace(),
                                    framework::DataLayout::kAnyLayout, library);
   }
@@ -188,9 +188,9 @@ class AffineGridOpGrad : public framework::OperatorWithKernel {
       library_ = framework::LibraryType::kCUDNN;
     }
 #endif
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Theta")->type()),
-        ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_);
+    return framework::OpKernelType(ctx.Input("Theta")->type(),
+                                   ctx.GetPlace(),
+                                   framework::DataLayout::kAnyLayout, library_);
   }
 };
diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc
index 8174d373585..7fe9a0df746 100644
--- a/paddle/fluid/operators/arg_max_op.cc
+++ b/paddle/fluid/operators/arg_max_op.cc
@@ -28,6 +28,5 @@ REGISTER_OP_CPU_KERNEL(
                                int32_t>,
     paddle::operators::ArgMaxKernel,
-    paddle::operators::ArgMaxKernel,
     paddle::operators::ArgMaxKernel);
diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu
index a147d77a9e9..85e4f981735 100644
--- a/paddle/fluid/operators/arg_max_op.cu
+++ b/paddle/fluid/operators/arg_max_op.cu
@@ -25,7 +25,5 @@ REGISTER_OP_CUDA_KERNEL(
                                int32_t>,
     paddle::operators::ArgMaxKernel,
-    paddle::operators::ArgMaxKernel,
     paddle::operators::ArgMaxKernel);
diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc
index 41f188029f1..23b24735cd0 100644
--- a/paddle/fluid/operators/arg_min_op.cc
+++ b/paddle/fluid/operators/arg_min_op.cc
@@ -28,6 +28,5 @@
REGISTER_OP_CPU_KERNEL( int32_t>, paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu index 4d020508505..47d7c8b1224 100644 --- a/paddle/fluid/operators/arg_min_op.cu +++ b/paddle/fluid/operators/arg_min_op.cu @@ -25,7 +25,5 @@ REGISTER_OP_CUDA_KERNEL( int32_t>, paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 6257e04b010..d942391b864 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -58,7 +58,7 @@ struct ArrayToLoDFunctor : public boost::static_visitor { ArrayToLoDFunctorImpl functor; functor.dev_ctx_ = dev_ctx; functor.prev_functor_ = this; - framework::VisitDataType(framework::ToDataType(out->type()), functor); + framework::VisitDataType(out->type(), functor); } }; @@ -91,7 +91,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { PADDLE_ENFORCE(!x.empty(), "There's no element in the input array."); int rank = x[0].dims().size(); platform::Place place = x[0].place(); - std::type_index data_type = x[0].type(); + auto data_type = x[0].type(); int64_t batch_size = x[0].dims()[0]; framework::DDim ins_dims = rank > 1 ? framework::slice_ddim(x[0].dims(), 1, rank) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 75fc59125f2..b6996be4b09 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -121,9 +121,8 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void AttentionLSTMOpMaker::Make() { diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index f389eab605e..0922b03b5f5 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -103,9 +103,8 @@ class AverageAccumulatesOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("param")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index f66813989c6..8b672e09b2c 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -72,8 +72,7 @@ class BatchNormOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = ctx.Input("X")->type(); // By default, the type of the scale, bias, mean, // and var tensors should both be float. (For float or float16 input tensor) // or double (For double input tensor). 
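The batch_norm hunks above and below encode a dtype policy rather than a shape check: the kernel type follows input X, while the Scale/Bias/Mean/Variance parameters must be FP32 unless X itself is FP64 (an FP16 input still keeps FP32 parameters). A small sketch of that rule; the enum and the error handling below are simplified stand-ins, not the Paddle enforcement macros:

#include <iostream>
#include <stdexcept>

enum class DType { FP16, FP32, FP64 };  // simplified stand-in

// Kernel dtype follows X; parameter dtype must be FP32, or FP64 iff X is FP64.
DType ExpectedBatchNormKernelType(DType x, DType scale, DType bias) {
  DType param = (x == DType::FP64) ? DType::FP64 : DType::FP32;
  if (scale != param || bias != param) {
    throw std::runtime_error("Scale/Bias input should be of float type");
  }
  return x;
}

int main() {
  // An FP16 input still requires FP32 parameters:
  bool ok = ExpectedBatchNormKernelType(DType::FP16, DType::FP32,
                                        DType::FP32) == DType::FP16;
  std::cout << ok << "\n";  // prints: 1
  return 0;
}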
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index f66813989c6..8b672e09b2c 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -72,8 +72,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    auto input_data_type = ctx.Input<Tensor>("X")->type();
     // By default, the type of the scale, bias, mean,
     // and var tensors should both be float. (For float or float16 input tensor)
     // or double (For double input tensor).
@@ -81,17 +80,13 @@ class BatchNormOp : public framework::OperatorWithKernel {
     if (input_data_type == framework::proto::VarType::FP64) {
       bn_param_type = framework::proto::VarType::FP64;
     }
-    PADDLE_ENFORCE_EQ(bn_param_type,
-                      framework::ToDataType(ctx.Input<Tensor>("Scale")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
                       "Scale input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type,
-                      framework::ToDataType(ctx.Input<Tensor>("Bias")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
                       "Bias input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type,
-                      framework::ToDataType(ctx.Input<Tensor>("Mean")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
                       "Mean input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
-                                         ctx.Input<Tensor>("Variance")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
                       "Variance input should be of float type");
 
     // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
@@ -413,9 +408,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     }
 #endif
 
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout, library);
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace(), layout, library);
   }
 };
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 0d32cae0e1e..ae9765b7613 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -145,7 +145,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
     LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
 
     framework::VisitDataType(
-        framework::ToDataType(scores->at(0).type()),
+        scores->at(0).type(),
         BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores,
                                 beam_size, end_id));
   }
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index 62771d09f11..30f700f1d91 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -282,8 +282,7 @@ class BeamSearchOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     framework::OpKernelType kt = framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<framework::LoDTensor>("pre_ids")->type()),
+        ctx.Input<framework::LoDTensor>("pre_ids")->type(),
         platform::CPUPlace());
     return kt;
   }
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
index 9258d7c7e83..f349c51d8a9 100644
--- a/paddle/fluid/operators/bpr_loss_op.cc
+++ b/paddle/fluid/operators/bpr_loss_op.cc
@@ -47,9 +47,8 @@ class BprLossOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
   }
 };
 
@@ -94,9 +93,8 @@ class BprLossGradientOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
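// Context for the VisitDataType call sites above (array_to_lod_tensor,
// beam_search_decode): the entry point now takes the enum directly instead
// of a std::type_index run through ToDataType. A sketch, assuming the era's
// visitor protocol (a templated apply<T>(), per data_type.h); ZeroOutVisitor
// is illustrative, not a functor from this series. Needs <algorithm>.
struct ZeroOutVisitor {
  framework::Tensor* t;
  template <typename T>
  void apply() const {
    T* data = t->data<T>();
    std::fill(data, data + t->numel(), static_cast<T>(0));
  }
};
// usage: framework::VisitDataType(tensor.type(), ZeroOutVisitor{&tensor});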
framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 135254ce6b6..dd28f82b654 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -48,13 +48,12 @@ class ConditionalOp : public framework::OperatorBase { if (!(ips.size() == 1UL && ips[0]->IsInitialized())) { PADDLE_THROW("should have one initialized input as condition"); } - if (!(framework::IsType(ips[0]->type()) && // NOLINT - ips[0]->numel() == 1)) { - PADDLE_THROW( - "condition input's data type should be bool, " - "numel should be 1, actual numel is %d", - ips[0]->numel()); - } + + PADDLE_ENFORCE(ips[0]->type() == framework::proto::VarType::BOOL && + ips[0]->numel() == 1, + "condition input's data type should be bool, " + "numel should be 1, actual numel is %d", + ips[0]->numel()); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 6c1b2f329a5..66f8508f029 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -237,7 +237,7 @@ class WhileGradOp : public framework::OperatorBase { if (var->IsType()) { auto &inside_tensor = var->Get(); framework::AttributeMap attrs; - attrs["dtype"] = framework::ToDataType(inside_tensor.type()); + attrs["dtype"] = inside_tensor.type(); attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); attrs["value"] = 0.0f; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index d7b87662885..183850db18c 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -95,10 +95,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } #endif - auto input_data_type = - framework::ToDataType(ctx.Input("Input")->type()); - auto filter_data_type = - framework::ToDataType(ctx.Input("Filter")->type()); + auto input_data_type = ctx.Input("Input")->type(); + auto filter_data_type = ctx.Input("Filter")->type(); PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, "input and filter data type should be consistent"); @@ -382,9 +380,9 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_, customized_type_value); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_, + customized_type_value); } } // namespace operators diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 2fdfc40d194..86a140f1521 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -104,9 +104,8 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); } void Conv2DTransposeOpMaker::Make() { @@ 
-335,9 +334,8 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); } } // namespace operators diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index c27befe1143..81c9e9e5431 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -118,9 +118,8 @@ class CRFDecodingOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Emission")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Emission")->type(), + platform::CPUPlace()); } }; } // namespace operators diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index a2a871efa85..97d20681b81 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -51,9 +51,8 @@ class CropOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -174,9 +173,7 @@ class CropOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("Out")) - ->type()), + ctx.Input(framework::GradVarName("Out"))->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index a904dd91302..1968e54b006 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -57,9 +57,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel { // is determined by its input "X". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -111,9 +110,8 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { // is determined by its input "X". 
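// Pattern worth noting in the crop_grad hunk above (and in reshape2_grad and
// linear_chain_crf_grad later in this series): when the forward input may be
// uninitialized at grad time, the kernel dtype is keyed on the output
// gradient instead. Excerpt-style sketch of the post-series form:
framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  return framework::OpKernelType(
      ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
      ctx.device_context());
}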
framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc index d2b440d9d2e..e7c472f8c0c 100644 --- a/paddle/fluid/operators/ctc_align_op.cc +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -36,9 +36,8 @@ class CTCAlignOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc index 0c0155a0a97..f2984d1af2f 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cc +++ b/paddle/fluid/operators/detection/anchor_generator_op.cc @@ -53,8 +53,7 @@ class AnchorGeneratorOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index c23b65fe4de..b7da1261a8f 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -45,9 +45,8 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("DistMat")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("DistMat")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc index 1012ba3652d..cacd47ed4a8 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -66,8 +66,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.GetPlace()); + ctx.Input("Input")->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 709c2dfc4b7..2c46803fd00 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -66,9 +66,8 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Anchors")->type()), - ctx.device_context()); + return 
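// The conditional_block rewrite earlier in this series collapses an
// IsType<bool> check plus PADDLE_THROW into one PADDLE_ENFORCE against the
// dtype enum. The equivalent standalone check (cond is an illustrative
// LoDTensor holding the condition):
PADDLE_ENFORCE(cond.type() == framework::proto::VarType::BOOL &&
                   cond.numel() == 1,
               "condition input's data type should be bool, "
               "numel should be 1, actual numel is %d",
               cond.numel());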
framework::OpKernelType(ctx.Input("Anchors")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index 54a4b87ec8f..f70e6adb5b4 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -249,8 +249,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("ClsLoss")->type()), - platform::CPUPlace()); + ctx.Input("ClsLoss")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index f0f8851be0e..2395b181485 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -65,8 +65,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("Scores")->type()), + ctx.Input("Scores")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc index b5cb6a724c0..3e75c0394f9 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cc +++ b/paddle/fluid/operators/detection/prior_box_op.cc @@ -72,8 +72,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 42c720e701f..3796854fe67 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -498,9 +498,8 @@ class ROIPerspectiveTransformOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -519,9 +518,8 @@ class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 46fff9d338b..dc6c3d5a668 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -78,8 +78,7 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const 
framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("Anchor")->type()), + ctx.Input("Anchor")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc index 36700193925..c057c82ce0f 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -57,9 +57,8 @@ class TargetAssignOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index d7f49a9590e..e1d113f8542 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -71,8 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("DetectRes")->type()), + ctx.Input("DetectRes")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 87bf7c6b156..41644d8cc17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -197,8 +197,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = framework::ToDataType( - ctx.Input(framework::GradVarName("Out"))->type()); + auto input_data_type = + ctx.Input(framework::GradVarName("Out"))->type(); #ifdef PADDLE_WITH_MKLDNN if (platform::CanMKLDNNBeUsed(ctx)) { diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 43af83fd693..8aff9111412 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -115,9 +115,8 @@ class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -175,9 +174,8 @@ class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index e80249fc878..1ed8a2ddd1e 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -79,9 +79,8 @@ framework::OpKernelType FCOp::GetExpectedKernelType( library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; } - return 
framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout, library); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout, library); } void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { @@ -111,9 +110,8 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType( library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; } - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout, library); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout, library); } void FCOpMaker::Make() { diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 252f3134402..38cb33e7904 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -59,9 +59,9 @@ class FillConstantOp : public framework::OperatorBase { if (force_cpu) { auto cpu = platform::CPUPlace(); - tensor->mutable_data(cpu, framework::ToTypeIndex(data_type)); + tensor->mutable_data(cpu, data_type); } else { - tensor->mutable_data(dev_place, framework::ToTypeIndex(data_type)); + tensor->mutable_data(dev_place, data_type); } platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc index adc7cb1f9e4..a885b301e77 100644 --- a/paddle/fluid/operators/fill_op.cc +++ b/paddle/fluid/operators/fill_op.cc @@ -55,7 +55,7 @@ class FillOp : public framework::OperatorBase { static_cast(Attr("dtype")); platform::CPUPlace cpu; auto force_cpu = Attr("force_cpu"); - out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype)); + out.mutable_data(force_cpu ? cpu : place, dtype); framework::LoDTensor tensor; @@ -64,7 +64,7 @@ class FillOp : public framework::OperatorBase { } else { // Always make tensor in CPU memory. 
tensor.Resize(out.dims()); - tensor.mutable_data(cpu, framework::ToTypeIndex(dtype)); + tensor.mutable_data(cpu, dtype); } framework::VisitDataType( diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index 3771aac0dfd..0fbf564b7ef 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -135,9 +135,8 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx.Input("X")->type(), ctx.Input("Y")->type(), "The element's type of input should be the same."); - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -324,9 +323,8 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type_index = ctx.Input("Y")->type(); - auto input_data_type = framework::ToDataType(input_data_type_index); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Y")->type(), + ctx.GetPlace()); } }; } // namespace operators diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 1eb6523a2df..f1466f17fec 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -115,8 +115,7 @@ void FusedEmbeddingFCLSTMOp::InferShape( framework::OpKernelType FusedEmbeddingFCLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("Embeddings")->type()), + ctx.Input("Embeddings")->type(), ctx.device_context()); } diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 25b7ae7c282..4ce67e16dd0 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -93,9 +93,8 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionGRUOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void FusionGRUOpMaker::Make() { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 8021a896cea..c4e752e3f0c 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -117,9 +117,8 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void FusionLSTMOpMaker::Make() { diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index 
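// fill_constant and fill above now hand the dtype enum straight to
// Tensor::mutable_data; the ToTypeIndex round-trip is gone. Minimal sketch,
// assuming the era's Tensor and DDim helpers:
framework::LoDTensor t;
t.Resize(framework::make_ddim({2, 3}));
t.mutable_data(platform::CPUPlace(),
               framework::proto::VarType::FP32);  // allocates 2 * 3 floats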
40bba09f3ef..b05329cfd07 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -61,9 +61,8 @@ void FusionSeqConvEltAddReluOp::InferShape( framework::OpKernelType FusionSeqConvEltAddReluOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void FusionSeqConvEltAddReluOpMaker::Make() { diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 17ed9771d07..aaef46de0d3 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -67,9 +67,8 @@ void FusionSeqExpandConcatFCOp::InferShape( framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.MultiInput("X")[0]->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.MultiInput("X")[0]->type(), + ctx.device_context()); } void FusionSeqExpandConcatFCOpMaker::Make() { diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 95aa9b573c7..0a8c0814a7d 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -42,9 +42,8 @@ class GatherOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -60,9 +59,8 @@ class GatherGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index e76eb6893b1..14a2524bd8f 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -63,9 +63,9 @@ class GridSampleOp : public framework::OperatorWithKernel { library_ = framework::LibraryType::kCUDNN; } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - framework::DataLayout::kAnyLayout, library_); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library_); } }; @@ -159,9 +159,9 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { library_ = framework::LibraryType::kCUDNN; } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - framework::DataLayout::kAnyLayout, library_); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library_); } }; diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 6322659b67f..4fa15058f86 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ 
b/paddle/fluid/operators/group_norm_op.cc @@ -141,8 +141,7 @@ class GroupNormGradOp : public framework::OperatorWithKernel { if (t == nullptr) { PADDLE_THROW("can't find Y@GRAD"); } - return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.GetPlace()); + return framework::OpKernelType(t->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 0dbcc442dfa..a8071171157 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -76,9 +76,8 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -163,9 +162,8 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 4d25822259a..93dd3f794f6 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -55,8 +55,8 @@ class InterpolateOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -124,8 +124,8 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index 29b73951bbd..ba50bdf34ba 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -35,8 +35,7 @@ class IsEmptyOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + ctx.Input("X")->type(), platform::CPUPlace()); return kt; } }; diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 7b42efd623b..1312eecfa44 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -40,10 +40,9 @@ class OverflowOp : public framework::OperatorWithKernel { int dtype = -1; auto *x_var = ctx.InputVar("X"); if (x_var->IsType()) { - dtype = framework::ToDataType(x_var->Get().type()); + dtype = x_var->Get().type(); } else if (x_var->IsType()) { - dtype = framework::ToDataType( - x_var->Get().value().type()); + dtype = x_var->Get().value().type(); } else { PADDLE_THROW("Cannot 
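// The isfinite hunk above reads the dtype from whichever variable flavor it
// receives; with the enum, both branches now yield the same value type.
// Sketch (x_var is an illustrative framework::Variable*):
framework::proto::VarType::Type dtype;
if (x_var->IsType<framework::LoDTensor>()) {
  dtype = x_var->Get<framework::LoDTensor>().type();
} else if (x_var->IsType<framework::SelectedRows>()) {
  dtype = x_var->Get<framework::SelectedRows>().value().type();
} else {
  PADDLE_THROW("Cannot find the input data type by all input data");
}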
find the input data type by all input data"); } diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 14ce1da2e97..f83fe355b85 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -153,8 +153,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel { if (t == nullptr) { PADDLE_THROW("can't find Y@GRAD"); } - return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.GetPlace()); + return framework::OpKernelType(t->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index ea1ca7f59db..998b7f09c31 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -184,9 +184,8 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { // is determined by its input "Emission". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Emission")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Emission")->type(), + platform::CPUPlace()); } }; @@ -244,9 +243,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("LogLikelihood")) - ->type()), + ctx.Input(framework::GradVarName("LogLikelihood"))->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 9d1423915af..e28d199eebc 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -69,7 +69,7 @@ class LoadCombineOp : public framework::OperatorBase { // Get data from fin to tensor DeserializeFromStream(*buffer, tensor, dev_ctx); - auto in_dtype = framework::ToDataType(tensor->type()); + auto in_dtype = tensor->type(); auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index df1edc5c2e9..06773d1d0ed 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -65,7 +65,7 @@ class LoadOp : public framework::OperatorBase { DeserializeFromStream(fin, tensor, dev_ctx); auto load_as_fp16 = Attr("load_as_fp16"); - auto in_dtype = framework::ToDataType(tensor->type()); + auto in_dtype = tensor->type(); auto out_dtype = load_as_fp16 ? 
framework::proto::VarType::FP16 : in_dtype; if (in_dtype != out_dtype) { diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index 0d4e84e8508..7c8fe5fbd76 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -39,9 +39,8 @@ class LoDResetOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -144,9 +143,8 @@ class LoDResetGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 145d2db118f..9b91cf52601 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -72,7 +72,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor { LoDTensorToArrayFunctorImpl func; func.prev_functor_ = this; func.dev_ctx_ = dev_ctx; - framework::VisitDataType(framework::ToDataType(input_.type()), func); + framework::VisitDataType(input_.type(), func); } }; diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc index 1b55527fd33..4840a7ac1e7 100644 --- a/paddle/fluid/operators/lookup_sparse_table_op.cc +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -63,8 +63,7 @@ class LookupSparseTableOp : public framework::OperatorBase { out_shape[0] = ids_t.numel(); out_t->Resize(out_shape); out_t->mutable_data(cpu, w_t->value().type()); - PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()), - framework::proto::VarType::FP32, + PADDLE_ENFORCE_EQ(w_t->value().type(), framework::proto::VarType::FP32, "The sparse table only support FP32"); w_t->Get(ids_t, out_t, true, is_test); out_t->set_lod(ids_t.lod()); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index a3bb2be5c7a..06ac31b5f19 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -145,9 +145,8 @@ framework::OpKernelType GetExpectedLRNKernel( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("X")->type(), ctx.GetPlace(), + layout_, library_); } } // namespace diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 3225bf9bb63..4a199d681f3 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -96,8 +96,7 @@ class LSTMOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; @@ -261,8 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const 
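// load and load_combine above (and save / save_combine at the tail of this
// series) share one shape: read the stored dtype, then optionally transcode
// to FP16. Sketch of the shared logic; the conversion step is elided:
auto in_dtype = tensor->type();
auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
  // transform the tensor from in_dtype to out_dtype and swap the result in
}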
framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index e398b51480f..7a62bc9f828 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -113,8 +113,7 @@ class LSTMPOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; @@ -312,8 +311,7 @@ class LSTMPGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 854c8653ff5..e1491a8156c 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -77,16 +77,14 @@ template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { - framework::VisitDataType(framework::ToDataType(tensor->type()), - TensorSetConstantCPU(tensor, value)); + framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); } template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { - framework::VisitDataType(framework::ToDataType(tensor->type()), - TensorSetConstantCPU(tensor, value)); + framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); } struct TensorSetConstantWithPlace : public boost::static_visitor { diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 9372d63f0be..4645b3ae6e6 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -65,7 +65,7 @@ template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { - framework::VisitDataType(framework::ToDataType(tensor->type()), + framework::VisitDataType(tensor->type(), TensorSetConstantGPU(context, tensor, value)); } diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc index a60f245f53e..bb290046f3a 100644 --- a/paddle/fluid/operators/mean_iou_op.cc +++ b/paddle/fluid/operators/mean_iou_op.cc @@ -44,9 +44,8 @@ class MeanIoUOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Predictions")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Predictions")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 820636defad..35b6d7b5e3b 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -61,9 +61,7 @@ class MeanGradOp : public framework::OperatorWithKernel { framework::OpKernelType 
GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); - + auto input_data_type = ctx.Input("X")->type(); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 2dc1467b0d4..da7fa1b81d6 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -63,9 +63,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { platform::Place place = dev_place; int64_t batch_size = in_true.dims()[0] + in_false.dims()[0]; - - std::type_index data_type = - in_true.IsInitialized() ? in_true.type() : in_false.type(); + auto data_type = in_true.IsInitialized() ? in_true.type() : in_false.type(); int rank; framework::DDim in_dims; if (in_true.IsInitialized()) { diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 95aa76bc694..7db6dff2971 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -55,9 +55,8 @@ class AccuracyOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Out")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Out")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 335d4fded4a..5e33dd96064 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -51,9 +51,8 @@ class AucOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Predict")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Predict")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc index 0d733c47dd2..1a67b134914 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.cc +++ b/paddle/fluid/operators/metrics/precision_recall_op.cc @@ -82,9 +82,8 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("MaxProbs")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("MaxProbs")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 18ad46cb5ee..1801f2915e0 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -53,9 +53,8 @@ class MultiplexOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.MultiInput("X")[0]->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.MultiInput("X")[0]->type(), + ctx.device_context()); } }; @@ -123,9 +122,8 @@ class MultiplexGradOp : public 
framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.MultiInput("X")[0]->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.MultiInput("X")[0]->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 9f97f7821dd..06c35c789f8 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -69,9 +69,8 @@ class NCEOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Input")->type(), + platform::CPUPlace()); } }; @@ -214,9 +213,8 @@ class NCEOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Input")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index 9039d02b673..dd365629fcc 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -70,9 +70,8 @@ class AdadeltaOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index e8d5a9e2c87..bd1bb98e638 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -59,9 +59,8 @@ class AdagradOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 5710cda39ac..5eae503461f 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -75,8 +75,7 @@ class AdamOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); + auto input_data_type = ctx.Input("Param")->type(); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index 4b244a76dc0..aef1fc972c0 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc 
@@ -76,9 +76,8 @@ class AdamaxOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 80278441c07..07899278f9e 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -64,9 +64,8 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc index 1c9e91d9b61..c1a4f5790bf 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cc +++ b/paddle/fluid/operators/optimizers/ftrl_op.cc @@ -66,8 +66,7 @@ class FTRLOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); + auto input_data_type = ctx.Input("Param")->type(); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index 7b07b3b7071..9dd9b8afbd4 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -58,9 +58,8 @@ class ProximalAdagradOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index dcef4f7be24..fccfc2b4584 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -46,9 +46,8 @@ class ProximalGDOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index a9da21f4790..6ef2dacb386 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -511,8 +511,8 @@ class Pad2dOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return 
framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -612,8 +612,8 @@ class Pad2dOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 685ebc39379..3f827c26fd4 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -47,9 +47,8 @@ class PadConstantLikeOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Y")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Y")->type(), + ctx.device_context()); } }; @@ -171,9 +170,8 @@ class PadConstantLikeOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Y")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Y")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 52b607df744..62599548496 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -99,9 +99,8 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("X")->type(), ctx.GetPlace(), + layout_, library_); } void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { @@ -130,7 +129,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( } #endif - auto input_data_type = framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = ctx.Input("X")->type(); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, "float16 can only be used when CUDNN is used"); diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 873706593e4..179ee96e01c 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -71,9 +71,8 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -92,9 +91,8 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + 
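// pool_grad above keeps its float16 guard, now as a direct enum comparison.
// The guard in isolation (library_ is the LibraryType chosen earlier in that
// GetExpectedKernelType):
auto input_data_type = ctx.Input<framework::Tensor>("X")->type();
if (input_data_type == framework::proto::VarType::FP16) {
  PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
                    "float16 can only be used when CUDNN is used");
}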
ctx.device_context()); } }; diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index 4d865b7f17b..99256e408d4 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -87,9 +87,8 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Score")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Score")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 64d94ab6044..62c55c4f557 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -56,9 +56,8 @@ class PReluOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -113,9 +112,8 @@ class PReluGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index e7f1caf4d3a..6a5bf170600 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -172,7 +172,7 @@ class TensorPrintOp : public framework::OperatorBase { formater.name = printed_var_name; } if (Attr("print_tensor_type")) { - formater.dtype = printed_tensor.type(); + formater.dtype = framework::ToTypeIndex(printed_tensor.type()); } if (Attr("print_tensor_shape")) { auto &dims = printed_tensor.dims(); diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index 123fa44fa3d..cd3bd32adb4 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -22,9 +22,8 @@ class RandomCropOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc index e17c2ffd39e..f771cebd0cc 100644 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -99,10 +99,10 @@ void BatchReader::ReadNextImpl(std::vector* out) { out->reserve(out_num); for (size_t j = 0; j < out_num; ++j) { // Merge shape and check date type - std::type_index batch_type = buffer_[0][j].type(); + auto batch_type = buffer_[0][j].type(); framework::DDim batch_shape = buffer_[0][j].dims(); for (size_t i = 1; i < buffer_.size(); ++i) { - std::type_index ins_type = buffer_[i][j].type(); + auto ins_type = 
buffer_[i][j].type(); framework::DDim ins_shape = buffer_[i][j].dims(); PADDLE_ENFORCE_EQ(batch_type, ins_type); PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()), diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 162bfcbb084..a1e02a3fd0e 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -414,7 +414,7 @@ class RecurrentGradOp : public RecurrentBase { auto &inside_tensor = cur_scope.FindVar(inside_grad_name) ->Get(); framework::AttributeMap attrs; - attrs["dtype"] = framework::ToDataType(inside_tensor.type()); + attrs["dtype"] = inside_tensor.type(); attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); attrs["value"] = 0.0f; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 500d86fec33..289d848ea18 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -108,9 +108,8 @@ class ReshapeOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -189,9 +188,8 @@ class ReshapeGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -322,9 +320,7 @@ class Reshape2GradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("Out")) - ->type()), + ctx.Input(framework::GradVarName("Out"))->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc index 0fb7776fd9d..834dd1eabd6 100644 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -99,7 +99,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { auto &in_var_tensor = in_var->Get(); framework::AttributeMap attrs; - attrs["dtype"] = framework::ToDataType(in_var_tensor.type()); + attrs["dtype"] = in_var_tensor.type(); attrs["shape"] = framework::vectorize2int(in_var_tensor.dims()); attrs["value"] = 0.0f; diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 79f189222ef..6857b5ed9db 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -62,9 +62,8 @@ class ROIAlignOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -83,9 +82,8 @@ class ROIAlignGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return 
framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 3f6b2e46c70..e46d92d6fc3 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -69,9 +69,8 @@ class ROIPoolOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -90,9 +89,8 @@ class ROIPoolGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 5b05f757c03..a0b9fa305d8 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -75,7 +75,7 @@ class SaveCombineOp : public framework::OperatorBase { // Serialize tensors one by one // Check types to see if a fp16 transformation is required - auto in_dtype = framework::ToDataType(tensor.type()); + auto in_dtype = tensor.type(); auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index e79cffcf498..e1c9fd8ff1f 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -85,7 +85,7 @@ class SaveOp : public framework::OperatorBase { filename); auto save_as_fp16 = Attr("save_as_fp16"); - auto in_dtype = framework::ToDataType(tensor.type()); + auto in_dtype = tensor.type(); auto out_dtype = save_as_fp16 ? 
framework::proto::VarType::FP16 : in_dtype; if (in_dtype != out_dtype) { diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index c32d2603cf7..ad418d51bcd 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -51,9 +51,8 @@ class ScatterOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -70,9 +69,8 @@ class ScatterGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 44b09bf7c2c..1754221e771 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -114,9 +114,8 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index c49d1ccb184..8267c04f9f2 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -112,9 +112,8 @@ class SequenceScatterOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; @@ -131,9 +130,8 @@ class SequenceScatterGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc index 6f84023e26d..35f49f78ced 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc @@ -50,9 +50,8 @@ class SequenceSliceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -71,9 +70,8 @@ class 
SequenceSliceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 644a5bebc18..027073e5d7d 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -51,7 +51,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { } std::string data_format = ctx.Attr("data_format"); return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + ctx.Input("X")->type(), ctx.GetPlace(), framework::StringToDataLayout(data_format), library_); } }; @@ -146,7 +146,7 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { } std::string data_format = ctx.Attr("data_format"); return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + ctx.Input("X")->type(), ctx.GetPlace(), framework::StringToDataLayout(data_format), library_); } }; diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc index 9612f82b6d4..21871d76569 100644 --- a/paddle/fluid/operators/similarity_focus_op.cc +++ b/paddle/fluid/operators/similarity_focus_op.cc @@ -70,9 +70,8 @@ class SimilarityFocusOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index e55462d6cfe..789e61b2d33 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -59,9 +59,8 @@ class SliceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 091ce4e6e8e..bc889a5a042 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -62,8 +62,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { } #endif - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = ctx.Input("X")->type(); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "float16 can only be used on GPU place"); @@ -169,8 +168,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { layout_ = framework::DataLayout::kMKLDNN; } #endif - auto input_data_type = framework::ToDataType( - ctx.Input(framework::GradVarName("Out"))->type()); + auto input_data_type = + ctx.Input(framework::GradVarName("Out"))->type(); if (input_data_type == framework::proto::VarType::FP16) { 
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "float16 can only be used on GPU place"); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 2900221485e..0397c7791e1 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -131,9 +131,8 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Logits")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Logits")->type(), + ctx.device_context()); } }; @@ -173,8 +172,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("Loss"))->type()), + ctx.Input(framework::GradVarName("Loss"))->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 7df14158f34..4f717a43551 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -91,9 +91,9 @@ class SumOp : public framework::OperatorWithKernel { continue; } if (dtype == -1) { - dtype = framework::ToDataType(tensor->type()); + dtype = tensor->type(); } else { - PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type())); + PADDLE_ENFORCE_EQ(dtype, tensor->type()); } } PADDLE_ENFORCE_NE(dtype, -1, @@ -106,8 +106,8 @@ class SumOp : public framework::OperatorWithKernel { for (auto& var : x_vars) { auto& value = var->Get().value(); if (value.IsInitialized()) { - return framework::OpKernelType(framework::ToDataType(value.type()), - ctx.device_context(), layout, library); + return framework::OpKernelType(value.type(), ctx.device_context(), + layout, library); } } // if input sparse vars are not initialized, use an default kernel type. 
@@ -118,9 +118,8 @@ class SumOp : public framework::OperatorWithKernel { auto& array = x_var->Get(); for (auto& each : array) { if (each.numel() != 0) { - return framework::OpKernelType(framework::ToDataType(each.type()), - ctx.device_context(), layout, - library); + return framework::OpKernelType(each.type(), ctx.device_context(), + layout, library); } } } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 6eef4c98c48..5b2aad55a4e 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -76,10 +76,7 @@ class TensorRTEngineOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input0 = ctx.Inputs("Xs").front(); framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType(ctx.scope() - .FindVar(input0) - ->GetMutable() - ->type()), + ctx.scope().FindVar(input0)->GetMutable()->type(), ctx.GetPlace()); return kt; } diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index bbd71db6062..bc1f59bc1a7 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -144,9 +144,8 @@ class Transpose2Op : public TransposeOp { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -194,9 +193,7 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("Out")) - ->type()), + ctx.Input(framework::GradVarName("Out"))->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 6d2ccb38f67..11e505d6df3 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -74,9 +74,8 @@ class UnpoolOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } public: @@ -113,9 +112,8 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } public: diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index 6a257cebf52..e2ae7caae1e 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -56,9 +56,8 @@ class WarpCTCOp : public framework::OperatorWithKernel { } #endif framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Logits")->type()), - ctx.device_context(), layout_, library_); + return 
framework::OpKernelType(ctx.Input("Logits")->type(), + ctx.device_context(), layout_, library_); } }; @@ -136,9 +135,8 @@ class WarpCTCGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Logits")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Logits")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index e7597f73243..60508f7ab87 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -64,9 +64,8 @@ class Yolov3LossOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; @@ -180,9 +179,8 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 7c539d25f6d..cbb090adefd 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -20,6 +20,7 @@ #include // NOLINT #include #include +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" @@ -28,14 +29,14 @@ namespace paddle { namespace platform { -inline ncclDataType_t ToNCCLDataType(std::type_index type) { - if (type == typeid(float)) { // NOLINT +inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { + if (type == framework::proto::VarType::FP32) { return ncclFloat; - } else if (type == typeid(double)) { // NOLINT + } else if (type == framework::proto::VarType::FP64) { return ncclDouble; - } else if (type == typeid(int)) { // NOLINT + } else if (type == framework::proto::VarType::INT32) { return ncclInt; - } else if (type == typeid(int64_t)) { // NOLINT + } else if (type == framework::proto::VarType::INT64) { return ncclInt64; } else { PADDLE_THROW("Not supported"); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index dca0c01ab22..314ab986258 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -206,7 +206,7 @@ PYBIND11_MODULE(core, m) { .def("_get_float_element", TensorGetElement) .def("_set_double_element", TensorSetElement) .def("_get_double_element", TensorGetElement) - .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); }); + .def("_dtype", [](Tensor &self) { return self.type(); }); py::class_(m, "LoDTensor", R"DOC( LoDTensor is a Tensor with optional LoD information. 
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index f67f40f19f6..5e91f5b3018 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -43,7 +43,7 @@ template struct CastToPyBufferImpl { using CUR_TYPE = typename std::tuple_element>::type; pybind11::buffer_info operator()(const framework::Tensor &tensor) { - if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) { + if (framework::DataTypeTrait::DataType == tensor.type()) { auto dim_vec = framework::vectorize(tensor.dims()); std::vector dims_outside; std::vector strides; -- GitLab From 06f8aa5b97be564b878848acd216069e23081300 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 03:08:21 +0000 Subject: [PATCH 0146/2367] remove while_op support temporarily test=develop --- paddle/fluid/framework/executor.cc | 3 +- .../fluid/operators/controlflow/while_op.cc | 46 +------------------ 2 files changed, 3 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 767bbb524f4..7eab8760159 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -419,7 +419,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; - if (max_memory_size >= 0) { + // skip while_op and while_grad_op temporarily + if (max_memory_size >= 0 && !keep_kids) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 06920a47ee0..5ab0918c486 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -365,51 +365,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. 
while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ - std::unordered_set bwd_skip_vars; - if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set fwd_skip_vars; - for (auto *op_desc : grad_block->AllOps()) { - auto skippable = [&](const std::string &name) { - return !grad_block->HasVar(name) && - (fwd_block->HasVarRecursive(name) || - parent_block->HasVarRecursive(name)); - }; - for (auto &in_arg_name : op_desc->InputArgumentNames()) { - if (skippable(in_arg_name)) { - fwd_skip_vars.insert(in_arg_name); - } - } - - for (auto &out_arg_name : op_desc->OutputArgumentNames()) { - if (skippable(out_arg_name)) { - fwd_skip_vars.insert(out_arg_name); - } - } - } - - if (!fwd_skip_vars.empty()) { - // FIXME(zjl): ugly const_cast here, maybe we should find a better way - // to modify forward while_op - auto &fwd_while_op = const_cast(ForwardOp()); - fwd_while_op.SetAttr(kSkipEagerDeletionVars, - std::vector(fwd_skip_vars.begin(), - fwd_skip_vars.end())); - } - - // Find backward skip vars - auto fwd_input = Input(kX); - for (size_t i = 0; i < igs.size(); ++i) { - if (igs[i] == framework::kEmptyVarName) { - continue; - } - bwd_skip_vars.insert(igs[i]); - bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); - } - } - while_grad->SetAttr( - kSkipEagerDeletionVars, - std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end())); + while_grad->SetAttr(kSkipEagerDeletionVars, std::vector()); return std::unique_ptr(while_grad); } -- GitLab From e240ba291853856d29790ecd3b6c5493c5ab2cd3 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 03:16:34 +0000 Subject: [PATCH 0147/2367] implement backward test=develop --- paddle/fluid/framework/op_desc.cc | 2 + paddle/fluid/framework/op_desc.h | 2 + paddle/fluid/framework/operator.cc | 5 + paddle/fluid/framework/shape_inference.h | 5 + paddle/fluid/operators/py_func_op.cc | 127 ++++++++++++++++++++--- paddle/fluid/pybind/protobuf.cc | 2 +- python/paddle/fluid/layers/nn.py | 39 ++++--- 7 files changed, 154 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index e8ecd905029..f8a9340df57 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -34,6 +34,8 @@ class CompileTimeInferShapeContext : public InferShapeContext { public: CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block); + InferShapeOpPtr GetOp() const override { return &op_; } + bool HasInput(const std::string &name) const override; bool HasOutput(const std::string &name) const override; diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 30c8a26c3d2..3b3f50bfa7f 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -121,6 +121,8 @@ class OpDesc { BlockDesc *Block() { return this->block_; } + const BlockDesc *Block() const { return this->block_; } + private: template static std::vector MapKeys(const MapType &map) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f7..188ab120be8 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -481,6 +481,8 @@ class RuntimeInferShapeContext : public InferShapeContext { RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) : op_(op), scope_(scope) {} + InferShapeOpPtr GetOp() const override { return &op_; } + bool HasInput(const std::string& name) const override { // has only one input 
const auto& ins = op_.Inputs(); @@ -879,6 +881,9 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { + PADDLE_ENFORCE(t->IsInitialized(), + "Input %s(%s) does not exist in Operator %s", + input.first, ipt_name, DebugString()); int tmp = static_cast(ToDataType(t->type())); PADDLE_ENFORCE( tmp == data_type || data_type == -1, diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index d73cca121e4..2f95ab353ee 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -25,7 +25,10 @@ limitations under the License. */ namespace paddle { namespace framework { +class OperatorBase; + using InferShapeVarPtr = boost::variant; +using InferShapeOpPtr = boost::variant; class InferShapeContext { public: @@ -38,6 +41,8 @@ class InferShapeContext { std::vector GetOutputsVarType( const std::string &name) const; + virtual InferShapeOpPtr GetOp() const = 0; + virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0; diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 86914f30604..46a6125f974 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -24,34 +24,34 @@ namespace operators { namespace py = pybind11; -static std::mutex g_py_callables_mtx; static std::vector g_py_callables; size_t AppendPythonCallableObjectAndReturnId(py::object py_obj) { - std::lock_guard guard(g_py_callables_mtx); g_py_callables.emplace_back(py_obj); return g_py_callables.size() - 1; } static py::object *GetPythonCallableObject(size_t i) { - std::lock_guard guard(g_py_callables_mtx); PADDLE_ENFORCE_LT(i, g_py_callables.size()); return &g_py_callables[i]; } -void DoCallPythonFunc(py::object *callable, const std::string &func_token, - const std::vector &ins, - std::vector *out) { +void CallPythonFunc(py::object *callable, const std::string &func_token, + const std::vector &ins, + std::vector *out) { py::gil_scoped_acquire guard{}; py::tuple in_args(ins.size()); for (size_t i = 0; i < ins.size(); ++i) { - in_args[i] = py::cast(ins[i]); + in_args[i] = ins[i].IsInitialized() ? 
py::cast(ins[i]) : py::cast(nullptr); } auto ret = (*callable)(func_token, *in_args); auto ret_tuple = py::cast(ret); PADDLE_ENFORCE_EQ(py::len(ret_tuple), out->size(), "Output number not match"); for (size_t i = 0; i < out->size(); ++i) { + if ((*out)[i] == nullptr) { + continue; + } try { auto *out_tensor = py::cast(ret_tuple[i]); PADDLE_ENFORCE_NOT_NULL(out_tensor, @@ -67,8 +67,43 @@ void DoCallPythonFunc(py::object *callable, const std::string &func_token, class PyFuncOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(!ctx->IsRuntime(), + "Infer shape cannot be called in runtime."); PADDLE_ENFORCE(ctx->HasInputs("X"), "Input(X) must exist"); PADDLE_ENFORCE(ctx->HasOutputs("Out"), "Output(Out) must exist"); + + auto *op = boost::get(ctx->GetOp()); + auto *block = op->Block(); + // No need to infer shape in forward part + if (block->ForwardBlockID() < 0) { + return; + } + + PADDLE_ENFORCE(!ctx->Attrs().Get("token").empty(), + "Function token cannot be empty"); + + const std::string kGradVarSuffix = framework::kGradVarSuffix; + auto out_vars = ctx->GetOutputVarPtrs("Out"); + for (auto &out_var : out_vars) { + auto *out_var_desc = boost::get(out_var); + auto out_name = out_var_desc->Name(); + if (out_name == framework::kEmptyVarName || + out_name.size() < kGradVarSuffix.size()) { + continue; + } + + size_t len = out_name.size() - kGradVarSuffix.size(); + if (out_name.substr(len) == kGradVarSuffix) { + auto fwd_var_name = out_name.substr(0, len); + auto *in_var_desc = block->FindVarRecursive(fwd_var_name); + PADDLE_ENFORCE_NOT_NULL(in_var_desc, "Forward variable %s not found", + fwd_var_name); + out_var_desc->SetShape(in_var_desc->GetShape()); + out_var_desc->SetDataType(in_var_desc->GetDataType()); + out_var_desc->SetLoDLevel(in_var_desc->GetLoDLevel()); + out_var_desc->SetType(in_var_desc->GetType()); + } + } } }; @@ -77,12 +112,68 @@ class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Inputs of py_func op.").AsDuplicable(); AddOutput("Out", "Outputs of py_func op").AsDuplicable(); - AddAttr("token", "function token"); - AddAttr("handle_idx", "handle index").SetDefault(0); + AddAttr("handle_idx", "Index of the registered py_func handle") + .SetDefault(0); + AddAttr("token", "Token of function token to be called") + .SetDefault(""); + AddAttr("backward_token", + "Token of backward function to be called") + .SetDefault(""); AddComment(R"DOC("PyFunc Op")DOC"); } }; +class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { + public: + using framework::GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + auto &fwd_attrs = Attrs(); + if (fwd_attrs.at("backward_token").empty()) { + return {}; + } + + std::unique_ptr grad_op(new framework::OpDesc()); + grad_op->SetType("py_func"); + + framework::AttributeMap bwd_attrs; + bwd_attrs["token"] = fwd_attrs.at("backward_token"); + bwd_attrs["backward_token"] = std::string(""); + grad_op->SetAttrMap(bwd_attrs); + + auto bwd_in = Input("X"); + auto fwd_out = Output("Out"); + auto fwd_out_grad = OutputGrad("Out"); + bwd_in.insert(bwd_in.end(), fwd_out.begin(), fwd_out.end()); + bwd_in.insert(bwd_in.end(), fwd_out_grad.begin(), fwd_out_grad.end()); + + auto bwd_out = InputGrad("X", false); + + if (VLOG_IS_ON(10)) { + std::string in_str = "PyFunc Grad Input: "; + for (auto &in : bwd_in) { + in_str += in; + in_str += " "; + } + VLOG(10) << in_str; + + std::string 
out_str = "PyFunc Grad Output: "; + for (auto &out : bwd_out) { + out_str += out; + out += " "; + } + VLOG(10) << out_str; + } + + grad_op->SetInput("X", bwd_in); + grad_op->SetOutput("Out", InputGrad("X", false)); + + std::vector> ret(1); + ret[0] = std::move(grad_op); + return ret; + } +}; + class PyFuncOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; @@ -95,8 +186,14 @@ class PyFuncOp : public framework::OperatorBase { std::vector inputs(in_arg_names.size()); for (size_t i = 0; i < in_arg_names.size(); ++i) { - auto &in_tensor = - scope.FindVar(in_arg_names[i])->Get(); + auto in_var = scope.FindVar(in_arg_names[i]); + if (in_var == nullptr) { + continue; + } + auto &in_tensor = in_var->Get(); + if (!in_tensor.IsInitialized()) { + continue; + } if (platform::is_gpu_place(in_tensor.place())) { framework::TensorCopySync(in_tensor, platform::CPUPlace(), &inputs[i]); } else { @@ -107,8 +204,9 @@ class PyFuncOp : public framework::OperatorBase { std::vector outputs(out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) { + auto *out_var = scope.FindVar(out_arg_names[i]); auto *out_tensor = - scope.FindVar(out_arg_names[i])->GetMutable(); + out_var ? out_var->GetMutable() : nullptr; outputs[i] = out_tensor; } @@ -117,7 +215,7 @@ class PyFuncOp : public framework::OperatorBase { auto *py_callable = GetPythonCallableObject(handle_idx); VLOG(10) << "Call py_func_op with token " << token << ", and handle_idx " << handle_idx; - DoCallPythonFunc(py_callable, token, inputs, &outputs); + CallPythonFunc(py_callable, token, inputs, &outputs); } }; @@ -127,5 +225,4 @@ class PyFuncOp : public framework::OperatorBase { namespace ops = paddle::operators; REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker, - ops::PyFuncOpShapeInference, - paddle::framework::EmptyGradOpMaker); + ops::PyFuncOpShapeInference, ops::PyFuncOpGradDescMaker); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index ac406b27b5c..4b218fb3a2a 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -328,7 +328,7 @@ void BindOpDesc(pybind11::module *m) { .def("infer_var_type", &pd::OpDesc::InferVarType) .def("set_is_target", &pd::OpDesc::SetIsTarget) .def("serialize_to_string", SerializeMessage) - .def("block", &pd::OpDesc::Block, + .def("block", [](pd::OpDesc &self) { return self.Block(); }, pybind11::return_value_policy::reference); } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 92cd53a6c36..66c98c935d7 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9096,12 +9096,9 @@ def py_func(func, x, out, backward_func=None): _main_program_to_register = dict() @classmethod - def get_instance(cls, prog=None): - if prog is None: - prog = fluid.default_main_program() - + def get_instance(cls, prog): if not isinstance(prog, Program): - raise ValueError("prog must be None or type of Program") + raise TypeError("prog must be type of Program") ret = cls._main_program_to_register.get(prog, None) if ret is None: @@ -9155,6 +9152,10 @@ def py_func(func, x, out, backward_func=None): ret = [] for i in six.moves.range(len(ret0)): + if ret0[i] is None: + ret.append(None) + continue + if isinstance(ret0[i], core.LoDTensor): ret.append(ret0[i]) continue @@ -9175,20 +9176,34 @@ def py_func(func, x, out, backward_func=None): x = [x] if isinstance(out, Variable): - out = [out] + out_list = [out] + else: + out_list = out + + if func is None or not 
hasattr(func, '__call__'): + raise TypeError('Input func must be a function') - for each_out in out: + if backward_func is not None and not hasattr(backward_func, '__call__'): + raise TypeError('Input backward_func must be a function') + + for each_out in out_list: if len(each_out.shape) == 0: raise ValueError( - 'users should infer shapes of outputs of py_func op manually') + 'Output shapes of py_func op should be provided by users manually' + ) py_func_reg = PyFuncRegister.get_instance(helper.main_program) - token = py_func_reg.unique_token(func) + forward_token = py_func_reg.unique_token(func) + backward_token = py_func_reg.unique_token( + backward_func) if backward_func is not None else '' helper.append_op( type='py_func', inputs={'X': x}, - outputs={'Out': out}, - attrs={'handle_idx': py_func_reg.handle_idx, - 'token': token}) + outputs={'Out': out_list}, + attrs={ + 'handle_idx': py_func_reg.handle_idx, + 'token': forward_token, + 'backward_token': backward_token + }) return out -- GitLab From c00e07cda02ce611f0c10ed9cbc64f9a59f42f73 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 12 Dec 2018 14:58:51 +0800 Subject: [PATCH 0148/2367] Fix distribute compile test=develop --- .../fluid/framework/details/reduce_op_handle.cc | 10 +++++----- paddle/fluid/operators/distributed/grpc_serde.cc | 3 +-- .../operators/distributed/sendrecvop_utils.cc | 6 ++---- .../operators/distributed/sendrecvop_utils.h | 13 +++++++------ .../operators/distributed/variable_response.cc | 15 +++++++-------- .../operators/distributed_ops/merge_ids_op.cc | 4 +--- .../distributed_ops/ref_by_trainer_id_op.cc | 4 +--- 7 files changed, 24 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 85d8abc9100..7a5f7de57ef 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -218,18 +218,18 @@ void ReduceOpHandle::RunImpl() { } #if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE - if (framework::IsType(in_selected_rows[0]->value().type())) { + if (in_selected_rows[0]->value().type() == + framework::proto::VarType::FP32) { GatherSelectedRows( in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, out_var->GetMutable()); - } else if (framework::IsType( - in_selected_rows[0]->value().type())) { + } else if (in_selected_rows[0]->value().type() == + framework::proto::VarType::FP64) { GatherSelectedRows( in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, out_var->GetMutable()); } else { - PADDLE_ENFORCE(false, - "only support double or float when gahter SelectedRows"); + PADDLE_THROW("only support double or float when gather SelectedRows"); } #endif }); diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 31fac2133cf..94bf0a113be 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -122,8 +122,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); ProtoEncodeHelper e2(static_cast(buf), 128); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); slices[2] = ::grpc::Slice(e2.size()); memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); diff 
--git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 6ba883ba01f..5fd42e884ab 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -61,8 +61,7 @@ TensorPayload GetTensorPayload(framework::Variable* var, auto tensor = var->Get(); // FIXME(wuyi): data types in send_recv.proto is copied from // framework.proto - request->set_data_type( - static_cast(framework::ToDataType(tensor.type()))); + request->set_data_type(static_cast(tensor.type())); for (auto& dim : framework::vectorize(tensor.dims())) { request->add_dims(dim); } @@ -83,8 +82,7 @@ TensorPayload GetSelectedRowsPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request) { auto* slr = var->GetMutable(); - request->set_data_type( - static_cast(framework::ToDataType(slr->value().type()))); + request->set_data_type(static_cast(slr->value().type())); request->set_lod_level(0); request->set_slr_height(slr->height()); diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 523e56fe3e4..710c8391660 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -58,18 +58,19 @@ TensorPayload GetSelectedRowsPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request); -inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { +inline framework::proto::VarType::Type ToVarType( + sendrecv::VariableMessage::Type type) { switch (type) { case sendrecv::VariableMessage::FP32: - return typeid(float); // NOLINT + return framework::proto::VarType::FP32; // NOLINT case sendrecv::VariableMessage::FP64: - return typeid(double); // NOLINT + return framework::proto::VarType::FP64; // NOLINT case sendrecv::VariableMessage::INT32: - return typeid(int); // NOLINT + return framework::proto::VarType::INT32; // NOLINT case sendrecv::VariableMessage::INT64: - return typeid(int64_t); // NOLINT + return framework::proto::VarType::INT64; // NOLINT case sendrecv::VariableMessage::BOOL: - return typeid(bool); // NOLINT + return framework::proto::VarType::BOOL; // NOLINT default: PADDLE_THROW("Not support type %d", type); } diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 5b2be04e6a1..921c96b5839 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -114,7 +114,7 @@ bool VariableResponse::CopyLodTensorData( tensor->set_lod(lod); void* tensor_data = - tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type())); + tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type())); VLOG(6) << "Tensor.memory_size = " << tensor->memory_size() << ", Buffer Size = " << length; @@ -139,13 +139,13 @@ bool VariableResponse::CopySelectRowsTensorData( slr->set_height(meta_.slr_height()); auto* tensor = slr->mutable_value(); tensor->Resize(dims); - PADDLE_ENFORCE_EQ(static_cast(tensor->numel()), - length / framework::SizeOfType( - paddle::operators::distributed::ToTypeIndex( - meta_.data_type()))); + PADDLE_ENFORCE_EQ( + static_cast(tensor->numel()), + length / framework::SizeOfType(paddle::operators::distributed::ToVarType( + meta_.data_type()))); void* tensor_data = tensor->mutable_data( ctx.GetPlace(), - 
paddle::operators::distributed::ToTypeIndex(meta_.data_type())); + paddle::operators::distributed::ToVarType(meta_.data_type())); if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { return false; @@ -159,8 +159,7 @@ bool VariableResponse::CopySelectRowsData( const platform::DeviceContext& ctx, int length) { auto* slr = GetVar()->GetMutable(); slr->mutable_rows()->clear(); - slr->mutable_rows()->resize(length / - framework::SizeOfType(typeid(int64_t))); // int64 + slr->mutable_rows()->resize(length / sizeof(int64_t)); // int64 int64_t* rows_data = slr->mutable_rows()->data(); // copy rows CPU data, GPU data will be copied lazily. diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc index 252a63cb605..da0185b8c49 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc @@ -108,9 +108,7 @@ class MergeIdsOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.MultiInput("X").front()->type()), - ctx.GetPlace()); + ctx.MultiInput("X").front()->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc index 98b0af7688b..7e16e6ff66b 100644 --- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc +++ b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc @@ -42,9 +42,7 @@ class RefByTrainerIdOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.MultiInput("X")[0]->type()), - ctx.GetPlace()); + ctx.MultiInput("X")[0]->type(), ctx.GetPlace()); } }; -- GitLab From 6951ef9a55768c8e923623431247825a40bd522a Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 12 Dec 2018 15:25:07 +0800 Subject: [PATCH 0149/2367] Fix the gelu backward to avoid nan (#14857) * Fix the gelu backward to avoid nan test=develop * Remove unnecessary calls test=develop --- paddle/fluid/operators/activation_op.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 87d549678a0..c7df3ea58a9 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -301,23 +301,22 @@ template struct GeluFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { - auto temp = - ((x * static_cast(M_SQRT1_2)).erf()).template cast().eval(); + auto temp = (x * static_cast(M_SQRT1_2)).erf(); out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); } }; template struct GeluGradFunctor : BaseActivationFunctor { - bool Inplace() const { return IsInplace("gelu"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp = (static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * - ((-static_cast(0.5) * x.square()).exp())) - .template cast() - .eval(); - dx.device(d) = dout * (out / x + temp); + auto first = static_cast(0.5) * + (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); + + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * + (-static_cast(0.5) * x.square()).exp(); + dx.device(d) = dout * (first + second); } }; -- GitLab 
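[Editor's note, not part of the patch series] The gelu fix above deserves a gloss: the old backward recovered the first derivative term as out / x, and since out = 0.5 * x * (1 + erf(x / sqrt(2))), that quotient is 0/0 at x == 0 and yields NaN even though the analytic derivative gelu'(0) = 0.5 is finite. The patch replaces out / x with its closed form 0.5 * (1 + erf(x / sqrt(2))), so no division occurs. Below is a minimal standalone sketch of the two formulas; the function names gelu_grad_old/gelu_grad_new are ours, not Paddle's, and it assumes the POSIX math constants M_SQRT1_2 and M_2_SQRTPI from <cmath>:

#include <cmath>
#include <cstdio>

// Old formulation: reconstructs the first term as out / x, which is 0/0 at x == 0.
double gelu_grad_old(double x) {
  double out = 0.5 * x * (1.0 + std::erf(x * M_SQRT1_2));
  double second = 0.5 * M_2_SQRTPI * M_SQRT1_2 * x * std::exp(-0.5 * x * x);
  return out / x + second;
}

// Patched formulation: the first term is written out analytically, no division.
double gelu_grad_new(double x) {
  double first = 0.5 * (1.0 + std::erf(x * M_SQRT1_2));
  double second = 0.5 * M_2_SQRTPI * M_SQRT1_2 * x * std::exp(-0.5 * x * x);
  return first + second;
}

int main() {
  // The old form prints nan (the sign of the NaN may vary); the new form prints 0.500000.
  std::printf("old: %f, new: %f\n", gelu_grad_old(0.0), gelu_grad_new(0.0));
  return 0;
}

Note that 0.5 * M_2_SQRTPI * M_SQRT1_2 equals 1 / sqrt(2 * pi), so the second term is x times the standard normal pdf; the whole expression matches d/dx [x * Phi(x)] = Phi(x) + x * phi(x) for the erf-based gelu.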
From faffc25c19cdc9504214a4c0c85aa131a44079de Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 12 Dec 2018 15:28:27 +0800 Subject: [PATCH 0150/2367] fix hadoop home bug & refine setup.py --- python/paddle/fluid/async_executor.py | 5 +---- python/setup.py.in | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 2a6a11805e4..b077e1be7e4 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -153,13 +153,10 @@ class AsyncExecutor(object): data_feed.desc(), filelist, thread_num, fetch_var_names, mode, debug) - def download_data(self, afs_path, local_path, fs_default_name, ugi, process_num=12): - #hadoop_home = "$HADOOP_HOME" + def download_data(self, afs_path, local_path, fs_default_name, ugi, hadoop_home="$HADOOP_HOME", process_num=12): if self.instance is None: raise ValueError('instance is None, please run config_distributed_nodes init instance') - hadoop_home = "~/tools/hadoop-xingtian/hadoop/" - configs = { "fs.default.name": fs_default_name, "hadoop.job.ugi": ugi diff --git a/python/setup.py.in b/python/setup.py.in index 200b96ec54e..9418804be20 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -103,8 +103,10 @@ packages=['paddle', 'paddle.fluid', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', + 'paddle.fluid.distributed', 'paddle.fluid.layers', 'paddle.fluid.contrib', + 'paddle.fluid.contrib.utils', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', 'paddle.fluid.transpiler', -- GitLab From 194ce2e92cf2e77c0f3e544c2b61204044d0af86 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 05:48:02 +0000 Subject: [PATCH 0151/2367] add benchmark --- paddle/fluid/operators/jit/CMakeLists.txt | 3 +- paddle/fluid/operators/jit/benchmark.cc | 152 ++++++++++++++++++++++ paddle/fluid/operators/jit/gen/jitcode.h | 4 - paddle/fluid/operators/jit/helper.h | 4 + paddle/fluid/operators/jit/test.cc | 9 -- 5 files changed, 158 insertions(+), 14 deletions(-) create mode 100644 paddle/fluid/operators/jit/benchmark.cc diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 26903e0e44e..0f213c5898f 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -8,7 +8,7 @@ file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -list(REMOVE_ITEM jit_kernel_cc_srcs test.cc) +list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) # refer must go first @@ -20,3 +20,4 @@ endif() cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) +cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc new file mode 100644 index 00000000000..5a276172c39 --- /dev/null +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+// #include <cstring>  // for memcpy
+// #include <string>
+#include <random>
+#include <vector>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/kernels.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/port.h"
+
+DEFINE_int32(burning, 10, "Burning times.");
+DEFINE_int32(repeat, 3000, "Repeat times.");
+DEFINE_int32(max_size, 1000, "The Max size would be tested.");
+
+inline double GetCurrentUS() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+6 * time.tv_sec + time.tv_usec;
+}
+
+template <typename T>
+void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
+               const T upper = static_cast<T>(20.f)) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+  for (int i = 0; i < n; ++i) {
+    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+  }
+}
+
+std::vector<int> TestSizes() {
+  std::vector<int> s;
+  for (int i = 1; i <= FLAGS_max_size; ++i) {
+    s.push_back(i);
+  }
+  return s;
+}
+
+// return the average time (us) of this function
+template <typename T, typename Func>
+double BenchTartgetFunc(const Func tgt, const std::vector<T>& x,
+                        const std::vector<T>& y, std::vector<T>& z) {  // NOLINT
+  const T* x_data = x.data();
+  const T* y_data = y.data();
+  const int d = z.size();
+  T* z_data = z.data();
+
+  for (int i = 0; i < FLAGS_burning; ++i) {
+    tgt(x_data, y_data, z_data, d);
+  }
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeat; ++i) {
+    tgt(x_data, y_data, z_data, d);
+  }
+  auto end = GetCurrentUS();
+  return (end - start) / FLAGS_repeat;
+}
+
+// Benchmark all jit kernels including jitcode, mkl and refer.
+// To use this tool, run command: ./benchmark [options...]
+// Options:
+//   --burning: the burning time before count
+//   --repeat: the repeat times
+//   --max_size: the max size would be tested
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  google::InitGoogleLogging(argv[0]);
+  using T = float;
+  using PlaceType = paddle::platform::CPUPlace;
+  namespace jit = paddle::operators::jit;
+  const auto KT = jit::vmul;
+  LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat
+            << " times.";
+  for (int d : TestSizes()) {
+    // for (kernels type) {  // TODO(TJ): more jit::KernelType
+    std::vector<std::pair<std::string, double>> infos;
+    std::vector<T> x(d), y(d), z(d);
+    RandomVec(d, x.data());
+    RandomVec(d, y.data());
+    // refer
+    auto refer = jit::GetRefer<KT, jit::VMulTuples<T>::func_type,
+                               jit::VMulTuples<T>::attr_type>();
+    if (refer) {
+      auto res =
+          BenchTartgetFunc<T, jit::VMulTuples<T>::func_type>(refer, x, y, z);
+      infos.push_back(std::make_pair("Refer", res));
+    }
+
+    // test jitcode
+    auto jitcode = jit::GetJitCode<KT, jit::VMulTuples<T>::func_type,
+                                   jit::VMulTuples<T>::attr_type, PlaceType>(d);
+    if (jitcode) {
+      auto res =
+          BenchTartgetFunc<T, jit::VMulTuples<T>::func_type>(jitcode, x, y, z);
+      infos.push_back(std::make_pair("JitCode", res));
+    }
+
+    // test all impls in more
+    jit::KernelKey kkey(KT, PlaceType());
+    auto& pool = jit::KernelPool().Instance().AllKernels();
+    auto iter = pool.find(kkey);
+    if (iter != pool.end()) {
+      auto& impls = iter->second;
+      for (auto& impl : impls) {
+        auto i =
+            dynamic_cast<jit::KernelImpl<T, jit::VMulTuples<T>::func_type,
+                                         jit::VMulTuples<T>::attr_type>*>(
+                impl.get());
+        if (i && i->UseMe(d)) {
+          auto more = i->GetFunc();
+          auto res =
+              BenchTartgetFunc<T, jit::VMulTuples<T>::func_type>(more, x, y, z);
+          infos.push_back(std::make_pair("More", res));
+        }
+      }
+    }
+
+    // Test result from Get function
+    auto tgt = jit::Get<KT, jit::VMulTuples<T>::func_type,
+                        jit::VMulTuples<T>::attr_type, PlaceType>(d);
+    if (!tgt) {
+      LOG(ERROR) << "Target can not be empty!";
+    }
+    auto res = BenchTartgetFunc<T, jit::VMulTuples<T>::func_type>(tgt, x, y, z);
+    infos.push_back(std::make_pair("Target", res));
+
+    // print
+    std::ostringstream loginfos;
+    loginfos << "Kernel Type: " << KT << ", size " << d << ": ";
+    for (auto pair : infos) {
+      loginfos << pair.first << " takes " << pair.second << " us; ";
+    }
+    LOG(INFO) << loginfos.str();
+    // }
+  }
+}
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index caa3ef9dda7..765952fc352 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -60,10 +60,6 @@ typedef enum {
 #define YMM_FLOAT_BLOCK 8
 #define ZMM_FLOAT_BLOCK 16
 
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
-
 #define DECLARE_JIT_CODE(codename) \
   const char* name() const override { return #codename; }
 
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
index b7580f6efb4..16cd18e2ccc 100644
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -28,6 +28,10 @@ namespace paddle {
 namespace operators {
 namespace jit {
 
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+#define EXP_MAX_INPUT 40.0
+
 template <KernelType KT, typename Func, typename Attr, typename PlaceType>
 inline Func GetJitCode(Attr attr) {
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index e523089101f..1ee6ce6b13b 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -21,15 +21,6 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/port.h"
-
-constexpr int repeat = 20000;
-inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} template void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), -- GitLab From bc0df6a9487f0aa877647b2a15789fe3eb206616 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 07:54:10 +0000 Subject: [PATCH 0152/2367] make typename tuples --- paddle/fluid/operators/jit/benchmark.cc | 15 +++++--------- paddle/fluid/operators/jit/helper.h | 24 ++++++++++++----------- paddle/fluid/operators/jit/kernel_base.h | 15 +++++++++----- paddle/fluid/operators/jit/more/mkl/mkl.h | 3 +-- paddle/fluid/operators/jit/refer/refer.h | 3 +-- paddle/fluid/operators/jit/test.cc | 15 +++++--------- 6 files changed, 35 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 5a276172c39..ef7ccc64adf 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -94,8 +94,7 @@ int main(int argc, char* argv[]) { RandomVec(d, x.data()); RandomVec(d, y.data()); // refer - auto refer = jit::GetRefer::func_type, - jit::VMulTuples::attr_type>(); + auto refer = jit::GetRefer>(); if (refer) { auto res = BenchTartgetFunc::func_type>(refer, x, y, z); @@ -103,8 +102,7 @@ int main(int argc, char* argv[]) { } // test jitcode - auto jitcode = jit::GetJitCode::func_type, - jit::VMulTuples::attr_type, PlaceType>(d); + auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { auto res = BenchTartgetFunc::func_type>(jitcode, x, y, z); @@ -118,10 +116,8 @@ int main(int argc, char* argv[]) { if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = - dynamic_cast::func_type, - jit::VMulTuples::attr_type>*>( - impl.get()); + auto i = dynamic_cast>*>( + impl.get()); if (i && i->UseMe(d)) { auto more = i->GetFunc(); auto res = @@ -132,8 +128,7 @@ int main(int argc, char* argv[]) { } // Test result from Get function - auto tgt = jit::Get::func_type, - jit::VMulTuples::attr_type, PlaceType>(d); + auto tgt = jit::Get, PlaceType>(d); if (!tgt) { LOG(ERROR) << "Target can not be empty!"; } diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 16cd18e2ccc..11bbd6a56cf 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -32,9 +32,11 @@ namespace jit { #define SIGMOID_THRESHOLD_MAX 13.0 #define EXP_MAX_INPUT 40.0 -template -inline Func GetJitCode(Attr attr) { +template +inline typename KernelTuples::func_type GetJitCode( + typename KernelTuples::attr_type attr) { + using Func = typename KernelTuples::func_type; + using Attr = typename KernelTuples::attr_type; size_t key = JitCodeKey(attr); auto& codes = JitCodePool().Instance(); if (codes.Has(key)) { @@ -65,8 +67,8 @@ inline Func GetJitCode(Attr attr) { // Refer code do not related with attr, which is just for cast // Refer is always on CPUPlace -template -inline Func GetRefer() { +template +inline typename KernelTuples::func_type GetRefer() { auto& ref_pool = ReferKernelPool().Instance().AllKernels(); KernelKey kkey(KT, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); @@ -74,7 +76,7 @@ inline Func GetRefer() { "Every Kernel should have reference function."); auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i) { return i->GetFunc(); } @@ -82,10 +84,10 @@ inline Func GetRefer() { return nullptr; } 
-template -Func Get(Attr attr) { - auto jitfunc = GetJitCode(attr); +typename KernelTuples::func_type Get(typename KernelTuples::attr_type attr) { + auto jitfunc = GetJitCode(attr); if (jitfunc) { return jitfunc; } @@ -97,7 +99,7 @@ Func Get(Attr attr) { if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i && i->UseMe(attr)) { return i->GetFunc(); } @@ -105,7 +107,7 @@ Func Get(Attr attr) { } // The last implementation should be reference function on CPUPlace. - return GetRefer(); + return GetRefer(); } } // namespace jit diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index df7be6ab8ec..84f03088985 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -36,10 +36,13 @@ class Kernel { DISABLE_COPY_AND_ASSIGN(Kernel); }; -template +template class KernelImpl : public Kernel { + using T = typename KernelTuples::data_type; + using Func = typename KernelTuples::func_type; + using Attr = typename KernelTuples::attr_type; + public: - using ELEMENT_TYPE = T; virtual Func GetFunc() const { return func; } virtual bool UseMe(Attr attr) const = 0; @@ -47,11 +50,13 @@ class KernelImpl : public Kernel { Func func{nullptr}; }; -template -class ReferKernel : public KernelImpl { +template +class ReferKernel : public KernelImpl { public: // Refer code can always be used - bool UseMe(Attr attr) const override { return true; } + bool UseMe(typename KernelTuples::attr_type attr) const override { + return true; + } }; } // namespace jit diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index c0f738ccebe..56469b054de 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -28,8 +28,7 @@ template void VMul(const T* x, const T* y, T* z, int n); template -class VMulKernel : public KernelImpl::func_type, - typename VMulTuples::attr_type> { +class VMulKernel : public KernelImpl> { public: VMulKernel() { this->func = VMul; } bool UseMe(int d) const override { diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 97aa5de8fcf..99d1cbd43ec 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -29,8 +29,7 @@ void VMul(const T* x, const T* y, T* z, int n) { } template -class VMulKernel : public ReferKernel::func_type, - typename VMulTuples::attr_type> { +class VMulKernel : public ReferKernel> { public: VMulKernel() { this->func = VMul; } }; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 1ee6ce6b13b..4d7970414ff 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -89,8 +89,7 @@ TEST(JitKernel, vmul) { namespace jit = paddle::operators::jit; const auto KT = jit::vmul; for (int d : TestSizes()) { - auto ref = jit::GetRefer::func_type, - jit::VMulTuples::attr_type>(); + auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); @@ -115,8 +114,7 @@ TEST(JitKernel, vmul) { ExpectEQ(yinp_data, zref_data, d); // test jitcode - auto jitcode = jit::GetJitCode::func_type, - jit::VMulTuples::attr_type, PlaceType>(d); + auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { VLOG(10) << "Test jitcode, size: " << d; TestTartgetFunc::func_type>(jitcode, x, y, zref); @@ -129,10 +127,8 @@ TEST(JitKernel, vmul) { if 
(iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = - dynamic_cast::func_type, - jit::VMulTuples::attr_type>*>( - impl.get()); + auto i = dynamic_cast>*>( + impl.get()); if (i && i->UseMe(d)) { auto more = i->GetFunc(); VLOG(10) << "Test More Kernel, size: " << d; @@ -142,8 +138,7 @@ TEST(JitKernel, vmul) { } // Test result from Get function VLOG(10) << "Test Get function, size: " << d; - auto tgt = jit::Get::func_type, - jit::VMulTuples::attr_type, PlaceType>(d); + auto tgt = jit::Get, PlaceType>(d); TestTartgetFunc::func_type>(tgt, x, y, zref); } } -- GitLab From 7a43e5170325f3a78e026bb4d7039e0c25be8686 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 16:16:26 +0800 Subject: [PATCH 0153/2367] Add gperf tools --- CMakeLists.txt | 6 ++++ cmake/generic.cmake | 16 +++++++++++ paddle/fluid/framework/parallel_executor.cc | 31 ++++++++++++++++++++- python/paddle/fluid/__init__.py | 3 +- 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index efa68c9ba24..3e59aca2d93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,6 +81,12 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) +if (WITH_PROFILER) + find_package(Gperftools REQUIRED) + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + add_definitions(-DWITH_GPERFTOOLS) +endif() + # PY_VERSION if(NOT PY_VERSION) set(PY_VERSION 2.7) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 312fbaa0b3d..a8b9dcfcf5e 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME) endif() endfunction(find_fluid_modules) + +function(common_link TARGET_NAME) + if (WITH_PROFILER) + target_link_libraries(${TARGET_NAME} gperftools::profiler) + endif() +endfunction() + + # find all third_party modules is used for paddle static library # for reduce the dependency when building the inference libs. 
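# Aside on the helper introduced above: building with -DWITH_PROFILER=ON makes
# every target built through the functions below link gperftools in one place.
# For a hypothetical target the usage is simply:
#   cc_library(my_op SRCS my_op.cc DEPS glog)
#   common_link(my_op)   # appends gperftools::profiler to my_op's link line
# which is why each build function in this patch only gains a single
# common_link() call.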
set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
@@ -274,6 +282,7 @@ function(cc_library TARGET_NAME)
     endif()
     target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+    common_link(${TARGET_NAME})
   endif()
 
   # cpplint code style
@@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME)
   if(cc_binary_DEPS)
     target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
     add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
+    common_link(${TARGET_NAME})
   endif()
 endfunction(cc_binary)
 
@@ -362,6 +372,7 @@ function(cc_test TARGET_NAME)
       target_link_libraries(${TARGET_NAME} ${win32_deps})
     endif(WIN32)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME)
     if(nv_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
       add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
+      common_link(${TARGET_NAME})
     endif()
   endif()
 endfunction(nv_binary)
@@ -433,6 +445,7 @@ function(nv_test TARGET_NAME)
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
       set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
@@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME)
     if(hip_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
       add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+      common_link(${TARGET_NAME})
     endif()
   endif()
 endfunction(hip_binary)
@@ -518,6 +532,7 @@ function(hip_test TARGET_NAME)
     set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
     target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
     add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
   endif()
 endfunction(hip_test)
@@ -560,6 +575,7 @@ function(go_library TARGET_NAME)
   endif()
   if(go_library_DEPS)
     add_dependencies(${TARGET_NAME} ${go_library_DEPS})
+    common_link(${TARGET_NAME})
   endif(go_library_DEPS)
 
   # The "source file" of the library is `${dummyfile}` which never
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b98408ee772..28a4b14b27b 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -30,13 +30,36 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+DEFINE_string(pe_profile_fname, "",
+              "Profiler filename for PE, which is generated by gperftools. "
+              "Only valid when compiled `WITH_PROFILER=ON`. 
Empty if disable."); + namespace paddle { namespace framework { +static std::once_flag gProfileOnce; +#ifdef WITH_GPERFTOOLS +static bool gProfileStarted = false; +#endif class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) - : places_(places) {} + : places_(places) { + if (!FLAGS_pe_profile_fname.empty()) { + std::call_once(gProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_pe_profile_fname.c_str()); + gProfileStarted = true; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_pe_profile_fname will be ignored"; +#endif + }); + } + } ~ParallelExecutorPrivate() { if (own_local_scope_) { @@ -270,6 +293,12 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { +#ifdef WITH_GPERFTOOLS + if (gProfileStarted) { + ProfilerFlush(); + } +#endif + platform::RecordBlock b(0); #ifdef PADDLE_WITH_CUDA if (!gcs_.empty()) { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2a53519188e..4cf0784d817 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -125,7 +125,8 @@ def __bootstrap__(): 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir' + 'reader_queue_speed_test_mode', 'print_sub_graph_dir', + 'pe_profile_fname' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') -- GitLab From f3250097bccbade4168b02417e04fee5cd990494 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 08:20:25 +0000 Subject: [PATCH 0154/2367] fix bug and mac compile --- paddle/fluid/operators/jit/benchmark.cc | 3 +-- paddle/fluid/operators/jit/helper.h | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index ef7ccc64adf..5cc82b69f8b 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -12,9 +12,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -// #include // for memcpy -// #include #include +#include #include #include #include "gflags/gflags.h" diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 11bbd6a56cf..d1bbe103814 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -49,15 +49,17 @@ inline typename KernelTuples::func_type GetJitCode( // pool: (KernelKey(type, place), vector) auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); auto iter = creator_map.find(kkey); - auto& creators = iter->second; - for (auto& cur : creators) { - auto i = dynamic_cast*>(cur.get()); - if (i && i->UseMe(attr)) { - auto p = i->CreateJitCode(attr); - if (p) { - auto f = p->template getCode(); - codes.Insert(key, std::move(p)); - return f; + if (iter != creator_map.end()) { + auto& creators = iter->second; + for (auto& cur : creators) { + auto i = dynamic_cast*>(cur.get()); + if (i && i->UseMe(attr)) { + auto p = i->CreateJitCode(attr); + if (p) { + auto f = p->template getCode(); + codes.Insert(key, std::move(p)); + return f; + } } } } -- GitLab From 009c7cf6ccf3f8ece6922d532df38cadd3ca5c84 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 12 Dec 2018 16:23:50 +0800 Subject: [PATCH 0155/2367] add finialize --- python/paddle/fluid/contrib/utils/__init__.py | 2 +- python/paddle/fluid/distributed/helper.py | 3 +++ python/paddle/fluid/distributed/ps_instance.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py index 2fe9f702f3d..20b2cc381aa 100644 --- a/python/paddle/fluid/contrib/utils/__init__.py +++ b/python/paddle/fluid/contrib/utils/__init__.py @@ -18,5 +18,5 @@ from __future__ import print_function from . 
import hdfs_utils from .hdfs_utils import * -__all__ = lookup_table_utils.__all__ +#__all__ = lookup_table_utils.__all__ __all__ = hdfs_utils.__all__ diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 4cc5eb2a920..1244b4c0cad 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -44,5 +44,8 @@ class MPIHelper(object): def get_hostname(self): import socket return socket.gethostname() + + def finalize(self): + MPI.Finalize() diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index 94e123c2ceb..dce5dfc5bd6 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -97,6 +97,7 @@ class PaddlePSInstance(object): pass def finalize(self): + self.dh.finalize() pass -- GitLab From b75bd29c3ae74b5d48d573916eebab6473b3c30f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 16:51:01 +0800 Subject: [PATCH 0156/2367] Remove debug info --- .../details/computation_op_handle.cc | 45 +---- .../fluid/framework/details/op_handle_base.cc | 2 +- paddle/fluid/framework/ir/graph.cc | 132 +++++++++------ paddle/fluid/framework/operator.cc | 160 +++++++----------- .../operators/elementwise/elementwise_op.h | 69 ++++---- paddle/fluid/operators/optimizers/adam_op.cc | 79 +++++---- 6 files changed, 224 insertions(+), 263 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 90030334383..7ad1e40c600 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -26,46 +26,17 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, scope_(scope), place_(place) {} -struct RecordTime { - RecordTime(const std::string &name, const std::string &type) - : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} - - ~RecordTime() { - if (type_ == "elementsize_add") { - end_ = std::chrono::system_clock::now(); - std::chrono::duration diff = end_ - start_; - VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); - } - } - - std::string name_; - std::string type_; - std::chrono::system_clock::time_point start_; - std::chrono::system_clock::time_point end_; -}; - void ComputationOpHandle::RunImpl() { - { - RecordTime rt("ComputationOpHandle::RunImpl", "Wait"); - WaitInputVarGenerated(place_); - } - - Scope *scope = nullptr; - { - RecordTime rt("ComputationOpHandle::RunImpl", "PrepareScope"); - scope = scope_->FindVar(kLocalExecScopeName)->Get(); - } - - { - RecordTime rt("ComputationOpHandle::RunImpl", "ReallyRun " + op_->Type()); + WaitInputVarGenerated(place_); - auto run_func = [this, scope]() { op_->Run(*scope, place_); }; + auto run_func = [this]() { + op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); + }; - if (is_lock_and_record_event_free_) { - run_func(); - } else { - this->RunAndRecordEvent(run_func); - } + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); } } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 5997f12ffab..4822627ac3b 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,7 +41,7 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef 
PADDLE_WITH_CUDA - if (events_.empty() && use_cuda && !dev_ctxes_.empty()) { + if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index dfa310a3863..9ebf1366986 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -20,6 +20,10 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" +DEFINE_bool(enforce_when_check_program, true, + "Checking whether the program is correct or not. We will log " + "errors rather than throwing exceptions if this flag turned off"); + namespace paddle { namespace framework { namespace ir { @@ -28,55 +32,85 @@ namespace { void CheckProgram(const ProgramDesc &program) { #define _INT(role) static_cast(role) -// std::map visit; -// for (OpDesc *op : program.Block(0).AllOps()) { -// // For backward compatibility, some program doesn't have role added. -// if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; -// int role_id = -// boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); -// visit[role_id] = true; -// switch (role_id) { -// case _INT(OpRole::kForward): -// if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { -// LOG(ERROR) -// << "Cannot add backward operator before forward operator %s." -// << op->Type(); -// } -// break; -// case _INT(OpRole::kBackward): -// case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): -// PADDLE_ENFORCE( -// visit.find(_INT(OpRole::kOptimize)) == visit.end(), -// "Cannot add backward operator %s after optimize operator.", -// op->Type()); -// break; -// case _INT(OpRole::kForward) | _INT(OpRole::kLoss): -// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | -// _INT(OpRole::kLoss)) == visit.end(), -// "Cannot add backward|loss operator before " -// "forward|loss operator %s.", -// op->Type()); -// PADDLE_ENFORCE( -// visit.find(_INT(OpRole::kOptimize)) == visit.end(), -// "Cannot add forward|loss operator %s after optimize operator.", -// op->Type()); -// break; -// case _INT(OpRole::kOptimize): -// case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): -// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), -// "Optimize operators %s must follow backward operator.", -// op->Type()); -// break; -// case _INT(OpRole::kLRSched): -// case _INT(OpRole::kDist): -// case _INT(OpRole::kRPC): -// case _INT(OpRole::kNotSpecified): -// break; -// default: -// LOG(FATAL) << "Unknown operator role. Don't add new role because " -// "you don't know what you are doing."; -// } -// } + std::map visit; + for (OpDesc *op : program.Block(0).AllOps()) { + // For backward compatibility, some program doesn't have role added. + if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; + int role_id = + boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + visit[role_id] = true; + switch (role_id) { + case _INT(OpRole::kForward): + if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { + LOG(ERROR) + << "Cannot add backward operator before forward operator %s." 
+ << op->Type(); + } + break; + case _INT(OpRole::kBackward): + case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): + if (!FLAGS_enforce_when_check_program) { + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add backward operator %s after optimize operator.", + op->Type()); + } else { + if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { + LOG(ERROR) + << "Cannot add backward operator %s after optimize operator.", + << op->Type(); + } + } + break; + case _INT(OpRole::kForward) | _INT(OpRole::kLoss): + if (!FLAGS_enforce_when_check_program) { + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | + _INT(OpRole::kLoss)) == visit.end(), + "Cannot add backward|loss operator before " + "forward|loss operator %s.", + op->Type()); + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add forward|loss operator %s after optimize operator.", + op->Type()); + } else { + if (visit.find(_INT(OpRole::kBackward) | _INT(OpRole::kLoss)) != + visit.end()) { + LOG(ERROR) << "Cannot add backward|loss operator before " + << "forward|loss operator %s." << op->Type(); + } + + if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { + LOG(ERROR) << "Cannot add forward|loss operator %s after optimize " + "operator.", + << op->Type(); + } + } + break; + case _INT(OpRole::kOptimize): + case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): + if (!FLAGS_enforce_when_check_program) { + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), + "Optimize operators %s must follow backward operator.", + op->Type()); + } else { + if (visit.find(_INT(OpRole::kBackward)) == visit.end()) { + LOG(ERROR) + << "Optimize operators %s must follow backward operator.", + << op->Type(); + } + } + break; + case _INT(OpRole::kLRSched): + case _INT(OpRole::kDist): + case _INT(OpRole::kRPC): + case _INT(OpRole::kNotSpecified): + break; + default: + LOG(FATAL) << "Unknown operator role. 
Don't add new role because " + "you don't know what you are doing."; + } + } #undef _INT } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b8adce4edf1..c6f3254e9f7 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -701,125 +701,85 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, this->InferShape(&infer_shape_ctx); } -struct RecordTime { - RecordTime(const std::string& name, const std::string& type) - : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} - - void inline stop() { - end_ = std::chrono::system_clock::now(); - std::chrono::duration diff = end_ - start_; - VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); - } - - ~RecordTime() { - if (type_ == "elementwise_add") { - stop(); - } - // stop(); - } - - std::string name_; - std::string type_; - std::chrono::system_clock::time_point start_; - std::chrono::system_clock::time_point end_; -}; - void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RecordTime rt("OperatorWithKernel::All", type_); - { - RecordTime rt("OperatorWithKernel::InferShape", type_); - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); - } - - { - RecordTime* rt_1 = new RecordTime("OperatorWithKernel::Compute1", type_); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); - // check if op[type] has kernel registered. - auto& all_op_kernels = AllOpKernels(); - auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", - type_); - } + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", type_); + } - OpKernelMap& kernels = kernels_iter->second; + OpKernelMap& kernels = kernels_iter->second; - // TODO(dzhwinter) : kernel fallback mechanism will be added when all the - // transform functions are ready. + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. 
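  // In outline, the dispatch being restored below does (simplified sketch of
  // the surrounding code, not a verbatim excerpt):
  //   auto key = GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
  //   if (kernels.find(key) == kernels.end() &&
  //       key.library_type_ == LibraryType::kMKLDNN) {
  //     key.library_type_ = LibraryType::kPlain;  // fall back to the plain kernel
  //     key.data_layout_ = DataLayout::kAnyLayout;
  //   }
  //   kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));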
- // for (auto& candidate : kKernelPriority) { - // Do selection - // } + // for (auto& candidate : kKernelPriority) { + // Do selection + // } - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + auto expected_kernel_key = + this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; - auto kernel_iter = kernels.find(expected_kernel_key); + auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN - // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set - if (kernel_iter == kernels.end() && - expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; - expected_kernel_key.library_type_ = LibraryType::kPlain; - expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; - kernel_iter = kernels.find(expected_kernel_key); - } + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } #endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", type_, - KernelTypeToString(expected_kernel_key)); - } + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } - // do data transformScope &transfer_scope; - std::vector transfered_inplace_vars; - Scope* transfer_scope = nullptr; - // auto* transfer_scope = - // TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + // do data transformScope &transfer_scope; + std::vector transfered_inplace_vars; + auto* transfer_scope = + TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); - // exec scope is the scope that kernel actually executed on. - const Scope& exec_scope = scope; - // const Scope& exec_scope = - // (transfer_scope == nullptr ? scope : *transfer_scope); + // exec scope is the scope that kernel actually executed on. + const Scope& exec_scope = + (transfer_scope == nullptr ? scope : *transfer_scope); - if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { - dev_ctx = pool.Get(expected_kernel_key.place_); - } - delete rt_1; + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); + } - RecordTime* rt_2 = new RecordTime("OperatorWithKernel::Compute2", type_); - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); - delete rt_2; + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); - RecordTime* rt_3 = new RecordTime("OperatorWithKernel::Compute3", type_); - if (!transfered_inplace_vars.empty()) { - // there is inplace variable has been transfered. - TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); - } + if (!transfered_inplace_vars.empty()) { + // there is inplace variable has been transfered. 
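+    // (That is: an input the kernel wrote in place may live in the transfer
+    // scope because its place/layout was converted on the way in, so the
+    // updated tensor has to be copied back into the original variable here.)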
+ TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); + } - /*For profiling/benchmark only*/ - if (FLAGS_benchmark) { - dev_ctx->Wait(); - } + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + dev_ctx->Wait(); + } - if (FLAGS_check_nan_inf) { - for (auto& vname : OutputVars(true)) { - auto* var = exec_scope.FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get()); - } else if (var->IsType()) { - CheckTensorNANOrInf(vname, - var->Get().value()); - } + if (FLAGS_check_nan_inf) { + for (auto& vname : OutputVars(true)) { + auto* var = exec_scope.FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get().value()); } } - delete rt_3; } } void OperatorWithKernel::TransferInplaceVarsBack( diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 181baac870a..87bf7c6b156 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -33,37 +33,34 @@ class ElementwiseOp : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - if (!ctx->IsRuntime()) { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), - "Input(Y) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of elementwise op should not be null."); - - PADDLE_ENFORCE(ctx->GetInputsVarType("Y").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the " - "received is %s [%s]", - ctx->GetInputsVarType("Y").front(), - ctx->Inputs("Y").front()); - - if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::LOD_TENSOR) { - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input."); - } else if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::SELECTED_ROWS) { - PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && - (ctx->GetInputDim("Y")[0] == 1), - "For elementwise_op, if X is Sparse, " - "Y must be scalar."); - } else { - PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - ctx->GetInputsVarType("X").front()); - } + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of elementwise op should not be null."); + + PADDLE_ENFORCE( + ctx->GetInputsVarType("Y").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s [%s]", + ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front()); + + if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::LOD_TENSOR) { + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + } else if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS) { + PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && + (ctx->GetInputDim("Y")[0] == 1), + "For elementwise_op, if X is 
Sparse, " + "Y must be scalar."); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + ctx->GetInputsVarType("X").front()); } ctx->ShareDim("X", /*->*/ "Out"); @@ -128,7 +125,7 @@ The equation is: $$%s$$ -- $X$: a tensor of any dimension. +- $X$: a tensor of any dimension. - $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. There are two cases for this operator: @@ -138,10 +135,10 @@ There are two cases for this operator: For case 2: -1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index - for broadcasting $Y$ onto $X$. +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. 2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. -3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of subsequence, such as shape(Y) = (2, 1) => (2). For example: @@ -155,7 +152,7 @@ For example: shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 -The inputs $X$ and $Y$ can carry the different LoD information. +The inputs $X$ and $Y$ can carry the different LoD information. But the output only shares the LoD information with the input $X$. )DOC", diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index bc1b20321f1..5710cda39ac 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -23,57 +23,56 @@ class AdamOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - // PADDLE_ENFORCE(ctx->HasInput("Param"), - // "Input(Param) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Grad"), - // "Input(Grad) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Moment1"), - // "Input(Moment1) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Moment2"), - // "Input(Moment2) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - // "Input(LearningRate) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), - // "Input(Beta1Pow) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), - // "Input(Beta2Pow) of AdamOp should not be null."); - - // PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - // "Output(ParamOut) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), - // "Output(Moment1Out) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), - // "Output(Moment2Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment1"), + "Input(Moment1) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment2"), + "Input(Moment2) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + "Input(Beta2Pow) of AdamOp should not be null."); + + 
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + "Output(Moment1Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + "Output(Moment2Out) of AdamOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); - // PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - // "Learning rate should have 1 dimension"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - // PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, - // "Beta1 power accumulator should have 1 dimension"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); - // PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, - // "Beta2 power accumulator should have 1 dimension"); + PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + "Beta2 power accumulator should have 1 dimension"); auto param_dims = ctx->GetInputDim("Param"); - // if (ctx->GetInputsVarType("Grad")[0] == - // framework::proto::VarType::LOD_TENSOR) { - // PADDLE_ENFORCE_EQ( - // param_dims, ctx->GetInputDim("Grad"), - // "Param and Grad input of AdamOp should have same dimension"); - // } - // PADDLE_ENFORCE_EQ( - // param_dims, ctx->GetInputDim("Moment1"), - // "Param and Moment1 input of AdamOp should have same dimension"); - // PADDLE_ENFORCE_EQ( - // param_dims, ctx->GetInputDim("Moment2"), - // "Param and Moment2 input of AdamOp should have same dimension"); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamOp should have same dimension"); + } + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment1"), + "Param and Moment1 input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment2"), + "Param and Moment2 input of AdamOp should have same dimension"); ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("Moment1Out", param_dims); ctx->SetOutputDim("Moment2Out", param_dims); } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = -- GitLab From 1b61021cb36eae45e142a953c2c96cf46853aa7c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 17:02:24 +0800 Subject: [PATCH 0157/2367] Polish code --- paddle/fluid/framework/ir/graph.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 9ebf1366986..db74d5674a4 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -57,7 +57,7 @@ void CheckProgram(const ProgramDesc &program) { } else { if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { LOG(ERROR) - << "Cannot add backward operator %s after optimize operator.", + << "Cannot add backward operator %s after optimize operator." << op->Type(); } } @@ -82,8 +82,8 @@ void CheckProgram(const ProgramDesc &program) { if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { LOG(ERROR) << "Cannot add forward|loss operator %s after optimize " - "operator.", - << op->Type(); + "operator." 
+ << op->Type(); } } break; @@ -95,9 +95,8 @@ void CheckProgram(const ProgramDesc &program) { op->Type()); } else { if (visit.find(_INT(OpRole::kBackward)) == visit.end()) { - LOG(ERROR) - << "Optimize operators %s must follow backward operator.", - << op->Type(); + LOG(ERROR) << "Optimize operators %s must follow backward operator." + << op->Type(); } } break; -- GitLab From e82772f42518f1cff790ac04aa1c73c2e5b201e9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 09:22:44 +0000 Subject: [PATCH 0158/2367] fix cmake conflict test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b1cfb23f3a8..6d7a69c8c9e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -169,7 +169,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper garbage_collector) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() -- GitLab From 8d9401152eaf26cc0d6ab4643fe6255028d6edf2 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 12 Dec 2018 17:43:39 +0800 Subject: [PATCH 0159/2367] Refine w2v --- .../fluid/operators/math/matrix_bit_code.cc | 22 ++++++---- paddle/fluid/operators/math/matrix_bit_code.h | 40 +++++++++---------- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 5a6e64b6f87..dbf4f5e3325 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -23,12 +23,14 @@ void MatrixBitCodeFunctor::Add(const framework::Tensor& vec, framework::Tensor* tmat) { size_t batch_size = tmat->dims()[0]; size_t width = tmat->dims()[1]; + auto* tmat_data = tmat->data(); + auto* vec_data = vec.data(); for (size_t i = 0; i < batch_size; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); - tmat->data()[i * width + j] += vec.data()[index]; + tmat_data[i * width + j] += vec_data[index]; } } } @@ -38,12 +40,14 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, framework::Tensor* vec) { size_t batch_size = tmat.dims()[0]; size_t width = tmat.dims()[1]; + auto* vec_data = vec->data(); + auto* tmat_data = tmat.data(); for (size_t i = 0; i < batch_size; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); - vec->data()[index] += tmat.data()[i * width + j]; + vec_data[index] += tmat_data[i * width + j]; } } } @@ -53,14 +57,15 
@@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec) { size_t batch_size = tmat.dims()[0]; size_t width = tmat.dims()[1]; + auto* vec_data = vec->mutable_value()->data(); + auto* tmat_data = tmat.data(); for (size_t i = 0; i < batch_size; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); int64_t row_index = vec->GetIndexFromId(static_cast(index)); - vec->mutable_value()->data()[row_index] += - tmat.data()[i * width + j]; + vec_data[row_index] += tmat_data[i * width + j]; } } } @@ -70,6 +75,8 @@ void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum) { size_t num_samples = tmat.dims()[0]; size_t o_width = tmat.dims()[1]; + auto* tmat_data = tmat.data(); + auto* sum_data = sum->data(); for (size_t i = 0; i < num_samples; ++i) { T sm = static_cast(0.0); auto code = code_table_->get_code(i); @@ -78,10 +85,10 @@ void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, if (code->calc_bit(j)) { // calc_bit starts from right most bit, while data in tmat[i] is in the // reverse order. - sm += tmat.data()[i * o_width + j]; + sm += tmat_data[i * o_width + j]; } } - sum->data()[i] = scale_sum * sm; + sum_data[i] = scale_sum * sm; } } @@ -217,12 +224,13 @@ template void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { size_t num_samples = tmat->dims()[0]; size_t o_width = tmat->dims()[1]; + auto* tmat_data = tmat->data(); for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { if (code->calc_bit(j)) { - tmat->data()[i * o_width + j] -= 1; + tmat_data[i * o_width + j] -= 1; } } } diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 35ca73802b4..ba1745b86da 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -140,13 +140,13 @@ template class CustomCode : public Code { public: CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) - : ids_(ids), index_(index) { - ptable_ = ptable.Slice(index, index + 1); - pcode_ = pcode.Slice(index, index + 1); + const int64_t* ids, int index) { + seq_len_ = ptable.dims()[1]; + ptable_data_ = ptable.data() + seq_len_ * index; + pcode_data_ = pcode.data() + seq_len_ * index; } /** - * Here the id of root shoud be 1 rather than 0, thus the encoding of class c + * Here the id of root should be 1 rather than 0, thus the encoding of class c * is `c + num_classes` and all siblings can get the same weight indice using * prefixes. * Weight index is the prefixes of encoding, thus leave out the right most @@ -154,26 +154,26 @@ class CustomCode : public Code { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const { return ptable_.data()[bit]; } - bool calc_bit(int bit) const { return pcode_.data()[bit]; } - int get_length() const { - int length = 0; + size_t calc_index(int bit) const override { return ptable_data_[bit]; } + bool calc_bit(int bit) const override { return pcode_data_[bit]; } - for (int i = 0; i < static_cast(ptable_.dims()[1]); i++) { - if (ptable_.data()[i] >= 0) { - length++; - } else { - return length; - } + // NOTE: this function is not thread-safe. 
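+  // (Not thread-safe because the first call computes the code length by
+  // scanning ptable_data_ for the first negative id and memoizes it in the
+  // mutable member length_ below; concurrent first calls would race on that
+  // cache.)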
+ int get_length() const override { + if (length_ < 0) { + auto len = seq_len_; + length_ = + static_cast(std::find_if(ptable_data_, ptable_data_ + len, + [](const T& val) { return val < 0; }) - + ptable_data_); } - return length; + return length_; } private: - framework::Tensor ptable_; - framework::Tensor pcode_; - const int64_t* ids_; - const int index_; + int64_t seq_len_; + const T* ptable_data_; + const T* pcode_data_; + mutable int length_{-1}; }; class SimpleCodeTable : public CodeTable { -- GitLab From be113756610c2894ae2adfeab40c8dfe879620a9 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 12 Dec 2018 18:06:58 +0800 Subject: [PATCH 0160/2367] Refine code --- .../fluid/operators/math/matrix_bit_code.cc | 4 +-- paddle/fluid/operators/math/matrix_bit_code.h | 30 ++++++++++++++----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index dbf4f5e3325..92affa0e4ed 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include +#include namespace paddle { namespace operators { namespace math { @@ -133,8 +134,7 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, auto weight_value = weight->data(); auto input_value = input.data(); - std::unordered_map>> ops; - + std::map>> ops; for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index ba1745b86da..cf43ad9d449 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include #include #include @@ -109,7 +110,7 @@ class Code { // set a CodeTable interface to create multiple code table class CodeTable { public: - virtual std::unique_ptr get_code(int64_t code) const = 0; + virtual Code* get_code(int64_t code) const = 0; virtual size_t size() const = 0; virtual int get_max_code_length() const = 0; virtual ~CodeTable() {} @@ -180,14 +181,23 @@ class SimpleCodeTable : public CodeTable { public: SimpleCodeTable(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids) {} - std::unique_ptr get_code(int64_t code) const { - std::unique_ptr coder(new SimpleCode(code, num_classes_, ids_)); - return coder; + + Code* get_code(int64_t code) const { + auto it = codes_.find(code); + if (it != codes_.end()) { + return it->second.get(); + } + auto* result = new SimpleCode(code, num_classes_, ids_); + codes_.emplace(code, std::unique_ptr(result)); + return result; } + size_t size() const { return num_classes_; } int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } private: + mutable std::map> codes_; + size_t num_classes_; const int64_t* ids_; }; @@ -199,9 +209,14 @@ class CustomCodeTable : public CodeTable { const framework::Tensor& pcode, const int64_t* ids) : ptable_(ptable), pcode_(pcode), ids_(ids) {} - std::unique_ptr get_code(int64_t code) const { - std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code)); - return coder; + Code* get_code(int64_t code) const { + auto it = codes_.find(code); + if (it != codes_.end()) { + return it->second.get(); + } + auto* result = new CustomCode(ptable_, pcode_, ids_, code); + codes_.emplace(code, std::unique_ptr(result)); + return result; } size_t size() const { return static_cast(ptable_.dims()[1]); } @@ -210,6 +225,7 @@ class CustomCodeTable : public CodeTable { } private: + mutable std::unordered_map> codes_; const framework::Tensor& ptable_; const framework::Tensor& pcode_; const int64_t* ids_; -- GitLab From a61eb543f5796d9899bff073e5f6647bc1003d71 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 19:18:45 +0800 Subject: [PATCH 0161/2367] Add RWLock to Scope --- paddle/fluid/framework/rw_lock.h | 16 ++++++++++++---- paddle/fluid/framework/scope.cc | 11 ++++------- paddle/fluid/framework/scope.h | 4 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index dbf00f3a79f..dd918fcdfa6 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -16,7 +16,9 @@ limitations under the License. */ #if !defined(_WIN32) #include -#endif // !_WIN32 +#else +#include // NOLINT +#endif // !_WIN32 #include "paddle/fluid/platform/enforce.h" @@ -51,9 +53,15 @@ struct RWLock { // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // In windows, rw_lock seems like a hack. Use empty object and do nothing. 
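// (The mutex-backed stand-in below keeps the RWLock interface but gives up
// reader concurrency: RDLock and WRLock both take the same exclusive mutex,
// trading throughput for correctness until a real rwlock is used on Windows.)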
struct RWLock {
-  void RDLock() {}
-  void WRLock() {}
-  void UNLock() {}
+  // FIXME(minqiyang): use mutex here to do fake lock
+  void RDLock() { mutex_.lock(); }
+
+  void WRLock() { mutex_.lock(); }
+
+  void UNLock() { mutex_.unlock(); }
+
+ private:
+  std::mutex mutex_;
 };
 #endif
 
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 61416676d63..190a057d9e4 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -46,13 +46,10 @@ DEFINE_double(
 #define SCOPE_READER_LOCK
 #define SCOPE_WRITER_LOCK
 #else
-// TODO(minqiyang): use reader lock and writer lock in all platforms
-#define SCOPE_READER_LOCK
-#define SCOPE_WRITER_LOCK
-// #define SCOPE_READER_LOCK boost::shared_lock
-// lock(mutex_);
-// #define SCOPE_WRITER_LOCK boost::unique_lock
-// lock(mutex_);
+// TODO(minqiyang): use rwlock in all platforms, now rwlock is a fake one
+// in _WIN32 platform
+#define SCOPE_READER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kRDLock);
+#define SCOPE_WRITER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kWRLock);
 #endif
 
 namespace paddle {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 1901ffbe57e..c140212c3e4 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -15,11 +15,11 @@ limitations under the License. */
 #pragma once
 
 #include 
-#include   // NOLINT
 #include 
 #include 
 #include 
 
+#include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -123,7 +123,7 @@ class Scope {
   DISABLE_COPY_AND_ASSIGN(Scope);
 
  private:
-  mutable std::mutex mutex_;
+  mutable RWLock rw_lock_;
 };
 
 // Generate some debug string about the inherience structure of scope, quite
-- 
GitLab

From 2c1e986f22c7535ffd420d9370f79cf93bd5bf25 Mon Sep 17 00:00:00 2001
From: heqiaozhi 
Date: Wed, 12 Dec 2018 19:21:31 +0800
Subject: [PATCH 0162/2367] barrier_all to barrier_worker

---
 python/paddle/fluid/async_executor.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py
index b077e1be7e4..af42d2912fd 100644
--- a/python/paddle/fluid/async_executor.py
+++ b/python/paddle/fluid/async_executor.py
@@ -170,7 +170,8 @@ class AsyncExecutor(object):
                                     self.instance.get_worker_index(),
                                     self.instance.get_node_cnt() / 2,
                                     multi_processes=process_num)
-        self.instance.barrier_all() #wait for download_data #TODO only barriere worker
+        #self.instance.barrier_all() #wait for download_data #TODO only barrier worker
+        self.instance.barrier_worker() #wait for download_data #TODO only barrier worker
 
     def config_distributed_nodes(self):
         self.instance = ps_instance.PaddlePSInstance(1, 2)
@@ -187,13 +188,13 @@ class AsyncExecutor(object):
             raise ValueError('instance is None, please run config_distributed_nodes to init the instance')
         return self.instance
 
-    def stop_server(self):
+    def stop(self):
         if self.instance is None:
             raise ValueError('instance is None, please run config_distributed_nodes to init the instance')
-        self.instance.barrier_all() #worker do all things
+        self.instance.barrier_worker() #worker do all things
         if self.instance.is_first_worker():
             self.executor.stop_server()
-        self.instance.barrier_all() #sync
+        self.instance.barrier_worker() #sync
 
     def init_server(self, dist_desc):
         if self.instance is None:
@@ -205,10 +206,6 @@ class AsyncExecutor(object):
         ips = self.instance.gather_ips()
         self.executor.gather_servers(ips, self.instance.get_node_cnt())
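        # (Startup handshake in outline, as reduced by this patch: servers and
        # workers still all meet at the single barrier_all below, so the server
        # side knows every node is up; the remaining init/download/stop
        # synchronization points become barrier_worker calls on the worker-only
        # paths above, assuming barrier_worker gates just the worker ranks.)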
self.instance.barrier_all() #wait all worker start
-        self.instance.barrier_all() #wait init model
-        self.instance.barrier_all() #wait for download_data #TODO remove this after only barrier worker
-        self.instance.barrier_all() #wait worker do all things
-        self.instance.barrier_all() #sync
 
     def init_worker(self, dist_desc, startup_program):
         if self.instance is None:
@@ -223,7 +220,7 @@ class AsyncExecutor(object):
         self.instance.barrier_all() #wait all worker start
         if self.instance.is_first_worker():
             self.executor.init_model()
-        self.instance.barrier_all() #wait init model
+        self.instance.barrier_worker() #wait init model
 
     def init_model(self):
         if self.instance is None:
-- 
GitLab

From c2e851f7b284ad122d20b932ff2df165d56b7994 Mon Sep 17 00:00:00 2001
From: JiabinYang 
Date: Wed, 12 Dec 2018 11:42:16 +0000
Subject: [PATCH 0163/2367] test=develop, remove sparse bias and add prefetch
 and related tests

---
 .../distributed/parameter_prefetch.cc         |  12 +-
 .../distributed/parameter_prefetch.h          |  24 ++
 .../operators/hierarchical_sigmoid_op.cc      |  47 ++-
 .../fluid/operators/hierarchical_sigmoid_op.h |  83 ++++--
 .../fluid/operators/math/matrix_bit_code.cc   |  17 --
 paddle/fluid/operators/math/matrix_bit_code.h |  27 +-
 python/paddle/fluid/layers/nn.py              |  17 +-
 .../fluid/tests/unittests/test_hsigmoid_op.py |   6 +-
 .../test_hsigmoid_remote_table_op.py          | 271 ++++++++++++++++++
 9 files changed, 418 insertions(+), 86 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index aebf6376d16..52085482f47 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -32,7 +32,7 @@ namespace paddle {
 namespace operators {
 namespace distributed {
 
-using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
 using DDim = framework::DDim;
@@ -120,8 +120,8 @@ static void MergeMultipleVarsIntoOneBySection(
 
   PADDLE_ENFORCE_GT(
       out_tensor->numel(), 0,
-      "When calling this method, the Tensor's numel must larger than zero. "
-      "Please check Tensor::Resize has been called first.");
+      "When calling this method, the LoDTensor's numel must be larger than zero. 
" + "Please check LoDTensor::Resize has been called first."); auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); @@ -144,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection( auto row_numel = dims[1]; - for (size_t i = 0; i < dims[0]; ++i) { + for (int64_t i = 0; i < dims[0]; ++i) { auto id = ids_in_this_section[i]; auto origin_id = id + abs_sections[section_idx]; auto& offsets = id_to_offset[origin_id]; @@ -201,7 +201,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, std::vector ids_vector; if (platform::is_cpu_place(id_tensor.place())) { auto* id_data = id_tensor.data(); - for (size_t i = 0; i < id_tensor.numel(); ++i) { + for (int64_t i = 0; i < id_tensor.numel(); ++i) { ids_vector.push_back(id_data[i]); } } else { @@ -209,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, PADDLE_THROW("paddle is not compiled with CUDA!"); #else auto cpu_place = platform::CPUPlace(); - framework::Tensor cpu_tensor; + framework::LoDTensor cpu_tensor; auto* cpu_tensor_data = cpu_tensor.mutable_data(id_tensor.dims(), cpu_place); auto stream = diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 53482c4c40e..882c6bd9b84 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -30,6 +30,30 @@ void prefetch(const std::string& id_name, const std::string& out_name, const framework::ExecutionContext& context, const framework::Scope& scope); +template +void prefetch_with_reconstruct(const std::string& id_name, + const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, + framework::LoDTensor* original) { + prefetch(id_name, out_name, table_names, epmap, height_sections, context, + scope); + auto& out = scope.FindVar(out_name)->Get(); + auto& ids = scope.FindVar(id_name)->Get(); + auto* original_value = original->data(); + auto* out_value = out.data(); + size_t original_width = original->numel() / original->dims()[0]; + + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = original_value + original_width * ids.data()[i]; + std::memcpy(original_row, out_rows, original_width * sizeof(T)); + } +} + }; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 0dbcc442dfa..b9059f6b054 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -67,6 +67,11 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); PADDLE_ENFORCE(ctx->HasOutput("PreOut"), "Output(PreOut) should not be null."); + auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); + if (with_prefetch) { + PADDLE_ENFORCE(ctx->HasOutput("W_Out"), + "Output(W_Out) should not be null."); + } const int64_t batch_size = ctx->GetInputDim("X")[0]; std::vector output_shape({batch_size, 1}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); @@ -96,7 +101,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "(LoDTensor, required), The labels of training data. 
It's a" "tensor with shape [N, 1]."); - AddInput("PTable", + AddInput("PathTable", "(LoDTensor, optional), The Path Table from root to current word" "it should have shape like [N, L], L is the length of the Path") .AsDispensable(); @@ -120,8 +125,30 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { "[batch_size, code_length], where code_length represents the " "maximum path length from root to leaf nodes.") .AsIntermediate(); + AddOutput( + "W_Out", + "(LoDTensor, optinal) using input 'W' as Output to make it mutable" + "When we are using prefetch") + .AsIntermediate(); AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); AddComment(R"DOC( The hierarchical sigmoid operator organize the classes into a binary tree. At each node, a sigmoid function is used to calculate the probability of @@ -191,23 +218,17 @@ class HierarchicalSigmoidGradOpGradVarTypeInference << " is set to SelectedRows"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::SELECTED_ROWS); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to SelectedRows"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::SELECTED_ROWS); - } } else { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::LOD_TENSOR); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to LoDTensor"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::LOD_TENSOR); - } + } + if (hasBias) { + VLOG(30) << "hierarchical_sigmoid_grad op " + << framework::GradVarName("Bias") << " is set to LoDTensor"; + block->Var(bias_grad_var_name) + ->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); } diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index b73a32af89e..d8e406a96b6 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,6 +26,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/platform/transform.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -49,13 +55,55 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); auto& label = detail::Ref(ctx.Input("Label")); auto* bias = ctx.Input("Bias"); auto* out = ctx.Output("Out"); auto* pre_out = ctx.Output("PreOut"); size_t num_classes = static_cast(ctx.Attr("num_classes")); + // for remote prefetch + + auto epmap = ctx.Attr>("epmap"); + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + auto height_sections = ctx.Attr>("height_sections"); + auto table_names = ctx.Attr>("table_names"); + VLOG(3) << "path type is " << path->type().name(); + std::vector real_rows = PathToRows(*path); + framework::Scope& local_scope = ctx.scope().NewScope(); + auto* ids = local_scope.Var("Ids@Prefetch"); + auto* x_tensor = ids->GetMutable(); + + x_tensor->mutable_data( + framework::make_ddim({static_cast(real_rows.size()), 1}), + ctx.GetPlace()); + // copy. + + std::memcpy(x_tensor->data(), real_rows.data(), + real_rows.size() * sizeof(int64_t)); + + framework::DDim w_dims = ctx.Input("W")->dims(); + w_dims[0] = x_tensor->dims()[0]; + auto* w_tensor = + local_scope.Var("W@Prefetch")->GetMutable(); + w_tensor->Resize(w_dims); + +#ifdef PADDLE_WITH_DISTRIBUTE + // w_Out is set to used by prefetch, never change it in other cases + auto* w_out = ctx.Output("W_Out"); + operators::distributed::prefetch_with_reconstruct( + "Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections, + ctx, local_scope, w_out); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } + bool is_custom = false; if (path) { is_custom = true; @@ -116,9 +164,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); - auto* bias = ctx.Input("Bias"); auto* in_grad = ctx.Output(framework::GradVarName("X")); bool is_sparse = ctx.Attr("is_sparse"); @@ -165,15 +212,14 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { pre_out_grad_mat * out_grad_mat.broadcast(bcast); // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. 
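+  // Summary of this kernel pair after the patch: in the forward pass above,
+  // a non-empty "epmap" means W lives on parameter servers, so the rows
+  // referenced by PathTable are pulled with prefetch_with_reconstruct() and
+  // written back through the mutable W_Out output before the local
+  // computation. In the gradient computation that follows, the bias gradient
+  // is always emitted as a dense LoDTensor (even when is_sparse is true);
+  // only the weight gradient keeps a SelectedRows code path, matching
+  // "remove sparse bias" in the commit message.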
- + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); + if (bias_grad) { + bias_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } if (!is_sparse) { - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } auto* w_grad = ctx.Output(framework::GradVarName("W")); w_grad->mutable_data(ctx.GetPlace()); @@ -192,21 +238,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); zero(dev_ctx, w_grad_value, static_cast(0.0)); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->set_rows(real_rows); - // build ids -> rows index map - bias_grad->SyncIndex(); - bias_grad->set_height(bias->dims()[0]); - auto* bias_grad_value = bias_grad->mutable_value(); - std::vector dims = {static_cast(real_rows.size()), - bias->dims()[1]}; - bias_grad_value->mutable_data(framework::make_ddim(dims), - ctx.GetPlace()); - zero(dev_ctx, bias_grad_value, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } bit_code->MulGradWeight(pre_out_grad, w_grad, in); } bit_code->MulGradError(pre_out_grad, w, in_grad); diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 5a6e64b6f87..fed4639b011 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -48,23 +48,6 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, } } -template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::SelectedRows* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - int64_t row_index = vec->GetIndexFromId(static_cast(index)); - vec->mutable_value()->data()[row_index] += - tmat.data()[i * width + j]; - } - } -} - template void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum) { diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 35ca73802b4..0bc09bdb35c 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -139,11 +139,11 @@ class SimpleCode : public Code { template class CustomCode : public Code { public: - CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) + CustomCode(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids, int index) : ids_(ids), index_(index) { - ptable_ = ptable.Slice(index, index + 1); - pcode_ = pcode.Slice(index, index + 1); + ptable_ = path_table.Slice(index, index + 1); + pcode_ = path_code.Slice(index, index + 1); } /** * Here the id of root shoud be 1 rather than 0, thus the encoding of class c @@ -195,9 +195,9 @@ class SimpleCodeTable : public CodeTable { template class CustomCodeTable : public CodeTable { public: - CustomCodeTable(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : ptable_(ptable), pcode_(pcode), ids_(ids) {} + CustomCodeTable(const 
framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : ptable_(path_table), pcode_(path_code), ids_(ids) {} std::unique_ptr get_code(int64_t code) const { std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code)); @@ -223,11 +223,11 @@ class MatrixBitCodeFunctor { ids_(ids), code_table_(new SimpleCodeTable(num_classes, ids)) {} - MatrixBitCodeFunctor(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : num_classes_(static_cast(ptable.dims()[1])), + MatrixBitCodeFunctor(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : num_classes_(static_cast(path_table.dims()[1])), ids_(ids), - code_table_(new CustomCodeTable(ptable, pcode, ids)) {} + code_table_(new CustomCodeTable(path_table, path_code, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -238,11 +238,6 @@ class MatrixBitCodeFunctor { */ void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); - /* For selected rows For j < code_length - vec(0, index(i, j)) += tmat(i, j) - */ - void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec); - /* For j < code_length sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) */ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 37ddfdf7d58..38dad857174 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4931,6 +4931,9 @@ def hsigmoid(input, pass weights = None + remote_prefetch = False + if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'): + remote_prefetch = True if not is_custom: weights = helper.create_parameter( @@ -4947,7 +4950,7 @@ def hsigmoid(input, inputs = { "X": input, "W": weights, - "PTable": path_table, + "PathTable": path_table, "PathCode": path_code, "Label": label } @@ -4970,9 +4973,13 @@ def hsigmoid(input, type="hierarchical_sigmoid", inputs=inputs, outputs={"Out": out, - "PreOut": pre_out}, - attrs={"num_classes": num_classes, - "is_sparse": is_sparse}) + "PreOut": pre_out, + "W_Out": weights}, + attrs={ + "num_classes": num_classes, + "is_sparse": is_sparse, + "remote_prefetch": remote_prefetch + }) return out @@ -7440,7 +7447,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): Examples: - .. code-block:: python + .. 
code-block:: python x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0) diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 2a6c93f75fa..8ed5074dc26 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -185,7 +185,7 @@ class TestHSigmoidOpSparse(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -287,7 +287,7 @@ class TestHSigmoidOpWithCostumTree(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -324,7 +324,7 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, } diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py new file mode 100644 index 00000000000..9ed6c94bd20 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py @@ -0,0 +1,271 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
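+
+# Overview of the test added below: run_pserver() starts a listen_and_serv
+# server whose 5x8 'table' parameter has row i initialized to the constant
+# i + pserver_id * 10 + 1. The test then runs the hierarchical_sigmoid op
+# with remote_prefetch=True, so the rows named in PathTable are prefetched
+# from one or two servers (split by height_sections) and reconstructed into
+# W_Out, which is checked against those per-server row values.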
+ +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
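+                # i.e. poll until the /tmp/paddle.%d.port file appears;
+                # listen_and_serv writes it once the server can accept RPCs.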
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_hsigmoid_op_one_pserver(self, place, port): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port)] + table_names = ['table'] + height_sections = [2] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i != 3: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), 0).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def _run_hsigmoid_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() 
+                label_array = np.array([0, 1, 4, 5])
+                label.set(label_array, place)
+
+                bias = scope.var('Bias').get_tensor()
+                bias_array = np.random.random((5, 1)).astype("float32")
+                bias.set(bias_array, place)
+
+                out = scope.var('Out').get_tensor()
+
+                pre_out = scope.var('PreOut').get_tensor()
+
+                w_out = scope.var('W_Out').get_tensor()
+                w_out.set(param_array, place)
+
+                emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)]
+                table_names = ['table', 'table']
+                height_sections = [2, 3]
+
+                # create and run the hierarchical_sigmoid operator
+                hsigmoid_op = Operator(
+                    "hierarchical_sigmoid",
+                    X='X',
+                    W='W',
+                    PathTable='PathTable',
+                    PathCode='PathCode',
+                    Label='Label',
+                    Bias='Bias',
+                    Out='Out',
+                    PreOut='PreOut',
+                    W_Out='W_Out',
+                    remote_prefetch=True,
+                    epmap=emaps,
+                    table_names=table_names,
+                    height_sections=height_sections)
+                hsigmoid_op.run(scope, place)
+
+                # get and compare result
+                result_array = np.array(w_out)
+                self.assertEqual(list(result_array.shape), [5, 8])
+                correct = None
+                for i in range(5):
+                    if i < 2:
+                        correct = np.full((1, 8), i + 1).astype("float32")
+                        self.assertTrue((result_array[i] == correct).all())
+                    else:
+                        correct = np.full((1, 8), i + 9).astype("float32")
+                        self.assertTrue((result_array[i] == correct).all())
+
+    def test_hsigmoid_op_remote(self):
+        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
+        # run pserver on CPU in sync mode
+        p0 = self._start_pserver(0, False, True, run_pserver)
+        self._wait_ps_ready(p0.pid)
+        port0 = self._get_pserver_port(p0.pid)
+
+        p1 = self._start_pserver(1, False, True, run_pserver)
+        self._wait_ps_ready(p1.pid)
+        port1 = self._get_pserver_port(p1.pid)
+
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            self._run_hsigmoid_op_one_pserver(place, port0)
+            self._run_hsigmoid_op_two_pserver(place, port0, port1)
+
+        # send SIGINT to the pservers
+        os.kill(p0.pid, signal.SIGINT)
+        p0.join()
+        os.kill(p1.pid, signal.SIGINT)
+        p1.join()
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab
From 5d3ecbfdf503965cc66eda6f8c75849ae0546c1e Mon Sep 17 00:00:00 2001
From: heqiaozhi
Date: Wed, 12 Dec 2018 19:48:30 +0800
Subject: [PATCH 0164/2367] fix hdfs bug
---
 python/paddle/fluid/contrib/utils/hdfs_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py
index 251665d85e1..ff1a2d3e4ad 100644
--- a/python/paddle/fluid/contrib/utils/hdfs_utils.py
+++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py
@@ -52,9 +52,10 @@ class HDFSClient(object):
         ret_code = 0
         ret_out = None
         ret_err = None
+        whole_commands = " ".join(whole_commands)
         for x in range(retry_times + 1):
             proc = subprocess.Popen(
-                whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
             (output, errors) = proc.communicate()
             ret_code, ret_out, ret_err = proc.returncode, output, errors
             if ret_code:
-- 
GitLab
From 106e28523641ef6bdffe301b2a63b6d0f13de29a Mon Sep 17 00:00:00 2001
From: Yancey1989
Date: Wed, 12 Dec 2018 19:57:49 +0800
Subject: [PATCH 0165/2367] add unittest for parallel graph mode test=develop
---
 .../details/multi_devices_graph_pass.cc | 8 +-
 .../details/parallel_ssa_graph_executor.cc | 20 +--
 paddle/fluid/framework/parallel_executor.cc | 2 +-
 paddle/fluid/operators/reader/ctr_reader.h | 2 +-
 .../unittests/parallel_executor_test_base.py | 164 +++++++++---------
 .../unittests/test_parallel_executor_crf.py | 3 +
.../unittests/test_parallel_executor_mnist.py | 38 ++-- .../test_parallel_executor_seresnext.py | 49 ++++-- .../test_parallel_executor_transformer.py | 6 +- 9 files changed, 164 insertions(+), 128 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index c16e3006d76..e264906b57f 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -300,7 +300,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - // int num_trainers = Get(kNumTrainers); + int num_trainers = Get(kNumTrainers); for (auto &node : nodes) { if (node->IsVar() && node->Var()) { @@ -387,7 +387,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } // if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { - if (!is_forwarding && nccl_ctxs_->contexts_.size() > 1) { + // insert synchronous ops at the backpropagation; and + // insert synchronous ops if the graph contains mutilple places. + if (!is_forwarding && + (places_.size() > 1 || num_trainers > 1 || + (nccl_ctxs_ && nccl_ctxs_->contexts_.size() > 1))) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. if (static_cast(boost::get(node->Op()->GetAttr( diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index f1a07edf088..214c2f76255 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -49,18 +49,18 @@ FeedFetchList ParallelSSAGraphExecutor::Run( for (size_t i = 0; i < places_.size(); ++i) { auto call = [this, i, &fetch_tensors]() -> FeedFetchList { - return executors_[i]->Run(fetch_tensors); + try { + return executors_[i]->Run(fetch_tensors); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + return FeedFetchList(); }; if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - try { - fetch_datas.emplace_back(std::move(call())); - } catch (...) { - exception_holder_.Catch(std::current_exception()); - break; - } + call(); } } @@ -69,11 +69,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (exception_holder_.IsCaught()) { f.wait(); } else { - try { - fetch_datas.emplace_back(std::move(f.get())); - } catch (...) { - exception_holder_.Catch(std::current_exception()); - } + fetch_datas.emplace_back(std::move(f.get())); } } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b0cd1e8e908..8d35361eb65 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -87,7 +87,7 @@ ParallelExecutor::ParallelExecutor( "the number of places must be greater than 1."); PADDLE_ENFORCE(exec_strategy.type_ != ExecutionStrategy::kParallelGraph, "You should set build_strategy.reduce with 'AllReduce' for " - "ParallelGraph executor type"); + "the ParallelGraph executor type"); } // Step 1. Bcast the params to devs. 
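A quick usage sketch distilled from this commit (the parallel_executor.cc check above and the parallel_executor_test_base.py diff below); `loss` and `feed_dict` stand in for a network and feed data assumed to be built elsewhere in the current program:

    import paddle.fluid as fluid

    ExecutorType = fluid.ExecutionStrategy().ExecutorType

    exec_strategy = fluid.ExecutionStrategy()
    # Default, Experimental or ParallelGraph, as exercised by the tests below
    exec_strategy.executor_type = ExecutorType.ParallelGraph

    build_strategy = fluid.BuildStrategy()
    # the ParallelGraph executor type requires the AllReduce reduce strategy
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

    exe = fluid.ParallelExecutor(
        use_cuda=True,
        loss_name=loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)
    first_loss, = exe.run(fetch_list=[loss.name], feed=feed_dict)
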
diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae12..517d6697443 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -48,7 +48,7 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: explicit CTRReader(const std::shared_ptr& queue, - int batch_size, int thread_num, + int batch_size, size_t thread_num, const std::vector& slots, const std::vector& file_list) : batch_size_(batch_size), slots_(slots), file_list_(file_list) { diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 86f861674c2..73b8fb74fa3 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -26,23 +26,26 @@ import sys __all__ = ['TestParallelExecutorBase'] +ExecutorType = fluid.ExecutionStrategy().ExecutorType + class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence(self, - method, - use_cuda=True, - memory_opt=True, - iter=50, - batch_size=None, - allow_op_delay=False, - feed_dict=None, - seed=None, - use_parallel_executor=True, - use_reduce=False, - fuse_elewise_add_act_ops=False, - optimizer=fluid.optimizer.Adam, - use_fast_executor=False, - enable_sequential_execution=False): + def check_network_convergence( + self, + method, + use_cuda=True, + memory_opt=True, + iter=50, + batch_size=None, + allow_op_delay=False, + feed_dict=None, + seed=None, + use_parallel_executor=True, + use_reduce=False, + fuse_elewise_add_act_ops=False, + optimizer=fluid.optimizer.Adam, + exec_type=fluid.ExecutionStrategy().ExecutorType.Default, + enable_sequential_execution=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -58,68 +61,69 @@ class TestParallelExecutorBase(unittest.TestCase): startup = fluid.Program() startup.random_seed = 1 # Fix random seed main.random_seed = 1 - with fluid.program_guard(main, startup): - if seed is not None: - startup.random_seed = seed - main.random_seed = seed - - loss = method(use_feed=feed_dict is not None) - - optimizer().minimize(loss) - - if memory_opt: - fluid.memory_optimize(main) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - startup_exe = fluid.Executor(place) - startup_exe.run(startup) - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.allow_op_delay = allow_op_delay - if use_fast_executor: - exec_strategy.use_experimental_executor = True - - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ - if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce - build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops - build_strategy.enable_sequential_execution = enable_sequential_execution - if use_cuda and core.is_compiled_with_cuda(): - build_strategy.remove_unnecessary_lock = True - - if use_parallel_executor: - exe = fluid.ParallelExecutor( - use_cuda, - loss_name=loss.name, - exec_strategy=exec_strategy, - build_strategy=build_strategy) - else: - exe = fluid.Executor(place=place) - - if batch_size is not None: - batch_size *= fluid.core.get_cuda_device_count( - ) if use_cuda else int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - begin = time.time() - first_loss, = run_executor( 
- exe=exe, feed=feed_dict, fetch_list=[loss.name]) - - for i in range(iter): - run_executor(exe=exe, feed=feed_dict, fetch_list=[]) - - last_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) - end = time.time() - - if batch_size is not None: - print("%.4f Instance per second" % ( - (batch_size * iter + 2) / (end - begin))) - - avg_last_loss_val = np.array(last_loss).mean() - avg_first_loss_val = np.array(first_loss).mean() - if math.isnan(float(avg_last_loss_val)) or math.isnan( - float(avg_first_loss_val)): - sys.exit("got NaN loss, training failed.") - - print(first_loss, last_loss) - # self.assertGreater(first_loss[0], last_loss[0]) - return first_loss, last_loss + scope = fluid.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(main, startup): + if seed is not None: + startup.random_seed = seed + main.random_seed = seed + + loss = method(use_feed=feed_dict is not None) + + optimizer().minimize(loss) + + if memory_opt: + fluid.memory_optimize(main) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + startup_exe = fluid.Executor(place) + startup_exe.run(startup) + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.allow_op_delay = allow_op_delay + exec_strategy.executor_type = exec_type + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.enable_sequential_execution = enable_sequential_execution + if use_cuda and core.is_compiled_with_cuda(): + build_strategy.remove_unnecessary_lock = True + + if use_parallel_executor: + exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + else: + exe = fluid.Executor(place=place) + + if batch_size is not None: + batch_size *= fluid.core.get_cuda_device_count( + ) if use_cuda else int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + begin = time.time() + first_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + + for i in range(iter): + run_executor(exe=exe, feed=feed_dict, fetch_list=[]) + + last_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + end = time.time() + + if batch_size is not None: + print("%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin))) + + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") + + print(first_loss, last_loss) + # self.assertGreater(first_loss[0], last_loss[0]) + return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 84b0aad8acb..d75761153c0 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -181,6 +181,9 @@ class TestCRFModel(unittest.TestCase): if core.is_compiled_with_cuda(): self.check_network_convergence( is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( is_sparse=True, build_strategy=build_strategy, use_cuda=False) diff --git 
a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 3eecc467015..3dddff0d99d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -20,7 +20,7 @@ import numpy as np import paddle.fluid.core as core import os import paddle.fluid as fluid -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType def simple_fc_net(use_feed): @@ -99,7 +99,10 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) # simple_fc - def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + def check_simple_fc_convergence(self, + use_cuda, + use_reduce=False, + exec_type=ExecutorType.Default): if use_cuda and not core.is_compiled_with_cuda(): return @@ -110,19 +113,21 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_reduce=use_reduce) + use_reduce=use_reduce, + exec_type=exec_type) def test_simple_fc(self): # use_cuda - self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(True, ExecutorType.Default) + self.check_simple_fc_convergence(True, ExecutorType.ParallelGraph) self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): - # use_cuda, use_reduce + # use_cuda, use_reducea self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) - def check_simple_fc_parallel_accuracy(self, use_cuda): + def check_simple_fc_parallel_accuracy(self, use_cuda, exec_type): if use_cuda and not core.is_compiled_with_cuda(): return @@ -134,14 +139,16 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_parallel_executor=False) + use_parallel_executor=False, + exec_type=exec_type) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_parallel_executor=True) + use_parallel_executor=True, + exec_type=exec_type) self.assertAlmostEquals( np.mean(parallel_first_loss), @@ -151,10 +158,12 @@ class TestMNIST(TestParallelExecutorBase): np.mean(parallel_last_loss), single_last_loss, delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(True) - self.check_simple_fc_parallel_accuracy(False) + self.check_simple_fc_parallel_accuracy(True, ExecutorType.Default) + self.check_simple_fc_parallel_accuracy(True, ExecutorType.ParallelGraph) + # FIXME(Yancey1989): ParallelGraph executor type support CPU mode + self.check_simple_fc_parallel_accuracy(False, ExecutorType.Default) - def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor): + def check_batchnorm_fc_convergence(self, use_cuda, exec_type): if use_cuda and not core.is_compiled_with_cuda(): return @@ -165,12 +174,13 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_fast_executor=use_fast_executor) + exec_type=exec_type) def test_batchnorm_fc(self): for use_cuda in (False, True): - for use_fast_executor in (False, True): - self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) + for exec_type in (ExecutorType.Default, ExecutorType.Experimental, + ExecutorType.ParallelGraph): + 
self.check_batchnorm_fc_convergence(use_cuda, exec_type) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporally. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index e7a56bb6386..bada38894f7 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -19,7 +19,7 @@ import paddle.fluid.layers.ops as ops from paddle.fluid.initializer import init_on_cpu from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter import paddle.fluid.core as core -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType import unittest import math import os @@ -167,13 +167,17 @@ def cosine_decay(learning_rate, step_each_epoch, epochs=120): return decayed_lr -def optimizer(learning_rate=0.01): - optimizer = fluid.optimizer.Momentum( - learning_rate=cosine_decay( - learning_rate=learning_rate, step_each_epoch=2, epochs=1), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - return optimizer +def optimizer(learning_rate=0.01, lr_scale=1.0): + def _opt(): + return fluid.optimizer.Momentum( + learning_rate=cosine_decay( + learning_rate=learning_rate / lr_scale, + step_each_epoch=2, + epochs=1), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + return _opt class TestResnet(TestParallelExecutorBase): @@ -216,7 +220,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=False, - optimizer=optimizer) + optimizer=optimizer()) reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -225,7 +229,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=True, - optimizer=optimizer) + optimizer=optimizer()) for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) @@ -243,7 +247,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=False, - optimizer=optimizer, + optimizer=optimizer(), enable_sequential_execution=True) reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence( @@ -254,7 +258,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=True, - optimizer=optimizer, + optimizer=optimizer(), enable_sequential_execution=True) for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq): @@ -277,7 +281,9 @@ class TestResnet(TestParallelExecutorBase): use_cuda=True, use_reduce=False, iter=20, - delta2=1e-6): + delta2=1e-6, + exec_type=ExecutorType.Default, + lr_scale=1.0): if use_cuda and not core.is_compiled_with_cuda(): return @@ -295,8 +301,9 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=optimizer, - use_parallel_executor=False) + optimizer=optimizer(), + use_parallel_executor=False, + exec_type=exec_type) parallel_first_loss, parallel_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -305,7 +312,8 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=optimizer) + optimizer=optimizer(lr_scale=lr_scale), + exec_type=exec_type) self.assertAlmostEquals( 
np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) @@ -313,7 +321,14 @@ class TestResnet(TestParallelExecutorBase): np.mean(parallel_last_loss), single_last_loss[0], delta=delta2) def test_seresnext_with_learning_rate_decay(self): - self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True) + if core.is_compiled_with_cuda(): + self._check_resnet_convergence( + model=SE_ResNeXt50Small, use_cuda=True) + self._check_resnet_convergence( + model=SE_ResNeXt50Small, + use_cuda=True, + exec_type=ExecutorType.ParallelGraph, + lr_scale=core.get_cuda_device_count()) self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 3827743908c..b5ee72a24e6 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -17,7 +17,7 @@ from __future__ import print_function import paddle.fluid as fluid import transformer_model import numpy as np -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType import unittest import paddle import paddle.fluid.core as core @@ -173,6 +173,10 @@ class TestTransformer(TestParallelExecutorBase): def test_main(self): if core.is_compiled_with_cuda(): self.check_network_convergence(transformer, use_cuda=True) + self.check_network_convergence( + transformer, + use_cuda=True, + exec_type=ExecutorType.ParallelGraph) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) -- GitLab From 417d031f90162737ab40978773a325829b72c1a3 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 09:53:50 +0000 Subject: [PATCH 0166/2367] add refer vadd, vaddrelu, vsub and tests and benchmark --- paddle/fluid/operators/jit/README.md | 6 +- paddle/fluid/operators/jit/benchmark.cc | 66 +++++++++--------- paddle/fluid/operators/jit/helper.cc | 43 ++++++++++++ paddle/fluid/operators/jit/helper.h | 2 + paddle/fluid/operators/jit/kernel_base.h | 4 +- paddle/fluid/operators/jit/more/mkl/mkl.h | 2 +- paddle/fluid/operators/jit/refer/refer.cc | 12 +++- paddle/fluid/operators/jit/refer/refer.h | 47 +++++++++++-- paddle/fluid/operators/jit/test.cc | 67 +++++++++++++------ .../fluid/operators/math/jit_kernel_refer.h | 30 --------- 10 files changed, 186 insertions(+), 93 deletions(-) create mode 100644 paddle/fluid/operators/jit/helper.cc diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index 12158bf9d03..c2e32cc49b2 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -41,6 +41,6 @@ PaddlePaddle/Paddle/paddle/fluid/ - 性能测试 # 如何添加新的算子 -TBD -## Use me -Add USE_JIT_KERNEL(yourname) to CMakefile. 
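(In English: the heading retained above, "如何添加新的算子", reads "How to add a new operator", and the bullets added below say: add `your_key` to the `KernelType` enum, then implement the Reference logic -- a Reference implementation is mandatory for every jitkernel and must not depend on any third-party library -- and register it in `refer/CMakeLists.txt` with `USE_JITKERNEL_REFER(your_key)`.)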
+ +- 在`KernelType` 中添加 `your_key` +- 实现Reference 的逻辑,每个jitkernel的Reference 实现是必须的。不要依赖任何第三方库。并在`refer/CmakeLists.txt`中`USE_JITKERNEL_REFER(your_key)` diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 5cc82b69f8b..27a1ba7ba32 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -52,9 +52,10 @@ std::vector TestSizes() { } // return this function avg time -template -double BenchTartgetFunc(const Func tgt, const std::vector& x, - const std::vector& y, std::vector& z) { // NOLINT +template +double BenchTartgetFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, const std::vector& y, + std::vector& z) { // NOLINT const T* x_data = x.data(); const T* y_data = y.data(); const int d = z.size(); @@ -71,40 +72,25 @@ double BenchTartgetFunc(const Func tgt, const std::vector& x, return (end - start) / FLAGS_repeat; } -// Benchmark all jit kernels including jitcode, mkl and refer. -// To use this tool, run command: ./benchmark [options...] -// Options: -// --burning: the burning time before count -// --repeat: the repeat times -// --max_size: the max size would be tested -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - google::InitGoogleLogging(argv[0]); - using T = float; - using PlaceType = paddle::platform::CPUPlace; +template +void BenchXYZNKernel() { namespace jit = paddle::operators::jit; - const auto KT = jit::vmul; - LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat - << " times."; for (int d : TestSizes()) { - // for (kernels type) { // TODO(TJ): more jit::KernelType std::vector> infos; std::vector x(d), y(d), z(d); RandomVec(d, x.data()); RandomVec(d, y.data()); // refer - auto refer = jit::GetRefer>(); + auto refer = jit::GetRefer>(); if (refer) { - auto res = - BenchTartgetFunc::func_type>(refer, x, y, z); + auto res = BenchTartgetFunc>(refer, x, y, z); infos.push_back(std::make_pair("Refer", res)); } // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); + auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { - auto res = - BenchTartgetFunc::func_type>(jitcode, x, y, z); + auto res = BenchTartgetFunc>(jitcode, x, y, z); infos.push_back(std::make_pair("JitCode", res)); } @@ -115,32 +101,50 @@ int main(int argc, char* argv[]) { if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast>*>( + auto i = dynamic_cast>*>( impl.get()); if (i && i->UseMe(d)) { auto more = i->GetFunc(); - auto res = - BenchTartgetFunc::func_type>(more, x, y, z); + auto res = BenchTartgetFunc>(more, x, y, z); infos.push_back(std::make_pair("More", res)); } } } // Test result from Get function - auto tgt = jit::Get, PlaceType>(d); + auto tgt = jit::Get, PlaceType>(d); if (!tgt) { LOG(ERROR) << "Target can not be empty!"; } - auto res = BenchTartgetFunc::func_type>(tgt, x, y, z); + auto res = BenchTartgetFunc>(tgt, x, y, z); infos.push_back(std::make_pair("Target", res)); // print std::ostringstream loginfos; - loginfos << "Kernel Type: " << KT << ", size " << d << ": "; + loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; for (auto pair : infos) { loginfos << pair.first << " takes " << pair.second << " us; "; } LOG(INFO) << loginfos.str(); - // } } } + +// Benchmark all jit kernels including jitcode, mkl and refer. +// To use this tool, run command: ./benchmark [options...] 
+// Options: +// --burning: the burning time before count +// --repeat: the repeat times +// --max_size: the max size would be tested +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + google::InitGoogleLogging(argv[0]); + LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat + << " times."; + using T = float; + using PlaceType = paddle::platform::CPUPlace; + namespace jit = paddle::operators::jit; + BenchXYZNKernel(); + BenchXYZNKernel(); + BenchXYZNKernel(); + BenchXYZNKernel(); +} diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc new file mode 100644 index 00000000000..2260f0aed42 --- /dev/null +++ b/paddle/fluid/operators/jit/helper.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/helper.h" + +namespace paddle { +namespace operators { +namespace jit { + +const char* to_string(KernelType kt) { + switch (kt) { + case vmul: + return "vmul"; + case vadd: + return "vadd"; + case vaddrelu: + return "vaddrelu"; + case vsub: + return "vsub"; + case vscal: + return "vscal"; + case vexp: + return "vexp"; + default: + return "NOT JITKernel"; + } + return nullptr; +} + +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index d1bbe103814..124587b1430 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -112,6 +112,8 @@ typename KernelTuples::func_type Get(typename KernelTuples::attr_type attr) { return GetRefer(); } +const char* to_string(KernelType kt); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 84f03088985..b2e9d639776 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace jit { -typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; +typedef enum { vmul = 0, vadd = 1, vaddrelu, vsub, vscal, vexp } KernelType; template -struct VMulTuples { +struct XYZNTuples { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 56469b054de..4173d1f3de0 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -28,7 +28,7 @@ template void VMul(const T* x, const T* y, T* z, int n); template -class VMulKernel : public KernelImpl> { +class VMulKernel : public KernelImpl> { public: VMulKernel() { this->func = VMul; } bool UseMe(int d) const override { diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index a987b5fca09..69d039422f3 100644 --- 
a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -17,5 +17,13 @@ namespace refer = paddle::operators::jit::refer; -REGISTER_JITKERNEL_REFER(vmul, refer::VMulKernel, - refer::VMulKernel); +#define REGISTER_REFER_KERNEL(key, func) \ + REGISTER_JITKERNEL_REFER(key, refer::func##Kernel, \ + refer::func##Kernel) + +REGISTER_REFER_KERNEL(vmul, VMul); +REGISTER_REFER_KERNEL(vadd, VAdd); +REGISTER_REFER_KERNEL(vaddrelu, VAddRelu); +REGISTER_REFER_KERNEL(vsub, VSub); + +#undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 99d1cbd43ec..4d4d308cbd1 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -13,6 +13,7 @@ * limitations under the License. */ #pragma once +#include "paddle/fluid/operators/jit/helper.h" #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/enforce.h" @@ -21,6 +22,7 @@ namespace operators { namespace jit { namespace refer { +// Refer code only focus on correctness template void VMul(const T* x, const T* y, T* z, int n) { for (int i = 0; i < n; ++i) { @@ -29,10 +31,47 @@ void VMul(const T* x, const T* y, T* z, int n) { } template -class VMulKernel : public ReferKernel> { - public: - VMulKernel() { this->func = VMul; } -}; +void VAdd(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +template +void VAddRelu(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + z[i] = z[i] > 0 ? z[i] : 0; + } +} + +template +void VSub(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] - y[i]; + } +} + +template +void VScal(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] * x[i]; + } +} + +#define DECLARE_REFER_KERNEL(name, tuples) \ + template \ + class name##Kernel : public ReferKernel> { \ + public: \ + name##Kernel() { this->func = name; } \ + } + +DECLARE_REFER_KERNEL(VMul, XYZNTuples); +DECLARE_REFER_KERNEL(VAdd, XYZNTuples); +DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples); +DECLARE_REFER_KERNEL(VSub, XYZNTuples); + +#undef DECLARE_REFER_KERNEL } // namespace refer } // namespace jit diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 4d7970414ff..dba7e754eae 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -48,18 +48,20 @@ void ExpectEQ(const T* target, const T* refer, int n) { std::vector TestSizes() { std::vector s; - for (int i = 1; i < 30; ++i) { + for (int i = 1; i < 10; ++i) { s.push_back(i); } - // test some large size - s.push_back(100); - s.push_back(1000); + // // test some large size + // s.push_back(100); + // s.push_back(1000); + // s.push_back(2000); return s; } -template -void TestTartgetFunc(const Func tgt, const std::vector& x, - const std::vector& y, const std::vector& zref) { +template +void TestTartgetFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, const std::vector& y, + const std::vector& zref) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(zref.size(), x.size()); EXPECT_EQ(zref.size(), y.size()); @@ -83,13 +85,13 @@ void TestTartgetFunc(const Func tgt, const std::vector& x, ExpectEQ(ztgt_data, zref_data, d); } -TEST(JitKernel, vmul) { - using T = float; - using PlaceType = paddle::platform::CPUPlace; +template +void TestXYZNKernel() { namespace jit = paddle::operators::jit; - const auto KT = jit::vmul; for (int 
d : TestSizes()) { - auto ref = jit::GetRefer>(); + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT) + << ", size: " << d; + auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); @@ -114,10 +116,10 @@ TEST(JitKernel, vmul) { ExpectEQ(yinp_data, zref_data, d); // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); + auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { - VLOG(10) << "Test jitcode, size: " << d; - TestTartgetFunc::func_type>(jitcode, x, y, zref); + VLOG(10) << "Test Jitcode Kernel, size: " << d; + TestTartgetFunc>(jitcode, x, y, zref); } // test all impls in more @@ -127,20 +129,45 @@ TEST(JitKernel, vmul) { if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast>*>( + auto i = dynamic_cast>*>( impl.get()); if (i && i->UseMe(d)) { auto more = i->GetFunc(); VLOG(10) << "Test More Kernel, size: " << d; - TestTartgetFunc::func_type>(more, x, y, zref); + TestTartgetFunc>(more, x, y, zref); } } } // Test result from Get function VLOG(10) << "Test Get function, size: " << d; - auto tgt = jit::Get, PlaceType>(d); - TestTartgetFunc::func_type>(tgt, x, y, zref); + auto tgt = jit::Get, PlaceType>(d); + TestTartgetFunc>(tgt, x, y, zref); } } -TEST(JitKernel, pool) {} +TEST(JITKernel, vmul) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + // TODO(TJ): fix double issue + // TestXYZNKernel(); +} + +TEST(JITKernel, vadd) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, vaddrelu) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, vsub) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, pool) {} diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index e0b2e3c7fad..eaca02ba147 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -23,36 +23,6 @@ namespace operators { namespace math { namespace jitkernel { namespace refer { -/* Refer code only focus on correctness */ - -template -void VMul(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template -void VAdd(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - -template -void VAddRelu(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? 
z[i] : 0; - } -} - -template -void VScal(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] * x[i]; - } -} template void VAddBias(const T* a, const T* x, T* y, int n) { -- GitLab From f95ee9c09fa8459516df47bf3e72f54ab19afa66 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Wed, 12 Dec 2018 20:32:15 +0800 Subject: [PATCH 0167/2367] fix nccl dist test acc (#14867) * fix nccl dist test acc test=develop * fix test=develop --- .../fluid/tests/unittests/dist_mnist.py | 2 +- .../fluid/tests/unittests/test_dist_base.py | 21 ++++++++++++------- .../fluid/tests/unittests/test_dist_mnist.py | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 1cda2711f76..1c45a10a9dd 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -93,7 +93,7 @@ class TestDistMnist2x2(TestDistRunnerBase): # TODO(typhoonzero): fix distributed adam optimizer # opt = fluid.optimizer.AdamOptimizer( # learning_rate=0.001, beta1=0.9, beta2=0.999) - opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) + opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) # Reader train_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 26fa20291b5..cedb3383ed4 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -32,7 +32,7 @@ DEFAULT_BATCH_SIZE = 2 class TestDistRunnerBase(object): - def get_model(self, batch_size=DEFAULT_BATCH_SIZE): + def get_model(self, batch_size=DEFAULT_BATCH_SIZE, lr=0.1): raise NotImplementedError( "get_model should be implemented by child classes.") @@ -56,6 +56,7 @@ class TestDistRunnerBase(object): return t def run_pserver(self, args): + self.lr = args.lr self.get_model(batch_size=args.batch_size) # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, @@ -71,6 +72,7 @@ class TestDistRunnerBase(object): exe.run(pserver_prog) def run_trainer(self, args): + self.lr = args.lr test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=args.batch_size) @@ -189,6 +191,7 @@ def runtime_main(test_class): parser.add_argument( '--use_reader_alloc', action='store_true', required=False) parser.add_argument('--batch_size', required=False, type=int, default=2) + parser.add_argument('--lr', required=False, type=float, default=0.001) parser.add_argument( '--batch_merge_repeat', required=False, type=int, default=1) @@ -234,6 +237,7 @@ class TestDistBase(unittest.TestCase): self._dc_asgd = False # must use with async mode self._use_reader_alloc = True self._nccl2_mode = False + self._lr = 0.001 self._setup_config() self._after_setup_config() @@ -284,7 +288,8 @@ class TestDistBase(unittest.TestCase): batch_size=DEFAULT_BATCH_SIZE, batch_merge_repeat=1): - cmd = "%s %s --role trainer" % (self._python_interp, model) + cmd = "%s %s --role trainer --lr %f" % (self._python_interp, model, + self._lr) if batch_size != DEFAULT_BATCH_SIZE: cmd += " --batch_size %d" % batch_size if batch_merge_repeat > 1: @@ -330,13 +335,13 @@ class TestDistBase(unittest.TestCase): ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver" + tr_cmd = "%s %s --role 
trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, ps0_ep, self._trainers) + 0, ps0_ep, self._trainers, self._lr) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, ps1_ep, self._trainers) + 1, ps1_ep, self._trainers, self._lr) if self._sync_mode: tr0_cmd += " --sync_mode" @@ -425,13 +430,13 @@ class TestDistBase(unittest.TestCase): worker_endpoints = self._ps_endpoints.split(",") w0_ep, w1_ep = worker_endpoints - tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2" + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, w0_ep) + 0, w0_ep, self._lr / 2) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, w1_ep) + 1, w1_ep, self._lr / 2) if self._mem_opt: tr0_cmd += " --mem_opt" diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 630bed198f4..49a2ca40e3c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -36,7 +36,7 @@ class TestDistMnistNCCL2(TestDistBase): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnist2x2Lars(TestDistBase): -- GitLab From a37038880e03bc0a985b547bae78f1d0a5d7b048 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 12:34:08 +0000 Subject: [PATCH 0168/2367] fix unit test with double type --- paddle/fluid/operators/jit/helper.h | 44 ++++++++++++++++++----------- paddle/fluid/operators/jit/test.cc | 13 ++++----- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 124587b1430..053e5ed0798 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -33,8 +33,11 @@ namespace jit { #define EXP_MAX_INPUT 40.0 template -inline typename KernelTuples::func_type GetJitCode( - typename KernelTuples::attr_type attr) { +inline typename std::enable_if< + std::is_same::value && + std::is_same::value, + typename KernelTuples::func_type>::type +GetJitCode(typename KernelTuples::attr_type attr) { using Func = typename KernelTuples::func_type; using Attr = typename KernelTuples::attr_type; size_t key = JitCodeKey(attr); @@ -45,21 +48,19 @@ inline typename KernelTuples::func_type GetJitCode( // creator is not related with attr, so can use KernelKey as key KernelKey kkey(KT, PlaceType()); - if (std::is_same::value) { - // pool: (KernelKey(type, place), vector) - auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); - auto iter = creator_map.find(kkey); - if (iter != creator_map.end()) { - auto& creators = iter->second; - for (auto& cur : creators) { - auto i = dynamic_cast*>(cur.get()); - if (i && i->UseMe(attr)) { - auto p = i->CreateJitCode(attr); - if (p) { - auto f = p->template getCode(); - codes.Insert(key, std::move(p)); - return f; - } + // pool: (KernelKey(type, place), vector) + auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); + auto iter = creator_map.find(kkey); + if (iter != creator_map.end()) { + auto& creators = iter->second; + 
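// NOTE(editor): this loop is the JIT dispatch step. Every creator registered
// under the KernelKey is probed in order; the first one whose UseMe(attr)
// accepts the attribute generates code, which is cached under the JitCodeKey
// before its function pointer is returned.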
for (auto& cur : creators) { + auto i = dynamic_cast*>(cur.get()); + if (i && i->UseMe(attr)) { + auto p = i->CreateJitCode(attr); + if (p) { + auto f = p->template getCode(); + codes.Insert(key, std::move(p)); + return f; } } } @@ -67,6 +68,15 @@ inline typename KernelTuples::func_type GetJitCode( return nullptr; } +template +inline typename std::enable_if< + !std::is_same::value || + !std::is_same::value, + typename KernelTuples::func_type>::type +GetJitCode(typename KernelTuples::attr_type attr) { + return nullptr; +} + // Refer code do not related with attr, which is just for cast // Refer is always on CPUPlace template diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index dba7e754eae..9ceca24079f 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -48,13 +48,13 @@ void ExpectEQ(const T* target, const T* refer, int n) { std::vector TestSizes() { std::vector s; - for (int i = 1; i < 10; ++i) { + for (int i = 1; i < 32; ++i) { s.push_back(i); } - // // test some large size - // s.push_back(100); - // s.push_back(1000); - // s.push_back(2000); + // test some large size + s.push_back(100); + s.push_back(1000); + s.push_back(2000); return s; } @@ -148,8 +148,7 @@ void TestXYZNKernel() { TEST(JITKernel, vmul) { namespace jit = paddle::operators::jit; TestXYZNKernel(); - // TODO(TJ): fix double issue - // TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, vadd) { -- GitLab From 8b9d33fa1e7c3d592d9f3976634eaf87155f1a49 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 12:32:26 +0000 Subject: [PATCH 0169/2367] add unittest and fix bug add API.spec test=develop --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/py_func_op.cc | 123 ++++++--- python/paddle/fluid/layers/nn.py | 242 +++++++++++------- .../fluid/tests/unittests/test_py_func_op.py | 145 +++++++++++ 4 files changed, 374 insertions(+), 137 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_py_func_op.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 2722ea078eb..b3f7593be31 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -197,6 +197,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) +paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 
46a6125f974..32c44c3bc22 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -26,26 +26,35 @@ namespace py = pybind11; static std::vector g_py_callables; +const char kForwardPythonCallableId[] = "forward_callable_id"; +const char kBackwardPythonCallableId[] = "backward_callable_id"; +const char kPyFuncBackwardSkipVars[] = "backward_skip_vars"; + size_t AppendPythonCallableObjectAndReturnId(py::object py_obj) { g_py_callables.emplace_back(py_obj); return g_py_callables.size() - 1; } static py::object *GetPythonCallableObject(size_t i) { - PADDLE_ENFORCE_LT(i, g_py_callables.size()); + PADDLE_ENFORCE_LT(i, g_py_callables.size(), "Invalid python callable id"); return &g_py_callables[i]; } -void CallPythonFunc(py::object *callable, const std::string &func_token, +std::string PythonObjectToString(const py::object &py_callable) { + py::gil_scoped_acquire guard; + return py::str(*py_callable); +} + +void CallPythonFunc(py::object *callable, const std::vector &ins, std::vector *out) { - py::gil_scoped_acquire guard{}; + py::gil_scoped_acquire guard; py::tuple in_args(ins.size()); for (size_t i = 0; i < ins.size(); ++i) { in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); } - auto ret = (*callable)(func_token, *in_args); + auto ret = (*callable)(*in_args); auto ret_tuple = py::cast(ret); PADDLE_ENFORCE_EQ(py::len(ret_tuple), out->size(), "Output number not match"); for (size_t i = 0; i < out->size(); ++i) { @@ -55,7 +64,7 @@ void CallPythonFunc(py::object *callable, const std::string &func_token, try { auto *out_tensor = py::cast(ret_tuple[i]); PADDLE_ENFORCE_NOT_NULL(out_tensor, - "Output tensor should not be nullptr"); + "Output tensor %d should not be nullptr", i); (*out)[i]->set_lod(out_tensor->lod()); (*out)[i]->ShareDataWith(*out_tensor); } catch (py::cast_error &) { @@ -69,26 +78,23 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { void operator()(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(!ctx->IsRuntime(), "Infer shape cannot be called in runtime."); - PADDLE_ENFORCE(ctx->HasInputs("X"), "Input(X) must exist"); - PADDLE_ENFORCE(ctx->HasOutputs("Out"), "Output(Out) must exist"); + PADDLE_ENFORCE(ctx->HasInputs("X") || ctx->HasOutputs("Out"), + "Input(X) or Output(Out) must exist"); + PADDLE_ENFORCE_GE(ctx->Attrs().Get(kForwardPythonCallableId), 0, + "Function id cannot be less than 0"); auto *op = boost::get(ctx->GetOp()); auto *block = op->Block(); - // No need to infer shape in forward part - if (block->ForwardBlockID() < 0) { - return; - } - - PADDLE_ENFORCE(!ctx->Attrs().Get("token").empty(), - "Function token cannot be empty"); - const std::string kGradVarSuffix = framework::kGradVarSuffix; auto out_vars = ctx->GetOutputVarPtrs("Out"); for (auto &out_var : out_vars) { auto *out_var_desc = boost::get(out_var); + if (out_var_desc == nullptr) { + continue; + } auto out_name = out_var_desc->Name(); if (out_name == framework::kEmptyVarName || - out_name.size() < kGradVarSuffix.size()) { + out_name.size() <= kGradVarSuffix.size()) { continue; } @@ -98,6 +104,8 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { auto *in_var_desc = block->FindVarRecursive(fwd_var_name); PADDLE_ENFORCE_NOT_NULL(in_var_desc, "Forward variable %s not found", fwd_var_name); + VLOG(10) << "Infer shape of Out(" << out_name << ") as Input(" + << in_var_desc->Name() << ")"; out_var_desc->SetShape(in_var_desc->GetShape()); out_var_desc->SetDataType(in_var_desc->GetDataType()); 
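// NOTE(editor): py_func cannot look inside the Python callable, so each
// X@GRAD output simply inherits shape, dtype and LoD level from the forward
// variable recovered by stripping the @GRAD suffix above.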
out_var_desc->SetLoDLevel(in_var_desc->GetLoDLevel()); @@ -112,13 +120,15 @@ class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Inputs of py_func op.").AsDuplicable(); AddOutput("Out", "Outputs of py_func op").AsDuplicable(); - AddAttr("handle_idx", "Index of the registered py_func handle") + AddAttr(kForwardPythonCallableId, + "Index of registered forward Python function.") .SetDefault(0); - AddAttr("token", "Token of function token to be called") - .SetDefault(""); - AddAttr("backward_token", - "Token of backward function to be called") - .SetDefault(""); + AddAttr(kBackwardPythonCallableId, + "Index of registered backward Python function") + .SetDefault(-1); + AddAttr>(kPyFuncBackwardSkipVars, + "Unused forward in/out in backward op") + .SetDefault(std::vector()); AddComment(R"DOC("PyFunc Op")DOC"); } }; @@ -129,7 +139,8 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { std::vector> operator()() const override { auto &fwd_attrs = Attrs(); - if (fwd_attrs.at("backward_token").empty()) { + // no backward op when backward_id is less than 0 + if (boost::get(fwd_attrs.at(kBackwardPythonCallableId)) < 0) { return {}; } @@ -137,36 +148,65 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { grad_op->SetType("py_func"); framework::AttributeMap bwd_attrs; - bwd_attrs["token"] = fwd_attrs.at("backward_token"); - bwd_attrs["backward_token"] = std::string(""); + bwd_attrs[kForwardPythonCallableId] = + fwd_attrs.at(kBackwardPythonCallableId); + bwd_attrs[kBackwardPythonCallableId] = -1; grad_op->SetAttrMap(bwd_attrs); - auto bwd_in = Input("X"); - auto fwd_out = Output("Out"); - auto fwd_out_grad = OutputGrad("Out"); - bwd_in.insert(bwd_in.end(), fwd_out.begin(), fwd_out.end()); - bwd_in.insert(bwd_in.end(), fwd_out_grad.begin(), fwd_out_grad.end()); + // All forward inputs + auto fwd_ins = Input("X"); + // All forward outputs + auto fwd_outs = Output("Out"); + + // For memory reused, some inputs/output in forward part may be not needed + // in backward part + // Just skip these vars + auto &backward_skip_var_list = boost::get>( + fwd_attrs.at(kPyFuncBackwardSkipVars)); + std::unordered_set backward_skip_var_set( + backward_skip_var_list.begin(), backward_skip_var_list.end()); + std::vector bwd_ins; + bwd_ins.reserve(fwd_ins.size() + fwd_outs.size()); + for (auto &fwd_in : fwd_ins) { + if (backward_skip_var_set.count(fwd_in) == 0) { + bwd_ins.emplace_back(fwd_in); + } + } + + for (auto &fwd_out : fwd_outs) { + if (backward_skip_var_set.count(fwd_out) == 0) { + bwd_ins.emplace_back(fwd_out); + } + } + + // Backward OG cannot be skipped + // But in Python side, if OG is kEmptyVarName, input tensor would be None + auto fwd_out_grads = OutputGrad("Out"); + bwd_ins.reserve(bwd_ins.size() + fwd_out_grads.size()); + bwd_ins.insert(bwd_ins.end(), fwd_out_grads.begin(), fwd_out_grads.end()); - auto bwd_out = InputGrad("X", false); + // Backward IG cannot be skipped + // But in Python side, if IG is not needed, users can just return None + auto bwd_outs = InputGrad("X", false); if (VLOG_IS_ON(10)) { std::string in_str = "PyFunc Grad Input: "; - for (auto &in : bwd_in) { + for (auto &in : bwd_ins) { in_str += in; in_str += " "; } VLOG(10) << in_str; std::string out_str = "PyFunc Grad Output: "; - for (auto &out : bwd_out) { + for (auto &out : bwd_outs) { out_str += out; - out += " "; + out_str += " "; } VLOG(10) << out_str; } - grad_op->SetInput("X", bwd_in); - grad_op->SetOutput("Out", InputGrad("X", false)); + 
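// NOTE(editor): after the filtering above, the backward op receives the
// unskipped X, the unskipped Out, and every Out@GRAD as its inputs, and it
// writes X@GRAD through its own Out slot, which is wired up just below.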
grad_op->SetInput("X", bwd_ins); + grad_op->SetOutput("Out", bwd_outs); std::vector> ret(1); ret[0] = std::move(grad_op); @@ -210,12 +250,11 @@ class PyFuncOp : public framework::OperatorBase { outputs[i] = out_tensor; } - auto &token = Attr("token"); - auto handle_idx = static_cast(Attr("handle_idx")); - auto *py_callable = GetPythonCallableObject(handle_idx); - VLOG(10) << "Call py_func_op with token " << token << ", and handle_idx " - << handle_idx; - CallPythonFunc(py_callable, token, inputs, &outputs); + auto callable_id = static_cast(Attr(kForwardPythonCallableId)); + auto *py_callable = GetPythonCallableObject(callable_id); + VLOG(10) << "Call py_func_op with id " << callable_id << ": " + << PythonObjectToString(*py_callable); + CallPythonFunc(py_callable, inputs, &outputs); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 66c98c935d7..95f046c614e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9087,104 +9087,140 @@ def get_tensor_from_selected_rows(x, name=None): return out -@templatedoc() -def py_func(func, x, out, backward_func=None): - """ - """ - - class PyFuncRegister(object): - _main_program_to_register = dict() - - @classmethod - def get_instance(cls, prog): - if not isinstance(prog, Program): - raise TypeError("prog must be type of Program") - - ret = cls._main_program_to_register.get(prog, None) - if ret is None: - ret = PyFuncRegister() - ret._idx = core.append_python_callable_object_and_return_id(ret) - ret._token_func_dict = dict() - ret._func_token_dict = dict() - cls._main_program_to_register[prog] = ret - - return ret - - @property - def handle_idx(self): - return self._idx - - def unique_token(self, func): - return self._register_func(func) - - def _register_func(self, func): - if func is None: - raise ValueError("func cannot be None") - - token = self._func_token_dict.get(func, None) - if token is not None: - return token - - token = unique_name.generate('py_func_op_token') - self._token_func_dict[token] = func - self._func_token_dict[func] = token - return token - - def __call__(self, token, *args): - func = self._token_func_dict.get(token, None) - if func is None: - raise ValueError("func has not been registered") - - arg_list = inspect.getargspec(func) - kwargs = dict() - idx = 0 - for arg in arg_list[0]: - kwargs[arg] = args[idx] - idx += 1 - - args = args[idx:] - ret0 = func(*args, **kwargs) - if ret0 is None: - return None - - if not isinstance(ret0, (list, tuple)): - ret0 = (ret0, ) - - ret = [] - for i in six.moves.range(len(ret0)): - if ret0[i] is None: - ret.append(None) - continue - - if isinstance(ret0[i], core.LoDTensor): - ret.append(ret0[i]) - continue +class PyFuncWrapper(object): + _register_funcs = [] + + def __init__(self, func): + if func is None or not hasattr(func, '__call__'): + raise TypeError('func must be a Python function') + + self._func = func + # find named args using reflection + self._named_args = inspect.getargspec(self._func)[0] + self._id = core.append_python_callable_object_and_return_id(self) + ''' + Why record self here? + + 1. For debug usage. Users can call + :code:`py_func.registered_func(idx)` method + to find the registered function coresponding + to :code:`idx`. + + 2. For increasing reference count of self. + It seems that to release Python object + whose reference count is 1 would cause + segmentation fault error in C++ side. + May be lack of Python GC in C++ side? 
+ ''' + PyFuncWrapper._register_funcs.append(self) + + @classmethod + def registered_func(cls, idx): + return cls._register_funcs[idx]._func + + @classmethod + def registered_func_num(cls): + return len(cls._register_funcs) + + @property + def id(self): + return self._id + + def __call__(self, *args): + kwargs = dict() + idx = 0 + for arg in self._named_args: + kwargs[arg] = args[idx] + idx += 1 + + ret0 = self._func(*args[idx:], **kwargs) + if ret0 is None: + return None + + if not isinstance(ret0, (list, tuple)): + ret0 = (ret0, ) + + ret = [] + for i in six.moves.range(len(ret0)): + if ret0[i] is None: + ret.append(None) + continue + + if isinstance(ret0[i], core.LoDTensor): + ret.append(ret0[i]) + continue + + if isinstance(ret0[i], np.ndarray): + r = ret0[i] + else: + r = np.array(ret0[i]) - if isinstance(ret0[i], np.ndarray): - r = ret0[i] - else: - r = np.array(ret0[i]) + t = core.LoDTensor() + t.set(r, core.CPUPlace()) + ret.append(t) - t = core.LoDTensor() - t.set(r, core.CPUPlace()) - ret.append(t) + return tuple(ret) - return tuple(ret) +@templatedoc() +def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): + """ + PyFunc Operator. + + User can use :code:`py_func` to register operators in Python side. + The inputs of :code:`func` is :code:`LoDTensor` and outputs can be + numpy array or :code:`LoDTensor`. Paddle would call the registered + :code:`func` in forward part, and call :code:`backward_func` in + backward part (if :code:`backward_func` is not None). + + User should set the right data type and shape of :code:`out` before + calling this function. However, data types and shapes of gradients of + :code:`out` and :code:`x` would be infered automatically. + + The orders of inputs of :code:`backward_func` would be: forward input + :code:`x`, forward output :code:`out` and backward input gradient of + :code:`out`. If some variables of :code:`out` have no gradient, the input + tensor would be None in Python side. If some variables of :code:`in` have + no gradient, users should return None. + + Args: + func (callable): forward Python function. + x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`. + out (Variable|list(Variable)|tuple(Variable)): outputs of :code:`func`. + Paddle cannot infer shapes and data types of :code:`out`. Users + should create :code:`out` beforehand. + backward_func (callable|None): backward Python function. + None means no backward. Default None. + skip_vars_in_backward_input (Variable|list(Variable)|tuple(Variable)): + Variables that are not needed in :code:`backward_func` inputs. + These variables must be any of :code:`x` and :code:`out`. + If set, these vars would not be inputs of :code:`backward_func`, + Only useful when :code:`backward_func` is not None. Default None. 
+ + Returns: + out (Variable|list(Variable)|tuple(Variable)): input :code:`out` + """ helper = LayerHelper('py_func', **locals()) - if isinstance(x, Variable): + if x is None: + x = [] + elif isinstance(x, Variable): x = [x] + elif not isinstance(x, (list, tuple)): + raise TypeError('Input must be Variable/list(Variable)/tuple(Variable)') - if isinstance(out, Variable): + if out is None: + out_list = [] + elif isinstance(out, Variable): out_list = [out] - else: + elif isinstance(out, (list, tuple)): out_list = out + else: + raise TypeError( + 'Output must be Variable/list(Variable)/tuple(Variable)') - if func is None or not hasattr(func, '__call__'): - raise TypeError('Input func must be a function') - - if backward_func is not None and not hasattr(backward_func, '__call__'): - raise TypeError('Input backward_func must be a function') + fwd_func_id = PyFuncWrapper(func).id + bwd_func_id = PyFuncWrapper( + backward_func).id if backward_func is not None else -1 for each_out in out_list: if len(each_out.shape) == 0: @@ -9192,18 +9228,34 @@ def py_func(func, x, out, backward_func=None): 'Output shapes of py_func op should be provided by users manually' ) - py_func_reg = PyFuncRegister.get_instance(helper.main_program) - forward_token = py_func_reg.unique_token(func) - backward_token = py_func_reg.unique_token( - backward_func) if backward_func is not None else '' + backward_skip_vars = set() + if backward_func is not None and skip_vars_in_backward_input is not None: + if isinstance(skip_vars_in_backward_input, Variable): + skip_vars_in_backward_input = [skip_vars_in_backward_input] + + fwd_in_out = [v.name for v in x] + fwd_in_out.extend([v.name for v in out_list]) + fwd_in_out = set(fwd_in_out) + backward_skip_vars = set() + for v in skip_vars_in_backward_input: + if not v.name in fwd_in_out: + raise ValueError( + 'Variable {} is not found in forward inputs and outputs' + .format(v.name)) + backward_skip_vars.add(v.name) helper.append_op( type='py_func', inputs={'X': x}, outputs={'Out': out_list}, attrs={ - 'handle_idx': py_func_reg.handle_idx, - 'token': forward_token, - 'backward_token': backward_token + 'forward_callable_id': fwd_func_id, + 'backward_callable_id': bwd_func_id, + 'backward_skip_vars': list(backward_skip_vars) }) return out + + +# For debug usage +py_func.registered_func = PyFuncWrapper.registered_func +py_func.registered_func_num = PyFuncWrapper.registered_func_num diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py new file mode 100644 index 00000000000..0f03368b7e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -0,0 +1,145 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
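# NOTE(editor): this new test builds the same four-layer fc network twice,
# once with the native tanh and cross_entropy ops and once routed through
# py_func callbacks backed by the numpy helpers below, then checks that
# per-batch losses agree across all available places.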
+ +import paddle.fluid as fluid +import paddle +import unittest +import six +import numpy as np + + +def tanh(x): + return np.tanh(x) + + +def tanh_grad(y, dy): + return np.array(dy) * (1 - np.square(np.array(y))) + + +def cross_entropy(logits, labels): + logits = np.array(logits) + labels = np.array(labels) + M = logits.shape[0] + N = logits.shape[1] + ret = np.ndarray([M, 1]).astype(logits.dtype) + for idx in six.moves.range(M): + ret[idx][0] = -np.log(logits[idx][labels[idx][0]]) + return ret + + +def cross_entropy_grad(logits, labels, bwd_dout): + logits = np.array(logits) + labels = np.array(labels) + bwd_dout = np.array(bwd_dout) + M = logits.shape[0] + N = logits.shape[1] + dlogits = np.zeros([M, N]).astype(logits.dtype) + for idx in six.moves.range(M): + dlogits[idx][labels[idx][0]] = -bwd_dout[idx] / logits[idx][labels[idx][ + 0]] + return dlogits, None + + +def simple_fc_net(img, label, use_py_func_op): + hidden = img + for idx in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + if use_py_func_op: + hidden = fluid.layers.tanh(hidden) + else: + new_hidden = fluid.default_main_program().current_block( + ).create_var( + name='hidden_{}'.format(idx), + dtype='float32', + shape=hidden.shape) + hidden = fluid.layers.py_func( + func=tanh, + x=hidden, + out=new_hidden, + backward_func=tanh_grad, + skip_vars_in_backward_input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + if not use_py_func_op: + loss = fluid.layers.cross_entropy(input=prediction, label=label) + else: + loss = fluid.default_main_program().current_block().create_var( + name='loss', dtype='float32', shape=[-1, 1]) + fluid.layers.py_func( + func=cross_entropy, + x=[prediction, label], + out=loss, + backward_func=cross_entropy_grad, + skip_vars_in_backward_input=loss) + loss = fluid.layers.mean(loss) + return loss + + +def reader(): + for _ in six.moves.range(100): + yield np.random.random([784]), np.random.random_integers( + size=[1], low=0, high=9) + + +def test_main(use_cuda, use_py_func_op): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return None + + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(fluid.core.Scope()): + fluid.default_main_program().random_seed = 1 + fluid.default_startup_program().random_seed = 1 + np.random.seed(1) + + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + loss = simple_fc_net(img, label, use_py_func_op) + optimizer = fluid.optimizer.SGD(learning_rate=1e-3) + optimizer.minimize(loss) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + r = paddle.batch(reader, batch_size=10) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + ret = [] + for epoch_id in six.moves.range(2): + for d in r(): + L, = exe.run(feed=feeder.feed(d), fetch_list=[loss]) + ret.append(L[0]) + + return np.array(ret) + + +class TestPyFuncOp(unittest.TestCase): + def test_loss_diff(self): + losses = [] + for use_cuda in [True, False]: + for use_py_func_op in [True, False]: + L = test_main(use_cuda, use_py_func_op) + if L is not None: + losses.append(L) + + for idx in six.moves.range(len(losses) - 1): + max_diff = np.max(np.abs(losses[idx] - losses[0])) + self.assertAlmostEqual(max_diff, 0, delta=1e-3) + + +if __name__ == '__main__': + unittest.main() -- GitLab 
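A short usage sketch of the py_func layer introduced in the patch above, distilled from its new unit test. The my_tanh and my_tanh_grad callables, the variable names, and the [32] shape are illustrative assumptions rather than content of the patch; the keyword arguments (func, x, out, backward_func, skip_vars_in_backward_input) are exactly the API added here.

import numpy as np
import paddle.fluid as fluid

def my_tanh(x):
    # forward callable: x arrives as a LoDTensor; a numpy array may be returned
    return np.tanh(np.array(x))

def my_tanh_grad(y, dy):
    # backward callable: with x skipped below, only Out and Out@GRAD arrive
    return np.array(dy) * (1 - np.square(np.array(y)))

x = fluid.layers.data(name='x', shape=[32], dtype='float32')
# py_func cannot infer output shapes/dtypes, so out must be created beforehand
out = fluid.default_main_program().current_block().create_var(
    name='my_tanh_out', dtype='float32', shape=[-1, 32])
fluid.layers.py_func(func=my_tanh, x=x, out=out,
                     backward_func=my_tanh_grad,
                     skip_vars_in_backward_input=x)

Because x is listed in skip_vars_in_backward_input, the generated backward op feeds only out and the gradient of out to my_tanh_grad, which is why its signature is (y, dy).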
From e7c5c9d2de9f51e5d403c879a31c0297e0f40656 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 12:41:47 +0000 Subject: [PATCH 0170/2367] remove unnecesary code test=develop --- paddle/fluid/pybind/pybind.cc | 14 -------------- python/paddle/fluid/layers/nn.py | 2 +- .../fluid/tests/unittests/test_py_func_op.py | 2 +- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 38b1308330c..348a0739152 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -572,20 +572,6 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Place") .def(py::init<>()) - .def("is_cpu_place", - [](platform::Place &self) { return platform::is_cpu_place(self); }) - .def("is_gpu_place", - [](platform::Place &self) { return platform::is_gpu_place(self); }) - .def("is_cuda_pinned_place", - [](platform::Place &self) { - return platform::is_cuda_pinned_place(self); - }) - .def("gpu_device_id", - [](platform::Place &self) { - PADDLE_ENFORCE(platform::is_gpu_place(self), - "gpu_device_id() only supports in CUDAPlace"); - return boost::get(self).device; - }) .def("set_place", [](platform::Place &self, const platform::CPUPlace &cpu_place) { self = cpu_place; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 65550250014..d71368644d8 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -23,7 +23,7 @@ import os import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant -from ..framework import Variable, OpProtoHolder, Program +from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .tensor import concat diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 0f03368b7e2..c71f2bdea83 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -59,7 +59,7 @@ def simple_fc_net(img, label, use_py_func_op): size=200, bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=1.0))) - if use_py_func_op: + if not use_py_func_op: hidden = fluid.layers.tanh(hidden) else: new_hidden = fluid.default_main_program().current_block( -- GitLab From 582011ba76baab02dcc9fbaf3536d781ba59dc3b Mon Sep 17 00:00:00 2001 From: chengduo Date: Wed, 12 Dec 2018 21:08:02 +0800 Subject: [PATCH 0171/2367] Add L2 unit test (#14792) * add l2 unit test test=develop * code refine test=develop --- .../fluid/tests/unittests/test_regularizer.py | 136 +++++++++++++++++- 1 file changed, 135 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 20f91cf4485..62994eec7e7 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -15,7 +15,12 @@ from __future__ import print_function import unittest - +from functools import partial +import contextlib +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid import paddle.fluid.framework as framework import paddle.fluid.optimizer as optimizer import paddle.fluid.regularizer as regularizer @@ -97,5 +102,134 @@ class TestL1DecayRegularizer(unittest.TestCase): self.assertEqual(block.ops[-3].type, 
'sign') +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestRegularizer(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=8)() + self.train_data = [next(reader) for _ in range(5)] + + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + @contextlib.contextmanager + def scope_prog_guard(self, main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + def run_program(self, place, feed_list): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + main_prog = fluid.default_main_program() + param_list = [var.name for var in main_prog.block(0).all_parameters()] + + param_sum = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=param_list) + p_sum = 0 + for v in out: + p_sum += np.sum(np.abs(v)) + param_sum.append(p_sum) + return param_sum + + def check_l2decay_regularizer(self, place, model): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with self.scope_prog_guard( + main_prog=main_prog, startup_prog=startup_prog): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = model(data, label, len(self.word_dict)) + + optimizer = fluid.optimizer.Adagrad( + learning_rate=0.1, + regularization=fluid.regularizer.L2Decay(1.0)) + optimizer.minimize(avg_cost) + param_sum = self.run_program(place, [data, label]) + return param_sum + + def check_l2decay(self, place, model): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with self.scope_prog_guard( + main_prog=main_prog, startup_prog=startup_prog): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost_l2 = model(data, label, len(self.word_dict)) + + param_list = fluid.default_main_program().block(0).all_parameters() + para_sum = [] + for para in param_list: + para_mul = fluid.layers.square(x=para) + para_sum.append(fluid.layers.reduce_sum(input=para_mul)) + avg_cost_l2 += fluid.layers.sums(para_sum) * .5 + + optimizer = fluid.optimizer.Adagrad(learning_rate=0.1) + optimizer.minimize(avg_cost_l2) + param_sum = self.run_program(place, [data, label]) + return param_sum + + def test_l2(self): + for place in self.get_places(): + 
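# NOTE(editor): the loop body below encodes the identity under test: applying
# fluid.regularizer.L2Decay(1.0) through the optimizer must match adding
# 0.5 * sum(square(w)) to the loss by hand, for both the sparse and the dense
# embedding variants of bow_net.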
dense_sparse_p_sum = [] + for sparse in [True, False]: + model = partial(bow_net, is_sparse=sparse) + framework_l2 = self.check_l2decay_regularizer(place, model) + l2 = self.check_l2decay(place, model) + assert len(l2) == len(framework_l2) + for i in range(len(l2)): + assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5) + dense_sparse_p_sum.append(framework_l2) + + assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1]) + for i in range(len(dense_sparse_p_sum[0])): + assert np.isclose( + a=dense_sparse_p_sum[0][i], + b=dense_sparse_p_sum[1][i], + rtol=5e-5) + + if __name__ == '__main__': unittest.main() -- GitLab From 7bd16e3afad479a559fba9321581def2c5d90165 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 12 Dec 2018 21:47:51 +0800 Subject: [PATCH 0172/2367] fix some bug & add log --- paddle/fluid/framework/async_executor.cc | 2 +- .../fluid/framework/executor_thread_worker.cc | 28 +++++++++++++------ .../fluid/framework/executor_thread_worker.h | 2 +- python/paddle/fluid/async_executor.py | 3 +- .../paddle/fluid/contrib/utils/hdfs_utils.py | 5 +++- 5 files changed, 27 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index f0ca375f950..6efe5cafe72 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -111,7 +111,7 @@ void AsyncExecutor::InitParamConfig() { std::vector tmp_sparse_variable_name; for (int i = 0u; i < table.slot_value_size(); ++i) { tmp_sparse_variable_name.push_back(table.slot_value(i)); - _param_config.slot_alias_to_table[table.slot_value(i)] = table.table_id(); + _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id(); } std::vector tmp_sparse_gradient_variable_name; for (auto i = 0u; i < table.slot_gradient_size(); ++i) { diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index a0455b26efd..7004ecf23b0 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -330,6 +330,7 @@ void AsyncExecutorThreadWorker::TrainFiles() { print_fetch_var(thread_scope_, fetch_var_names_[i]); } // end for (int i = 0...) 
} // end while () + LOG(ERROR) << "TRAIN DONE"; } void AsyncExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { @@ -571,25 +572,30 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { void AsyncExecutorThreadWorker::PushSparse(int table_id) { auto slot_dim = _param_config->slot_dim; //TODO auto fea_dim = _param_config->fea_dim;//_current_train_job.fea_dim();TODO - auto& features = _features[table_id]; + auto& features = _features[table_id]; + CHECK(features.size() < 1000000) << "features size:" << features.size(); //std::vector gradient_var; //auto& gradient_var = GlobalConfig::instance().input_gradient_variable_name; //TODO - auto& push_g = _feature_push_value[table_id]; + auto& push_g = _feature_push_value[table_id]; check_pull_push_memory(features, push_g, fea_dim); + CHECK(push_g.size() == features.size() + 1) << "push_g size:" << push_g.size() << " features size:" << features.size(); uint64_t fea_idx = 0u; - auto& fea_info = _fea_info[table_id]; //TODO + auto& fea_info = _fea_info[table_id]; int offset = 0; //if (!_current_train_job.use_cvm_feature()) { //TODO offset = 2; //} - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) { + LOG(ERROR) << "ERROR slot_idx:" << slot_idx << " name:" << feed_vec[slot_idx]; + } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + LOG(ERROR) << "ERROR continue"; continue; } - Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); + Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); + CHECK(g_var != nullptr) << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; LoDTensor* g_tensor = g_var->GetMutable(); if (g_tensor == NULL) { LOG(ERROR) << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; @@ -598,13 +604,16 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { float* g = g_tensor->data(); Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; LoDTensor* tensor = var->GetMutable(); if (tensor == NULL) { LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; exit(-1); } - int len = tensor->lod()[0].back(); - assert(slot_dim * len == g_tensor->numel()); + //int len = tensor->lod()[0].back(); + int len = tensor->numel(); + CHECK(slot_dim * len == g_tensor->numel()) << "len:" << len << " g_numel:" << g_tensor->numel(); + CHECK(len == tensor->numel()) << "len:" << len << "t_numel:" << tensor->numel(); int64_t* ids = tensor->data(); for (auto id_idx = 0u; id_idx < len; ++id_idx){ if (ids[id_idx] == 0) { @@ -613,12 +622,13 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { } memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); push_g[fea_idx][0] = 1.0f; + CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx << " size:" << fea_info.size(); push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); g += slot_dim; fea_idx++; } } - assert(fea_idx == features.size()); + CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx << " features size:" << features.size(); CHECK(features.size() > 0); std::vector push_g_vec; diff --git 
a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index b3ee9dfaec9..0c9a47690be 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -49,7 +49,7 @@ struct AsyncWorkerParamConfig { std::vector sparse_table_id; std::map> slot_input_vec; //6048slot 6050slot //name std::map> gradient_var; //6048slot_embed - std::unordered_map slot_alias_to_table; //TODO done + std::map slot_alias_to_table; //TODO done }; struct DensePullThreadParam { diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index af42d2912fd..13d876e57be 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -153,7 +153,7 @@ class AsyncExecutor(object): data_feed.desc(), filelist, thread_num, fetch_var_names, mode, debug) - def download_data(self, afs_path, local_path, fs_default_name, ugi, hadoop_home="$HADOOP_HOME", process_num=12): + def download_data(self, afs_path, local_path, fs_default_name, ugi, file_cnt, hadoop_home="$HADOOP_HOME", process_num=12): if self.instance is None: raise ValueError('instance is None, please run config_distributed_nodes init instance') @@ -169,6 +169,7 @@ class AsyncExecutor(object): local_path, self.instance.get_worker_index(), self.instance.get_node_cnt() / 2, + file_cnt, multi_processes=process_num) #self.instance.barrier_all() #wait for download_data #TODO only barriere worker self.instance.barrier_worker() #wait for download_data #TODO only barriere worker diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index ff1a2d3e4ad..42b4d7feab6 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -427,6 +427,7 @@ def multi_download(client, local_path, trainer_id, trainers, + file_cnt, multi_processes=5): """ multi_download @@ -435,6 +436,7 @@ def multi_download(client, :param local_path: path on local :param trainer_id: current trainer id :param trainers: all trainers number + :param file_cnt: all file number :param multi_processes: the download data process at the same time, default=5 :return: None """ @@ -450,7 +452,7 @@ def multi_download(client, client.make_local_dirs(local_path) _logger.info("Make local dir {} successfully".format(local_path)) - all_need_download = client.lsr(hdfs_path, sort=True) + all_need_download = client.lsr(hdfs_path, sort=True)[:file_cnt] need_download = all_need_download[trainer_id::trainers] _logger.info("Get {} files From all {} files need to be download from {}". 
format(len(need_download), len(all_need_download), hdfs_path)) @@ -501,6 +503,7 @@ if __name__ == "__main__": "/home/xx/data1", 1, 5, + 100, multi_processes=5) multi_upload(client, "/user/com/train-25/model", "/home/xx/data1") -- GitLab From e9216e82f915f0fdd96cc539040c7c75a2d71113 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 13:55:27 +0000 Subject: [PATCH 0173/2367] add refer vscal, vaddbias and test and benchmark --- paddle/fluid/operators/jit/README.md | 8 +- paddle/fluid/operators/jit/benchmark.cc | 89 +++++++++++++-- paddle/fluid/operators/jit/helper.cc | 4 + paddle/fluid/operators/jit/kernel_base.h | 13 ++- .../fluid/operators/jit/refer/CMakeLists.txt | 5 + paddle/fluid/operators/jit/refer/refer.cc | 3 + paddle/fluid/operators/jit/refer/refer.h | 12 ++ paddle/fluid/operators/jit/test.cc | 103 ++++++++++++++++-- .../fluid/operators/math/jit_kernel_refer.h | 7 -- 9 files changed, 216 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index c2e32cc49b2..2d72aa4d569 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -37,10 +37,12 @@ PaddlePaddle/Paddle/paddle/fluid/ ## Tests - Correctness tests - Every implementation must be compared against the refer code and meet the accuracy requirements + Every implementation must be compared against the refer code and meet the accuracy requirements, covering both the float and double data types - Performance tests + Compare the performance of all implementations, and also compare with the final `jit::Get` method; the performance obtained from that method must be the best. # How to add a new operator -- Add `your_key` to `KernelType` -- Implement the Reference logic; a Reference implementation is mandatory for every jitkernel. Do not depend on any third-party library, and use `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` +- Add `your_key` to `KernelType` . +- Implement the Reference logic; a Reference implementation is mandatory for every jitkernel. Do not depend on any third-party library, and use `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt`. +- When necessary, a new `KernelTuples` can be added; see `XYZNTuples` for reference. diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 27a1ba7ba32..2ad87e414bd 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -53,9 +53,9 @@ std::vector TestSizes() { // return this function avg time template -double BenchTartgetFunc(const typename KernelTuples::func_type tgt, - const std::vector& x, const std::vector& y, - std::vector& z) { // NOLINT +double BenchXYZNFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, const std::vector& y, + std::vector& z) { // NOLINT const T* x_data = x.data(); const T* y_data = y.data(); const int d = z.size(); @@ -83,14 +83,14 @@ void BenchXYZNKernel() { // refer auto refer = jit::GetRefer>(); if (refer) { - auto res = BenchTartgetFunc>(refer, x, y, z); + auto res = BenchXYZNFunc>(refer, x, y, z); infos.push_back(std::make_pair("Refer", res)); } // test jitcode auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { - auto res = BenchTartgetFunc>(jitcode, x, y, z); + auto res = BenchXYZNFunc>(jitcode, x, y, z); infos.push_back(std::make_pair("JitCode", res)); } @@ -105,7 +105,7 @@ void BenchXYZNKernel() { impl.get()); if (i && i->UseMe(d)) { auto more = i->GetFunc(); - auto res = BenchTartgetFunc>(more, x, y, z); + auto res = BenchXYZNFunc>(more, x, y, z); infos.push_back(std::make_pair("More", res)); } } @@ -116,7 +116,7 @@ void BenchXYZNKernel() { if (!tgt) { LOG(ERROR) << "Target can not be empty!"; } - auto res = BenchTartgetFunc>(tgt, x, y, z); + auto res = BenchXYZNFunc>(tgt, x, y, z); infos.push_back(std::make_pair("Target", res)); // print @@ -129,6 +129,78 @@ void BenchXYZNKernel() { } } +// return this function avg time +template +double BenchAXYNFunc(const typename KernelTuples::func_type tgt, const T a, + const
std::vector& x, + std::vector& y) { // NOLINT + const T* x_data = x.data(); + T* y_data = y.data(); + const int d = y.size(); + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(&a, x_data, y_data, d); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(&a, x_data, y_data, d); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; +} + +template +void BenchAXYNKernel() { + namespace jit = paddle::operators::jit; + for (int d : TestSizes()) { + std::vector> infos; + const T a = static_cast(3); + std::vector x(d), y(d); + RandomVec(d, x.data()); + // test refer + auto refer = jit::GetRefer>(); + if (refer) { + auto res = BenchAXYNFunc>(refer, a, x, y); + infos.push_back(std::make_pair("Refer", res)); + } + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(d); + if (jitcode) { + auto res = BenchAXYNFunc>(jitcode, a, x, y); + infos.push_back(std::make_pair("JitCode", res)); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast>*>( + impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + auto res = BenchAXYNFunc>(more, a, x, y); + infos.push_back(std::make_pair("More", res)); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(d); + if (!tgt) { + LOG(ERROR) << "Target can not be empty!"; + } + auto res = BenchAXYNFunc>(tgt, a, x, y); + infos.push_back(std::make_pair("Target", res)); + // print + std::ostringstream loginfos; + loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); + } +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: @@ -147,4 +219,7 @@ int main(int argc, char* argv[]) { BenchXYZNKernel(); BenchXYZNKernel(); BenchXYZNKernel(); + + BenchAXYNKernel(); + BenchAXYNKernel(); } diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 2260f0aed42..c9aaffb8b8d 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -13,6 +13,7 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/helper.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { @@ -32,7 +33,10 @@ const char* to_string(KernelType kt) { return "vscal"; case vexp: return "vexp"; + case vaddbias: + return "vaddbias"; default: + PADDLE_THROW("Not support type: %d", kt); return "NOT JITKernel"; } return nullptr; diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index b2e9d639776..74ecf3dade5 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -19,7 +19,15 @@ namespace paddle { namespace operators { namespace jit { -typedef enum { vmul = 0, vadd = 1, vaddrelu, vsub, vscal, vexp } KernelType; +typedef enum { + vmul = 0, + vadd = 1, + vaddrelu, + vsub, + vscal, + vaddbias, + vexp +} KernelType; template struct XYZNTuples { @@ -28,6 +36,9 @@ struct XYZNTuples { typedef void (*func_type)(const T*, const T*, T*, int); }; +template +struct AXYNTuples : public XYZNTuples {}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index b6ff80d03df..afe3f6ca0f4 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -8,3 +8,8 @@ endfunction() # use refer kernel by name USE_JITKERNEL_REFER(vmul) +USE_JITKERNEL_REFER(vadd) +USE_JITKERNEL_REFER(vaddrelu) +USE_JITKERNEL_REFER(vsub) +USE_JITKERNEL_REFER(vscal) +USE_JITKERNEL_REFER(vaddbias) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 69d039422f3..4e9c530344b 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -26,4 +26,7 @@ REGISTER_REFER_KERNEL(vadd, VAdd); REGISTER_REFER_KERNEL(vaddrelu, VAddRelu); REGISTER_REFER_KERNEL(vsub, VSub); +REGISTER_REFER_KERNEL(vscal, VScal); +REGISTER_REFER_KERNEL(vaddbias, VAddBias); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 4d4d308cbd1..32ac5bf2d78 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -59,6 +59,13 @@ void VScal(const T* a, const T* x, T* y, int n) { } } +template +void VAddBias(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] + x[i]; + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -66,11 +73,16 @@ void VScal(const T* a, const T* x, T* y, int n) { name##Kernel() { this->func = name; } \ } +// const T* x, const T* y, T* z, int n DECLARE_REFER_KERNEL(VMul, XYZNTuples); DECLARE_REFER_KERNEL(VAdd, XYZNTuples); DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples); DECLARE_REFER_KERNEL(VSub, XYZNTuples); +// const T* a, const T* x, T* y, int n +DECLARE_REFER_KERNEL(VScal, AXYNTuples); +DECLARE_REFER_KERNEL(VAddBias, AXYNTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 9ceca24079f..ea2cb7b7a42 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -12,7 +12,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include // for memcpy #include #include #include @@ -59,9 +58,9 @@ std::vector TestSizes() { } template -void TestTartgetFunc(const typename KernelTuples::func_type tgt, - const std::vector& x, const std::vector& y, - const std::vector& zref) { +void TestXYZNFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, const std::vector& y, + const std::vector& zref) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(zref.size(), x.size()); EXPECT_EQ(zref.size(), y.size()); @@ -88,9 +87,8 @@ void TestTartgetFunc(const typename KernelTuples::func_type tgt, template void TestXYZNKernel() { namespace jit = paddle::operators::jit; + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT) - << ", size: " << d; auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); @@ -119,7 +117,7 @@ void TestXYZNKernel() { auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { VLOG(10) << "Test Jitcode Kernel, size: " << d; - TestTartgetFunc>(jitcode, x, y, zref); + TestXYZNFunc>(jitcode, x, y, zref); } // test all impls in more @@ -134,14 +132,14 @@ void TestXYZNKernel() { if (i && i->UseMe(d)) { auto more = i->GetFunc(); VLOG(10) << "Test More Kernel, size: " << d; - TestTartgetFunc>(more, x, y, zref); + TestXYZNFunc>(more, x, y, zref); } } } // Test result from Get function VLOG(10) << "Test Get function, size: " << d; auto tgt = jit::Get, PlaceType>(d); - TestTartgetFunc>(tgt, x, y, zref); + TestXYZNFunc>(tgt, x, y, zref); } } @@ -169,4 +167,89 @@ TEST(JITKernel, vsub) { TestXYZNKernel(); } -TEST(JITKernel, pool) {} +template +void TestAXYNFunc(const typename KernelTuples::func_type tgt, const T a, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(&a, x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(&a, ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); +} + +template +void TestAXYNKernel() { + namespace jit = paddle::operators::jit; + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + for (int d : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + + const T a = static_cast(3); + std::vector x(d), yref(d); + std::vector xinp(d); // inplace test + RandomVec(d, x.data()); + std::copy(x.begin(), x.end(), xinp.begin()); + + const T* x_data = x.data(); + T* yref_data = yref.data(); + T* xinp_data = xinp.data(); + // test refer code inplace + ref(&a, x_data, yref_data, d); + ref(&a, xinp_data, xinp_data, d); + ExpectEQ(xinp_data, yref_data, d); + + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(d); + if (jitcode) { + VLOG(10) << "Test Jitcode Kernel, size: " << d; + TestAXYNFunc>(jitcode, a, x, yref); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast>*>( + impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel, size: " << d; + TestAXYNFunc>(more, a, x, yref); + } + } + } + // Test result from Get function + VLOG(10) << "Test Get function, size: " << d; + auto tgt = 
jit::Get, PlaceType>(d); + TestAXYNFunc>(tgt, a, x, yref); + } +} + +TEST(JITKernel, vscal) { + namespace jit = paddle::operators::jit; + TestAXYNKernel(); + TestAXYNKernel(); +} + +TEST(JITKernel, vaddbias) { + namespace jit = paddle::operators::jit; + TestAXYNKernel(); + TestAXYNKernel(); +} + +TEST(JITKernel, pool) { + // TODO(TJ): add some test +} diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index eaca02ba147..b5ee07e7488 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -24,13 +24,6 @@ namespace math { namespace jitkernel { namespace refer { -template -void VAddBias(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] + x[i]; - } -} - template void VRelu(const T* x, T* y, int n) { for (int i = 0; i < n; ++i) { -- GitLab From 06930531887547286f3c4ad096d1fd0794749867 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Tue, 11 Dec 2018 14:43:52 +0800 Subject: [PATCH 0174/2367] add liscence --- python/paddle/fluid/async_executor.py | 3 ++- python/paddle/fluid/distributed/downpour.py | 13 ++++++++++ python/paddle/fluid/distributed/helper.py | 14 +++++++++++ python/paddle/fluid/distributed/node.py | 13 ++++++++++ .../paddle/fluid/distributed/ps_instance.py | 24 ++++++++++--------- 5 files changed, 55 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 13d876e57be..099805ac1bd 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -76,7 +76,7 @@ class AsyncExecutor(object): Note: Only running on CPUPlace supported. """ - def __init__(self, place=None): + def __init__(self, place=None, run_mode=""): if place is None: place = core.CPUPlace() if not isinstance(place, core.CPUPlace): @@ -89,6 +89,7 @@ class AsyncExecutor(object): self.executor = core.AsyncExecutor(scope, p) self.instance = None + def run(self, program, data_feed, filelist, thread_num, fetch, mode="", debug=False): """ Run program by this AsyncExecutor. Training dataset will be in filelist. diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index c1762dd7688..9ef9e14ccc5 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + from .node import DownpourServer from .node import DownpourWorker from ..backward import append_backward diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 1244b4c0cad..986525e5d85 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from mpi4py import MPI import ps_pb2 as pslib diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 1f4aeeac738..87553230060 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + import ps_pb2 as pslib class Server(object): diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index dce5dfc5bd6..b93da053a36 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -1,8 +1,18 @@ -#import paddle.fluid.distributed.helper as dist_helper +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and + import helper as dist_helper import sys -#from mpi4py import MPI - class PaddlePSInstance(object): def __init__(self, server_worker_mode, proc_per_node): @@ -83,17 +93,11 @@ class PaddlePSInstance(object): return self._nodes def barrier_all(self): - #print self._rankid, "begin" - #sys.stdout.flush() self.dh.comm.barrier() - #print self._rankid, "end" def barrier_worker(self): if self.is_worker(): - #print "worker: ", self._rankid, "begin" - #sys.stdout.flush() self._comm.barrier() - #print "worker: ", self._rankid, "end" pass def finalize(self): @@ -104,5 +108,3 @@ class PaddlePSInstance(object): if __name__ == "__main__": instance = PaddlePSInstance(1, 1, 2, 50) instance.barrier_all() - #print "-----" - #instance.barrier_worker() -- GitLab From 33ee5cad61383db6bc06681f9f1afa76492a5759 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 10:33:06 +0800 Subject: [PATCH 0175/2367] format code style of executor_thread_worker.cc --- .../fluid/framework/executor_thread_worker.cc | 77 +++++++++---------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 7004ecf23b0..86ac93be3e4 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -417,48 +417,46 @@ void AsyncExecutorThreadWorker::PrepareParams() { } void AsyncExecutorThreadWorker::UpdateParams() { - for (auto i: _param_config->sparse_table_id) {//TODO - //for (int i = 0; i < 1; ++i) { - PushSparse(i); - } - //for (auto i = 0u; i < GlobalConfig::instance().dense_table_id.size(); ++i) {//TODO - for (auto i: _param_config->dense_table_id) { - PushDense(i); - } - int32_t tmp_push_dense_wait_times = -1;//_param_config->tmp_push_dense_wait_times; //TODO - int32_t tmp_push_sparse_wait_times = -1;//_param_config->tmp_push_sparse_wait_times; //TODO - static uint32_t push_dense_wait_times = static_cast(tmp_push_dense_wait_times); - static uint32_t push_sparse_wait_times = static_cast(tmp_push_sparse_wait_times); - - if (_push_dense_status.size() >= push_dense_wait_times) { - for (auto& t : _push_dense_status) { - t.wait(); - } - _push_dense_status.resize(0); - } - if (tmp_push_dense_wait_times == -1) { - _push_dense_status.resize(0); - } - if (_push_sparse_status.size() >= push_sparse_wait_times) { - for (auto& t : _push_sparse_status) { - t.wait(); - } - _push_sparse_status.resize(0); - } - if (tmp_push_sparse_wait_times == -1) { - _push_sparse_status.resize(0); - } - //for (auto dense_table_id : GlobalConfig::instance().dense_table_id) {//TODO - for (auto dense_table_id: _param_config->dense_table_id) { - _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + for (auto i : _param_config->sparse_table_id) { + PushSparse(i); + } + for (auto i : _param_config->dense_table_id) { + PushDense(i); + } + // _param_config->tmp_push_dense_wait_times + int32_t tmp_push_dense_wait_times = -1; + // _param_config->tmp_push_sparse_wait_times + int32_t tmp_push_sparse_wait_times = -1; + static uint32_t push_dense_wait_times = + static_cast(tmp_push_dense_wait_times); + static uint32_t push_sparse_wait_times = + static_cast(tmp_push_sparse_wait_times); + + if (_push_dense_status.size() >= push_dense_wait_times) { + for (auto& t : _push_dense_status) { + t.wait(); + } + _push_dense_status.resize(0); + } + if (tmp_push_dense_wait_times == -1) { + _push_dense_status.resize(0); + } + if 
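For readers following the ps_instance cleanup above: barrier_all and barrier_worker are thin wrappers over MPI barriers, on the world communicator and on a worker-only communicator respectively. A hedged C++ equivalent, assuming an MPI installation and an illustrative rank-to-role mapping (the real mapping is derived from server_worker_mode and proc_per_node, not the even/odd rule used here):

    #include <mpi.h>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);

      // barrier_all: every node, worker or server, waits here.
      MPI_Barrier(MPI_COMM_WORLD);

      // barrier_worker: split off a worker-only communicator, then
      // barrier inside it, mirroring the is_worker() gating above.
      int is_worker = (rank % 2 == 0);  // illustrative role assignment
      MPI_Comm worker_comm;
      MPI_Comm_split(MPI_COMM_WORLD, is_worker ? 0 : MPI_UNDEFINED,
                     rank, &worker_comm);
      if (is_worker) {
        MPI_Barrier(worker_comm);
        MPI_Comm_free(&worker_comm);
      }
      MPI_Finalize();
      return 0;
    }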
(_push_sparse_status.size() >= push_sparse_wait_times) { + for (auto& t : _push_sparse_status) { + t.wait(); } - //} + _push_sparse_status.resize(0); + } + if (tmp_push_sparse_wait_times == -1) { + _push_sparse_status.resize(0); + } + for (auto dense_table_id : _param_config->dense_table_id) { + _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + } } void AsyncExecutorThreadWorker::PushDense(int table_id) { std::vector regions; - //auto& variables = GlobalConfig::instance().dense_gradient_variable_name[table_id]; - //std::vector variables; for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { Variable* var = thread_scope_->FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; @@ -469,7 +467,8 @@ void AsyncExecutorThreadWorker::PushDense(int table_id) { regions.emplace_back(std::move(reg)); } - auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(), regions.size(), table_id); + auto status = _pslib_ptr->_worker_ptr->push_dense( + regions.data(), regions.size(), table_id); _push_dense_status.push_back(std::move(status)); } @@ -478,7 +477,7 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { auto& features = _features[table_id]; auto& feature_value = _feature_value[table_id]; - auto fea_dim = _param_config->fea_dim; //TODO + auto fea_dim = _param_config->fea_dim; // slot id starts from 1 features.clear(); features.resize(0); -- GitLab From 162637b64abd39c3ca7c75c08690169968305712 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Dec 2018 10:56:08 +0800 Subject: [PATCH 0176/2367] Fix ngraph compile test=develop --- paddle/fluid/framework/ngraph_operator.cc | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 253de4c6116..e2cdfc845fe 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -471,27 +471,23 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), "Ensure ngraph tensor layout align with paddle tensor"); - if (tensor_pd->type().hash_code() == - typeid(float).hash_code()) { // NOLINT + if (tensor_pd->type() == proto::VarType::FP32) { const float* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::f32, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == - typeid(int).hash_code()) { // NOLINT + } else if (tensor_pd->type() == proto::VarType::INT32) { const int* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::i32, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) { + } else if (tensor_pd->type() == proto::VarType::INT64) { const int64_t* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::i64, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == - typeid(double).hash_code()) { // NOLINT + } else if (tensor_pd->type() == proto::VarType::FP64) { const double* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::f64, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == - typeid(bool).hash_code()) { // NOLINT + } else if (tensor_pd->type() == proto::VarType::BOOL) { const bool* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::boolean, sp, const_cast(arr)); -- GitLab From c71279bc697a101b9afe74a1e19fc9fb99195bd9 Mon 
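The UpdateParams refactor just above batches asynchronous pushes and only drains them once enough are in flight; the -1 defaults make both paths discard their futures without waiting. A reduced sketch of that policy, assuming int-returning std::future handles in place of pslib's status futures:

    #include <future>
    #include <vector>

    // wait_times >= 0: wait on and clear the batch once it reaches the
    // threshold. wait_times < 0: clear every call without waiting, the
    // fire-and-forget behavior the -1 defaults above encode.
    void MaybeDrain(std::vector<std::future<int>>* in_flight, int wait_times) {
      if (wait_times >= 0 &&
          in_flight->size() >= static_cast<size_t>(wait_times)) {
        for (auto& f : *in_flight) f.wait();
        in_flight->clear();
      }
      if (wait_times < 0) {
        in_flight->clear();
      }
    }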
Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 11:23:16 +0800 Subject: [PATCH 0177/2367] refine code style for async_executor.h and async_executor.cc --- paddle/fluid/framework/async_executor.cc | 101 ++++++++++++++--------- paddle/fluid/framework/async_executor.h | 25 +++--- 2 files changed, 79 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 6efe5cafe72..c62d62a5dc4 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -66,15 +66,20 @@ void PrepareReaders(std::vector>& readers, // NOLINT } void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { - _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); - _pslib_ptr->init_server(dist_desc, index);//TODO done - + _pslib_ptr = + std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_server(dist_desc, index); InitParamConfig(); } -void AsyncExecutor::InitWorker(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index) { - _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); - _pslib_ptr->init_worker(dist_desc, host_sign_list.data(), node_num, index);//TODO done +void AsyncExecutor::InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, + int node_num, int index) { + _pslib_ptr = std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_worker( + dist_desc, host_sign_list.data(), node_num, index); InitParamConfig(); } @@ -87,43 +92,65 @@ void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); } -void AsyncExecutor::GatherServers(std::vector& host_sign_list, int node_num) { +void AsyncExecutor::GatherServers( + std::vector& host_sign_list, int node_num) { _pslib_ptr->gather_servers(host_sign_list.data(), node_num); } void AsyncExecutor::InitParamConfig() { - for (int i = 0; i < _pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param_size(); ++i) { - if (_pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param(i).table_class().find("SparseTable") != -1) { - _param_config.fea_dim = _pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param(i).accessor().fea_dim(); //TODO + for (int i = 0; i < + _pslib_ptr->get_param()->server_param().\ + downpour_server_param().\ + downpour_table_param_size(); + ++i) { + if (_pslib_ptr->get_param()->server_param().\ + downpour_server_param().downpour_table_param(i).\ + table_class().find("SparseTable") != -1) { + _param_config.fea_dim = _pslib_ptr->get_param()->server_param().\ + downpour_server_param().\ + downpour_table_param(i).\ + accessor().fea_dim(); break; } } - _param_config.slot_dim = _param_config.fea_dim - 2; //TODO - _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); - _param_config.tmp_push_sparse_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); - - for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); ++t) { - _param_config.skip_op.push_back(_pslib_ptr->get_param()->trainer_param().skip_op(t)); + _param_config.slot_dim = _param_config.fea_dim - 2; + _param_config.tmp_push_dense_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + _param_config.tmp_push_sparse_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); + + for (auto t = 0u; + 
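One note on the table_class().find("SparseTable") != -1 test kept in InitParamConfig below: std::string::find returns std::string::npos, an unsigned value, on failure, so comparing against -1 only works through integral conversion. The conventional spelling, as a tiny sketch:

    #include <string>

    // Idiomatic form of the substring check used to detect sparse tables.
    bool IsSparseTable(const std::string& table_class) {
      return table_class.find("SparseTable") != std::string::npos;
    }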
t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); + ++t) { + _param_config.skip_op.push_back( + _pslib_ptr->get_param()->trainer_param().skip_op(t)); } - //sparse - for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) { + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); + ++t) { auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); std::vector tmp_sparse_variable_name; for (int i = 0u; i < table.slot_value_size(); ++i) { tmp_sparse_variable_name.push_back(table.slot_value(i)); - _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id(); + _param_config.slot_alias_to_table[table.slot_key(i)] = + table.table_id(); } std::vector tmp_sparse_gradient_variable_name; for (auto i = 0u; i < table.slot_gradient_size(); ++i) { tmp_sparse_gradient_variable_name.push_back( table.slot_gradient(i)); } - _param_config.slot_input_vec[table.table_id()] = std::move(tmp_sparse_variable_name); - _param_config.gradient_var[table.table_id()] = std::move(tmp_sparse_gradient_variable_name); + _param_config.slot_input_vec[table.table_id()] = + std::move(tmp_sparse_variable_name); + _param_config.gradient_var[table.table_id()] = + std::move(tmp_sparse_gradient_variable_name); _param_config.sparse_table_id.push_back(table.table_id()); } - //dense - for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) { + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); + ++t) { auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); std::vector tmp_dense_variable_name; for (int i = 0u; i < table.dense_variable_name_size(); ++i) { @@ -134,20 +161,18 @@ void AsyncExecutor::InitParamConfig() { tmp_dense_gradient_variable_name.push_back( table.dense_gradient_variable_name(i)); } - _param_config.dense_variable_name[table.table_id()] = std::move(tmp_dense_variable_name); - _param_config.dense_gradient_variable_name[table.table_id()] = std::move(tmp_dense_gradient_variable_name); + _param_config.dense_variable_name[table.table_id()] = + std::move(tmp_dense_variable_name); + _param_config.dense_gradient_variable_name[table.table_id()] = + std::move(tmp_dense_gradient_variable_name); _param_config.dense_table_id.push_back(table.table_id()); - _param_config.dense_table_size.push_back(table.fea_dim()); //TODO + _param_config.dense_table_size.push_back(table.fea_dim()); } } void AsyncExecutor::InitModel() { - //TODO only rank = 0 do this - //std::vector all_dense_table_id; //TODO - //all_dense_table_id.push_back(0); //done - for (auto table_id: _param_config.dense_table_id) { + for (auto table_id : _param_config.dense_table_id) { std::vector regions; - //std::vector variables; //TODO for (auto& t : _param_config.dense_variable_name[table_id]) { Variable* var = root_scope_->FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; @@ -169,13 +194,15 @@ void AsyncExecutor::InitModel() { regions.emplace_back(std::move(reg)); } - auto push_status = _pslib_ptr->_worker_ptr->push_dense_param(regions.data(), regions.size(), table_id); + auto push_status = + _pslib_ptr->_worker_ptr->push_dense_param( + regions.data(), regions.size(), table_id); push_status.wait(); auto status = push_status.get(); if (status != 0) { LOG(FATAL) << "push dense param failed, status[" << status << "]"; exit(-1); - } + } } } @@ -185,7 +212,7 @@ void AsyncExecutor::SaveModel(const std::string& path) { ret = _pslib_ptr->_worker_ptr->save(path, 0); ret.wait(); int32_t 
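The InitModel loop above flattens each dense variable into a (pointer, length) region and pushes the whole batch in one RPC. A rough sketch of that packing, with Region as a stand-in for pslib's actual paddle::ps::Region type:

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Stand-in for the pslib region type: a raw view over a tensor buffer.
    struct Region {
      float* data;
      size_t size;
    };

    // Wrap each (buffer, element count) pair so one push covers the table.
    std::vector<Region> PackRegions(
        const std::vector<std::pair<float*, size_t>>& tensors) {
      std::vector<Region> regions;
      regions.reserve(tensors.size());
      for (const auto& t : tensors) {
        regions.push_back(Region{t.first, t.second});
      }
      return regions;
    }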
feasign_cnt = ret.get(); - if (feasign_cnt == -1) { // TODO should be feasign_cnt < 0, because server bug + if (feasign_cnt == -1) { // (colourful-tree) TODO should be feasign_cnt < 0 LOG(FATAL) << "save model failed"; exit(-1); } @@ -195,13 +222,13 @@ void AsyncExecutor::PrepareDenseThread(const std::string& mode) { if (mode == "mpi") { DensePullThreadParam param; param.ps_client = _pslib_ptr->_worker_ptr;; - param.threshold = 1;//GlobalConfig::instance().pull_dense_per_batch; //TODO + param.threshold = 1; param.training_thread_num = actual_thread_num; param.root_scope = root_scope_; - //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO param.dense_params = &_param_config.dense_variable_name; - _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); + _pull_dense_thread = std::shared_ptr( + new DensePullThread(param)); _pull_dense_thread->start(); } } diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 93010f8a9b0..184566dd39e 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include // NOLINT @@ -22,8 +23,7 @@ limitations under the License. */ #include // NOLINT #include #include -#include //local_random_engine -#include //local_random_engine +#include // local_random_engine #include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor_thread_worker.h" @@ -43,9 +43,10 @@ inline std::default_random_engine& local_random_engine() { struct engine_wrapper_t { std::default_random_engine engine; engine_wrapper_t() { - static std::atomic x(0); - std::seed_seq sseq = {x++, x++, x++, (unsigned long)(current_realtime() * 1000)}; - engine.seed(sseq); + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, + static_cast(current_realtime() * 1000)}; + engine.seed(sseq); } }; thread_local engine_wrapper_t r; @@ -61,18 +62,20 @@ class AsyncExecutor { const std::vector& filelist, const int thread_num, const std::vector& fetch_names, - const std::string& mode, + const std::string& mode, const bool debug = false); - //void ConfigPslib(const char* dist_desc, uint64_t* host_sign_list, int node_num, int index); void InitServer(const std::string& dist_desc, int index); - void InitWorker(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index); - //void ConfigWorker() {} + void InitWorker( + const std::string& dist_desc, + const std::vector& host_sign_list, + int node_num, int index); uint64_t StartServer(); void StopServer(); - void GatherServers(std::vector& host_sign_list, int node_num); + void GatherServers(const std::vector& host_sign_list, int node_num); void InitModel(); void SaveModel(const std::string& path); void InitParamConfig(); + private: void CreateThreads(ExecutorThreadWorker* worker, const ProgramDesc& main_program, @@ -81,6 +84,7 @@ class AsyncExecutor { Scope* root_scope, const int thread_index, const bool debug); void PrepareDenseThread(const std::string& mode); + public: std::shared_ptr _pslib_ptr; std::shared_ptr _pull_dense_thread; @@ -88,6 +92,7 @@ class AsyncExecutor { platform::Place place_; AsyncWorkerParamConfig _param_config; + private: int actual_thread_num; -- GitLab From 3c01cdeff0e11108f816b5f1abe5d71b3e8d153f Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 13 Dec 2018 11:33:21 +0800 Subject: [PATCH 0178/2367] refine executor_thread_worker.cc 
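The engine_wrapper_t tidied up in async_executor.h above gives every thread its own lazily seeded random engine. A self-contained approximation (the real helper also mixes current_realtime() into the seed sequence):

    #include <atomic>
    #include <random>

    std::default_random_engine& LocalRandomEngine() {
      struct EngineWrapper {
        std::default_random_engine engine;
        EngineWrapper() {
          // One shared counter guarantees distinct seeds across threads.
          static std::atomic<unsigned> counter{0};
          std::seed_seq seq{counter++, counter++, counter++};
          engine.seed(seq);
        }
      };
      thread_local EngineWrapper wrapper;  // one engine per thread
      return wrapper.engine;
    }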
& executor_thread_worker.h code style --- .../fluid/framework/executor_thread_worker.cc | 86 ++++--------------- .../fluid/framework/executor_thread_worker.h | 2 - 2 files changed, 15 insertions(+), 73 deletions(-) diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 86ac93be3e4..592a416d6dc 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -303,7 +303,7 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { root_scope_ = g_scope; } -//AsyncExecutor +// AsyncExecutor void AsyncExecutorThreadWorker::TrainFiles() { SetDevice(); @@ -330,7 +330,6 @@ void AsyncExecutorThreadWorker::TrainFiles() { print_fetch_var(thread_scope_, fetch_var_names_[i]); } // end for (int i = 0...) } // end while () - LOG(ERROR) << "TRAIN DONE"; } void AsyncExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { @@ -360,44 +359,12 @@ void AsyncExecutorThreadWorker::TrainOneNetwork() { UpdateParams(); } -void AsyncExecutorThreadWorker::BindingSlotVariableMemory() { - /* - std::vector ins_slot_offset(batch_size + 1, 0); - for (auto i = 1u; i <= batch_size; ++i) { - ins_slot_offset[i] += ins_slot_offset[i - 1] + slot_dim; - } - - std::vector tensor_lod(batch_size + 1, 0); - for (auto i = 1u; i <= batch_size; ++i) { - tensor_lod[i] += tensor_lod[i - 1] + 1; - } - - auto& used_slots = reader->get_use_slot_alias(); - slot_input_vec.resize(used_slots.size() - 1); - for (auto slot_idx = 1u; slot_idx < used_slots.size(); ++slot_idx) { - auto var = slot_input_variable_name[slot_idx]; - - auto v = thread_scope->FindVar(var); - CHECK(v != nullptr) << "var[" << var << "] not found"; - - LoDTensor* tensor = v->GetMutable(); - float* tensor_ptr = tensor->mutable_data({batch_size, slot_dim}, platform::CPUPlace()); - memset(tensor_ptr, 0, sizeof(float) * ins_slot_offset.back()); - - LoD data_lod{tensor_lod}; - tensor->set_lod(data_lod); - - slot_input_vec[slot_idx - 1].reset(tensor); - } - */ -} void AsyncExecutorThreadWorker::SetParamConfig(AsyncWorkerParamConfig* param_config) { _param_config = param_config; } void AsyncExecutorThreadWorker::PrepareParams() { - //int table_id = 0; //TODO for (auto table_id: _param_config->sparse_table_id) { PullSparse(table_id); for (auto& t : _pull_sparse_status) { @@ -423,9 +390,7 @@ void AsyncExecutorThreadWorker::UpdateParams() { for (auto i : _param_config->dense_table_id) { PushDense(i); } - // _param_config->tmp_push_dense_wait_times int32_t tmp_push_dense_wait_times = -1; - // _param_config->tmp_push_sparse_wait_times int32_t tmp_push_sparse_wait_times = -1; static uint32_t push_dense_wait_times = static_cast(tmp_push_dense_wait_times); @@ -509,17 +474,15 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { pull_feature_value.data(), table_id, features.data(), features.size()); _pull_sparse_status.push_back(std::move(status)); - //to save time auto& push_g = _feature_push_value[table_id]; check_pull_push_memory(features, push_g, fea_dim); - //binding_slot_embed_with_concat(); TODO - collect_feasign_info(table_id); //TODO + collect_feasign_info(table_id); } void AsyncExecutorThreadWorker::FillSparse(int table_id) { - auto slot_dim = _param_config->slot_dim; // TODO - auto fea_dim = _param_config->fea_dim; //TODO + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; auto& features = _features[table_id]; auto& fea_value = _feature_value[table_id]; @@ -544,53 +507,35 @@ void 
AsyncExecutorThreadWorker::FillSparse(int table_id) { LoD data_lod{tensor_lod}; tensor_emb->set_lod(data_lod); - //float* ptr = tensor_emb->data(); for (auto index = 0u; index < len; ++index){ - //if (_current_train_job.use_cvm_feature()) { - // if (ids[index] == 0u) { - // memcpy(ptr + slot_dim * index, init_value.data(), sizeof(float) * slot_dim); - // continue; - // } - // memcpy(ptr + slot_dim * index, fea_value[fea_idx].data(), sizeof(float) * slot_dim); - // (ptr + slot_dim * index)[0] = log((ptr + slot_dim * index)[0] + 1); - // (ptr + slot_dim * index)[1] = log((ptr + slot_dim * index)[1] + 1) - (ptr + slot_dim * index)[0]; - // fea_idx++; - //} else { - if (ids[index] == 0u) { - memcpy(ptr + slot_dim * index, init_value.data() + 2, sizeof(float) * slot_dim); - continue; - } - memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); - fea_idx++; - //} + if (ids[index] == 0u) { + memcpy(ptr + slot_dim * index, init_value.data() + 2, sizeof(float) * slot_dim); + continue; + } + memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); + fea_idx++; } } } void AsyncExecutorThreadWorker::PushSparse(int table_id) { - auto slot_dim = _param_config->slot_dim; //TODO - auto fea_dim = _param_config->fea_dim;//_current_train_job.fea_dim();TODO + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; auto& features = _features[table_id]; - CHECK(features.size() < 1000000) << "features size:" << features.size(); - //std::vector gradient_var; - //auto& gradient_var = GlobalConfig::instance().input_gradient_variable_name; //TODO + CHECK(features.size() < 1000000) << "features size is too big, may be wrong:" << features.size(); auto& push_g = _feature_push_value[table_id]; check_pull_push_memory(features, push_g, fea_dim); CHECK(push_g.size() == features.size() + 1) << "push_g size:" << push_g.size() << " features size:" << features.size(); uint64_t fea_idx = 0u; auto& fea_info = _fea_info[table_id]; - int offset = 0; - //if (!_current_train_job.use_cvm_feature()) { //TODO - offset = 2; - //} + int offset = 2; const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label TODO + // slot_idx = 0 is label for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) { LOG(ERROR) << "ERROR slot_idx:" << slot_idx << " name:" << feed_vec[slot_idx]; } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { - LOG(ERROR) << "ERROR continue"; continue; } Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); @@ -609,7 +554,6 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; exit(-1); } - //int len = tensor->lod()[0].back(); int len = tensor->numel(); CHECK(slot_dim * len == g_tensor->numel()) << "len:" << len << " g_numel:" << g_tensor->numel(); CHECK(len == tensor->numel()) << "len:" << len << "t_numel:" << tensor->numel(); diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 0c9a47690be..4e9c2622b0e 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -155,7 +155,6 @@ class ExecutorThreadWorker { void SetFetchVarNames(const std::vector& fetch_var_names); virtual void SetPSlibPtr(std::shared_ptr pslib_ptr); virtual 
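On the offset-2 copies in the FillSparse and PushSparse bodies above: the first two floats of every pulled value are skipped when filling embeddings, which matches a value layout whose leading slots carry show/click statistics; that layout is an assumption about the pslib accessor, not something the diff itself states. Feasign 0 is the padding sign and gets a zero row:

    #include <cstring>

    // Copy one embedding row out of a pulled value, skipping the two
    // leading statistics floats; padding rows are zero-filled instead.
    void FillRow(float* dst, const float* pulled_value, int slot_dim,
                 bool is_padding) {
      if (is_padding) {
        std::memset(dst, 0, sizeof(float) * slot_dim);
      } else {
        std::memcpy(dst, pulled_value + 2, sizeof(float) * slot_dim);
      }
    }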
void SetPullDenseThread(std::shared_ptr dpt) {}; - virtual void BindingSlotVariableMemory() {}; virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}; private: void CreateThreadScope(const framework::ProgramDesc& program); @@ -191,7 +190,6 @@ public: virtual ~AsyncExecutorThreadWorker() {} void SetPSlibPtr(std::shared_ptr pslib_ptr); void SetPullDenseThread(std::shared_ptr dpt); - void BindingSlotVariableMemory(); void SetParamConfig(AsyncWorkerParamConfig* param_config); void TrainFiles(); void TrainOneNetwork(); -- GitLab From c4cb4142916c92d82b3e0924206aac25db4b8758 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 13 Dec 2018 11:54:17 +0800 Subject: [PATCH 0179/2367] refine pslib.cmake url to public --- cmake/external/pslib.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 812af5efa20..4d4dc195aa2 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -29,8 +29,8 @@ INCLUDE(ExternalProject) SET(PSLIB_PROJECT "extern_pslib") IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) MESSAGE(STATUS "use pre defined download url") - SET(PSLIB_VER "pslib" CACHE STRING "" FORCE) #todo pslib version - SET(PSLIB_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${PSLIB_VER}.tar.gz" CACHE STRING "" FORCE) #todo pslib url + SET(PSLIB_VER "0.1.0" CACHE STRING "" FORCE) + SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/pslib.tar.gz" CACHE STRING "" FORCE) ENDIF() MESSAGE(STATUS "PSLIB_VER: ${PSLIB_VER}, PSLIB_URL: ${PSLIB_URL}") SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") -- GitLab From c59cdf3a243e104992ec2cde1e36cb38d452feb4 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 12:12:21 +0800 Subject: [PATCH 0180/2367] refine executor_thread_worker.h and executor_thread_worker.cc code style --- .../fluid/framework/executor_thread_worker.cc | 364 +++++++++--------- .../fluid/framework/executor_thread_worker.h | 92 +++-- 2 files changed, 243 insertions(+), 213 deletions(-) diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 592a416d6dc..412f4a2b6ed 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -58,7 +58,8 @@ bool DensePullThread::check_update_param(uint64_t table_id) { { std::lock_guard lock(_mutex_for_version); auto& version = _training_versions[table_id]; - _current_version[table_id] = *(std::min_element(version.begin(), version.end())); + _current_version[table_id] = + *(std::min_element(version.begin(), version.end())); } if (_current_version[table_id] - _last_versions[table_id] < _threshold) { return false; @@ -93,7 +94,8 @@ void DensePullThread::wait_all() { t.wait(); auto status = t.get(); if (status != 0) { - LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times; + LOG(WARNING) << "pull dense failed times:" << + ++_pull_dense_fail_times; } } @@ -105,7 +107,8 @@ void DensePullThread::wait_all() { _pull_dense_status.resize(0); } -void DensePullThread::increase_thread_version(int thread_id, uint64_t table_id) { +void DensePullThread::increase_thread_version( + int thread_id, uint64_t table_id) { std::lock_guard lock(_mutex_for_version); _training_versions[table_id][thread_id]++; } @@ -169,10 +172,6 @@ void ExecutorThreadWorker::SetFetchVarNames( fetch_var_names.end()); } -void ExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { - -} - void 
ExecutorThreadWorker::SetDevice() { #if defined _WIN32 || defined __APPLE__ @@ -332,10 +331,12 @@ void AsyncExecutorThreadWorker::TrainFiles() { } // end while () } -void AsyncExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { +void AsyncExecutorThreadWorker::SetPSlibPtr( + std::shared_ptr pslib_ptr) { _pslib_ptr = pslib_ptr; } -void AsyncExecutorThreadWorker::SetPullDenseThread(std::shared_ptr dpt) { +void AsyncExecutorThreadWorker::SetPullDenseThread( + std::shared_ptr dpt) { _pull_dense_thread = dpt; } void AsyncExecutorThreadWorker::TrainOneNetwork() { @@ -347,7 +348,8 @@ void AsyncExecutorThreadWorker::TrainOneNetwork() { } bool need_skip = false; for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { - if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) { + if (op->Type().find(_param_config->skip_op[t]) != + std::string::npos) { need_skip = true; break; } @@ -359,13 +361,13 @@ void AsyncExecutorThreadWorker::TrainOneNetwork() { UpdateParams(); } - -void AsyncExecutorThreadWorker::SetParamConfig(AsyncWorkerParamConfig* param_config) { +void AsyncExecutorThreadWorker::SetParamConfig( + AsyncWorkerParamConfig* param_config) { _param_config = param_config; } void AsyncExecutorThreadWorker::PrepareParams() { - for (auto table_id: _param_config->sparse_table_id) { + for (auto table_id : _param_config->sparse_table_id) { PullSparse(table_id); for (auto& t : _pull_sparse_status) { t.wait(); @@ -378,7 +380,7 @@ void AsyncExecutorThreadWorker::PrepareParams() { } _pull_sparse_status.resize(0); - for (auto table_id: _param_config->sparse_table_id) { + for (auto table_id : _param_config->sparse_table_id) { FillSparse(table_id); } } @@ -440,180 +442,198 @@ void AsyncExecutorThreadWorker::PushDense(int table_id) { void AsyncExecutorThreadWorker::PullSparse(int table_id) { - auto& features = _features[table_id]; - auto& feature_value = _feature_value[table_id]; - auto fea_dim = _param_config->fea_dim; - // slot id starts from 1 - features.clear(); - features.resize(0); - features.reserve(MAX_FEASIGN_NUM); - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label TODO - for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); - LoDTensor* tensor = var->GetMutable(); - int64_t* ids = tensor->data(); - int len = tensor->numel(); - for (auto i = 0u; i < len; ++i) { - //todo: current trick - filter feasign=use_slot_mod(bug: datafeed fill use_slot_mod for empty slot) - if (ids[i] == 0u) { - continue; - } - features.push_back(static_cast(ids[i])); - } - } - check_pull_push_memory(features, feature_value, fea_dim); - - std::vector pull_feature_value; - for (auto i = 0u; i < features.size(); ++i) { - pull_feature_value.push_back(feature_value[i].data()); - } - for (int i = 0; i < features.size(); ++i) { + auto& features = _features[table_id]; + auto& feature_value = _feature_value[table_id]; + auto fea_dim = _param_config->fea_dim; + // slot id starts from 1 + features.clear(); + features.resize(0); + features.reserve(MAX_FEASIGN_NUM); + const std::vector& feed_vec = + thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + // todo(colourful-tree): current trick - filter 
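The PullSparse body being reindented here (continuing below) first flattens every non-zero id from the slot tensors into one feasign list, so the table is hit with a single batched pull; id 0 is the padding sign the inline todo warns about. A reduced sketch of that gathering step:

    #include <cstdint>
    #include <vector>

    // Append the non-padding ids of one slot tensor to the flat pull list.
    void CollectFeasigns(const int64_t* ids, int len,
                         std::vector<uint64_t>* feasigns) {
      for (int i = 0; i < len; ++i) {
        if (ids[i] == 0) continue;  // padding sign, never pulled
        feasigns->push_back(static_cast<uint64_t>(ids[i]));
      }
    }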
feasign=use_slot_mod( + // bug: datafeed fill use_slot_mod for empty slot) + if (ids[i] == 0u) { + continue; + } + features.push_back(static_cast(ids[i])); } - auto status = _pslib_ptr->_worker_ptr->pull_sparse( - pull_feature_value.data(), table_id, features.data(), features.size()); - _pull_sparse_status.push_back(std::move(status)); - - auto& push_g = _feature_push_value[table_id]; - check_pull_push_memory(features, push_g, fea_dim); - - collect_feasign_info(table_id); + } + check_pull_push_memory(features, feature_value, fea_dim); + + std::vector pull_feature_value; + for (auto i = 0u; i < features.size(); ++i) { + pull_feature_value.push_back(feature_value[i].data()); + } + + auto status = _pslib_ptr->_worker_ptr->pull_sparse( + pull_feature_value.data(), table_id, features.data(), features.size()); + _pull_sparse_status.push_back(std::move(status)); + + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, push_g, fea_dim); + + collect_feasign_info(table_id); } void AsyncExecutorThreadWorker::FillSparse(int table_id) { - auto slot_dim = _param_config->slot_dim; - auto fea_dim = _param_config->fea_dim; - auto& features = _features[table_id]; - auto& fea_value = _feature_value[table_id]; - - CHECK(features.size() > 0) << "feature size check failed"; - - auto fea_idx = 0u; - - std::vector init_value(fea_dim); - - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label TODO - for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); - LoDTensor* tensor = var->GetMutable(); - int64_t* ids = tensor->data(); - int len = tensor->numel(); - Variable* var_emb = thread_scope_->FindVar(_param_config->slot_input_vec[table_id][slot_idx - 1]); - LoDTensor* tensor_emb = var_emb->GetMutable(); - float* ptr = tensor_emb->mutable_data({len, slot_dim}, platform::CPUPlace()); - memset(ptr, 0, sizeof(float) * len * slot_dim); - auto& tensor_lod = tensor->lod()[0]; - - LoD data_lod{tensor_lod}; - tensor_emb->set_lod(data_lod); - - for (auto index = 0u; index < len; ++index){ - if (ids[index] == 0u) { - memcpy(ptr + slot_dim * index, init_value.data() + 2, sizeof(float) * slot_dim); - continue; - } - memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); - fea_idx++; - } + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; + auto& features = _features[table_id]; + auto& fea_value = _feature_value[table_id]; + + CHECK(features.size() > 0) << "feature size check failed"; + + auto fea_idx = 0u; + + std::vector init_value(fea_dim); + + const std::vector& feed_vec = + thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + Variable* var_emb = thread_scope_->FindVar( + _param_config->slot_input_vec[table_id][slot_idx - 1]); + LoDTensor* tensor_emb = var_emb->GetMutable(); + float* ptr = tensor_emb->mutable_data( + {len, slot_dim}, platform::CPUPlace()); + memset(ptr, 0, sizeof(float) * len * slot_dim); + auto& tensor_lod = tensor->lod()[0]; + + LoD data_lod{tensor_lod}; + tensor_emb->set_lod(data_lod); + + for (auto index = 0u; index < len; ++index) { + if (ids[index] == 0u) { + memcpy(ptr + slot_dim * index, + init_value.data() + 2, sizeof(float) * slot_dim); + continue; + } 
+ memcpy(ptr + slot_dim * index, + fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); + fea_idx++; } + } } void AsyncExecutorThreadWorker::PushSparse(int table_id) { - auto slot_dim = _param_config->slot_dim; - auto fea_dim = _param_config->fea_dim; - auto& features = _features[table_id]; - CHECK(features.size() < 1000000) << "features size is too big, may be wrong:" << features.size(); - auto& push_g = _feature_push_value[table_id]; - check_pull_push_memory(features, push_g, fea_dim); - CHECK(push_g.size() == features.size() + 1) << "push_g size:" << push_g.size() << " features size:" << features.size(); - uint64_t fea_idx = 0u; - auto& fea_info = _fea_info[table_id]; - int offset = 2; - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label - for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) { - LOG(ERROR) << "ERROR slot_idx:" << slot_idx << " name:" << feed_vec[slot_idx]; - } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { - continue; - } - Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); - CHECK(g_var != nullptr) << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; - LoDTensor* g_tensor = g_var->GetMutable(); - if (g_tensor == NULL) { - LOG(ERROR) << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; - exit(-1); - } - float* g = g_tensor->data(); - - Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); - CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; - LoDTensor* tensor = var->GetMutable(); - if (tensor == NULL) { - LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; - exit(-1); - } - int len = tensor->numel(); - CHECK(slot_dim * len == g_tensor->numel()) << "len:" << len << " g_numel:" << g_tensor->numel(); - CHECK(len == tensor->numel()) << "len:" << len << "t_numel:" << tensor->numel(); - int64_t* ids = tensor->data(); - for (auto id_idx = 0u; id_idx < len; ++id_idx){ - if (ids[id_idx] == 0) { - g += slot_dim; - continue; - } - memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); - push_g[fea_idx][0] = 1.0f; - CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx << " size:" << fea_info.size(); - push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); - g += slot_dim; - fea_idx++; - } + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; + auto& features = _features[table_id]; + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, push_g, fea_dim); + CHECK(push_g.size() == features.size() + 1) << + "push_g size:" << push_g.size() << " features size:" << features.size(); + uint64_t fea_idx = 0u; + auto& fea_info = _fea_info[table_id]; + int offset = 2; + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + if (_param_config->slot_alias_to_table.find( + feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) { + LOG(ERROR) << "ERROR slot_idx:" << slot_idx << + " name:" << feed_vec[slot_idx]; + } else if ( + _param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + continue; } - CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx << " features size:" << features.size(); - CHECK(features.size() > 0); - - std::vector 
push_g_vec; - for (auto i = 0u; i < features.size(); ++i) { - push_g_vec.push_back(push_g[i].data()); + Variable* g_var = thread_scope_->FindVar( + _param_config->gradient_var[table_id][slot_idx - 1]); + CHECK(g_var != nullptr) << "var[" << + _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; + LoDTensor* g_tensor = g_var->GetMutable(); + if (g_tensor == NULL) { + LOG(ERROR) << "var[" << + _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; + exit(-1); + } + float* g = g_tensor->data(); + + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; + LoDTensor* tensor = var->GetMutable(); + if (tensor == NULL) { + LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; + exit(-1); + } + int len = tensor->numel(); + CHECK(slot_dim * len == g_tensor->numel()) << + "len:" << len << " g_numel:" << g_tensor->numel(); + CHECK(len == tensor->numel()) << "len:" << + len << "t_numel:" << tensor->numel(); + int64_t* ids = tensor->data(); + for (auto id_idx = 0u; id_idx < len; ++id_idx) { + if (ids[id_idx] == 0) { + g += slot_dim; + continue; + } + memcpy(push_g[fea_idx].data() + offset, + g, sizeof(float) * slot_dim); + push_g[fea_idx][0] = 1.0f; + CHECK(fea_idx < fea_info.size()) << "fea_idx:" << + fea_idx << " size:" << fea_info.size(); + push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); + g += slot_dim; + fea_idx++; } - auto status = _pslib_ptr->_worker_ptr->push_sparse( - table_id, features.data(), (const float**)push_g_vec.data(), features.size()); - _push_sparse_status.push_back(std::move(status)); + } + CHECK(fea_idx == features.size()) << "fea_idx:" << + fea_idx << " features size:" << features.size(); + CHECK_GT(features.size(), 0); + + std::vector push_g_vec; + for (auto i = 0u; i < features.size(); ++i) { + push_g_vec.push_back(push_g[i].data()); + } + auto status = _pslib_ptr->_worker_ptr->push_sparse( + table_id, features.data(), + (const float**)push_g_vec.data(), features.size()); + _push_sparse_status.push_back(std::move(status)); } void AsyncExecutorThreadWorker::collect_feasign_info( - int table_id) { - auto& fea_info = _fea_info[table_id]; - auto& feature = _features[table_id]; - fea_info.resize(feature.size()); - - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - Variable* var = thread_scope_->FindVar(feed_vec[0]); + int table_id) { + auto& fea_info = _fea_info[table_id]; + auto& feature = _features[table_id]; + fea_info.resize(feature.size()); + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + Variable* var = thread_scope_->FindVar(feed_vec[0]); + LoDTensor* tensor = var->GetMutable(); + int64_t* label = tensor->data(); + + int global_index = 0; + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); LoDTensor* tensor = var->GetMutable(); - int64_t* label = tensor->data(); - - int global_index = 0; - for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); - LoDTensor* tensor = var->GetMutable(); - int64_t* ids = tensor->data(); - - int fea_idx = 0; - for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) { - for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) { - if (ids[fea_idx] == 0u) { - continue; - } - FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]}; - - fea_info[global_index++] = std::move(info); - } + int64_t* ids = tensor->data(); + + int 
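For the push path finished just above: each feasign's push row is assembled under the same offset-2 convention as the pull path, with the leading pair holding an occurrence count and the instance label ahead of the gradient; the show/click naming is again an assumption about the accessor, not stated by the diff. A sketch:

    #include <cstring>

    // Row layout: [0] occurrence count (1.0 per feasign occurrence),
    // [1] instance label, [2 .. 2+slot_dim) embedding gradient.
    void PackPushRow(float* row, const float* grad, int slot_dim,
                     float label) {
      row[0] = 1.0f;
      row[1] = label;
      std::memcpy(row + 2, grad, sizeof(float) * slot_dim);
    }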
fea_idx = 0; + for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) { + for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) { + if (ids[fea_idx] == 0u) { + continue; } + FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]}; + + fea_info[global_index++] = std::move(info); + } } - CHECK(global_index == feature.size()) << "expect fea info size:" << feature.size() - << " real:" << global_index; + } + CHECK(global_index == feature.size()) << + "expect fea info size:" << feature.size() + << " real:" << global_index; } void AsyncExecutorThreadWorker::check_pull_push_memory( diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 4e9c2622b0e..b6c4f950ecc 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -35,21 +35,22 @@ const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; void CreateTensor(Variable* var, proto::VarType::Type var_type); struct AsyncWorkerParamConfig { - int slot_dim; - int fea_dim; - int32_t tmp_push_dense_wait_times; - int32_t tmp_push_sparse_wait_times; - - std::vector skip_op; - - std::map> dense_variable_name; - std::map> dense_gradient_variable_name; - std::vector dense_table_id; - std::vector dense_table_size; // fea_dim for each dense table - std::vector sparse_table_id; - std::map> slot_input_vec; //6048slot 6050slot //name - std::map> gradient_var; //6048slot_embed - std::map slot_alias_to_table; //TODO done + int slot_dim; + int fea_dim; + int32_t tmp_push_dense_wait_times; + int32_t tmp_push_sparse_wait_times; + + std::vector skip_op; + + std::map> dense_variable_name; + std::map> dense_gradient_variable_name; + std::vector dense_table_id; + // fea_dim for each dense table + std::vector dense_table_size; + std::vector sparse_table_id; + std::map> slot_input_vec; + std::map> gradient_var; + std::map slot_alias_to_table; }; struct DensePullThreadParam { @@ -62,8 +63,8 @@ struct DensePullThreadParam { }; class DensePullThread { -public: - DensePullThread(DensePullThreadParam& param) : + public: + explicit DensePullThread(const DensePullThreadParam& param) : _running(false) { _ps_client = param.ps_client; _threshold = param.threshold; @@ -96,11 +97,11 @@ public: void pull_dense2(uint64_t table_id); void wait_all(); -private: + private: void run(); bool check_update_param(uint64_t table_id); -private: + private: std::shared_ptr _ps_client; int _thread_num; int _threshold; @@ -153,9 +154,13 @@ class ExecutorThreadWorker { virtual void TrainFiles(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); - virtual void SetPSlibPtr(std::shared_ptr pslib_ptr); - virtual void SetPullDenseThread(std::shared_ptr dpt) {}; - virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}; + virtual void SetPSlibPtr( + std::shared_ptr pslib_ptr); + virtual void SetPullDenseThread( + std::shared_ptr dpt) {} + virtual void SetParamConfig( + AsyncWorkerParamConfig * param_config) {} + private: void CreateThreadScope(const framework::ProgramDesc& program); void CreateThreadOperators(const framework::ProgramDesc& program); @@ -178,32 +183,37 @@ class ExecutorThreadWorker { Scope* root_scope_; // a thread scope, father scope is global score which is shared Scope* thread_scope_; - //private: std::vector fetch_var_names_; std::vector> fetch_values_; bool debug_; }; class AsyncExecutorThreadWorker: public ExecutorThreadWorker { -public: - 
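collect_feasign_info, completed above, walks the LoD offsets to attach each instance's label to every non-padding feasign it produced. A reduced single-slot version, deliberately mirroring the original in storing the 1-based instance index:

    #include <cstdint>
    #include <vector>

    struct FeasignLabel {
      size_t slot;
      size_t ins;
      int64_t label;
    };

    // lod[i-1] .. lod[i] delimits the feasigns of instance i-1, whose
    // label is labels[i-1]; padding ids are skipped as in the pull path.
    void CollectInfo(size_t slot, const std::vector<size_t>& lod,
                     const int64_t* ids, const int64_t* labels,
                     std::vector<FeasignLabel>* out) {
      size_t fea = 0;
      for (size_t ins = 1; ins < lod.size(); ++ins) {
        for (; fea < lod[ins]; ++fea) {
          if (ids[fea] == 0) continue;
          out->push_back({slot, ins, labels[ins - 1]});
        }
      }
    }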
AsyncExecutorThreadWorker(){}; - virtual ~AsyncExecutorThreadWorker() {} - void SetPSlibPtr(std::shared_ptr pslib_ptr); - void SetPullDenseThread(std::shared_ptr dpt); - void SetParamConfig(AsyncWorkerParamConfig* param_config); - void TrainFiles(); - void TrainOneNetwork(); - void PrepareParams(); - void UpdateParams(); - void PullSparse(int table_id); - void FillSparse(int table_id); - void PushSparse(int table_id); - void PushDense(int table_id); - - void check_pull_push_memory(std::vector& features, std::vector& push_g, int dim); - void check_pull_push_memory(std::vector& features, std::vector>& push_g, int dim); + public: + AsyncExecutorThreadWorker() {} + virtual ~AsyncExecutorThreadWorker() {} + void SetPSlibPtr(std::shared_ptr pslib_ptr); + void SetPullDenseThread(std::shared_ptr dpt); + void SetParamConfig(AsyncWorkerParamConfig* param_config); + void TrainFiles(); + void TrainOneNetwork(); + void PrepareParams(); + void UpdateParams(); + void PullSparse(int table_id); + void FillSparse(int table_id); + void PushSparse(int table_id); + void PushDense(int table_id); + + void check_pull_push_memory( + const std::vector& features, + std::vector& push_g, + int dim); + void check_pull_push_memory(const std::vector& features, + std::vector>& push_g, + int dim); void collect_feasign_info(int table_id); -private: + + private: struct FeasignInfo { uint32_t slot; uint32_t ins; -- GitLab From 2912d5311bccc3b89dd32a0e80f48be41ba7d1bc Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 13 Dec 2018 13:21:39 +0800 Subject: [PATCH 0181/2367] fix code style bug & change pslib.cmake & change Cmakelist adapt pslib --- CMakeLists.txt | 19 +++++++++++++------ cmake/external/pslib.cmake | 11 ++++++----- paddle/fluid/framework/async_executor.cc | 7 +++---- paddle/fluid/framework/async_executor.h | 4 ++-- paddle/fluid/framework/data_feed.cc | 1 + .../fluid/framework/executor_thread_worker.cc | 4 ++-- .../fluid/framework/executor_thread_worker.h | 2 +- 7 files changed, 28 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b5bf6c5b6c..c3b4349c8c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with distributed support" OFF) +option(WITH_PSLIB "Compile with pslib support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) @@ -216,9 +217,12 @@ include(external/warpctc) # download, build, install warpctc include(cupti) include(external/gzstream) endif (NOT WIN32) -include(external/libmct) -include(external/pslib_brpc) -include(external/pslib) + +if(WITH_PSLIB) + include(external/libmct) + include(external/pslib_brpc) + include(external/pslib) +endif() if(WITH_DISTRIBUTE) if(WITH_GRPC) @@ -279,11 +283,14 @@ set(EXTERNAL_LIBS protobuf zlib ${PYTHON_LIBRARIES} - pslib - pslib_brpc - libmct ) +if(WITH_PSLIB) + list(APPEND EXTERNAL_LIBS pslib) + list(APPEND EXTERNAL_LIBS pslib_brpc) + list(APPEND EXTERNAL_LIBS libmct) +endif(WITH_PSLIB) + if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 4d4dc195aa2..3b495d78e2c 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -30,9 +30,10 @@ 
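The header reshuffle above keeps the parameter-server plumbing behind no-op virtual hooks on the base worker, so the synchronous worker can simply ignore those calls and only AsyncExecutorThreadWorker overrides them. The shape of that pattern, with PSClient standing in for the pslib types:

    #include <memory>

    class PSClient {};  // stand-in for paddle::distributed::PSlib

    class ThreadWorkerBase {
     public:
      virtual ~ThreadWorkerBase() = default;
      // Default: ignore PS plumbing entirely.
      virtual void SetPSClient(std::shared_ptr<PSClient>) {}
      virtual void TrainFiles() {}
    };

    class AsyncThreadWorker : public ThreadWorkerBase {
     public:
      void SetPSClient(std::shared_ptr<PSClient> c) override { client_ = c; }

     private:
      std::shared_ptr<PSClient> client_;
    };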
SET(PSLIB_PROJECT "extern_pslib") IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) MESSAGE(STATUS "use pre defined download url") SET(PSLIB_VER "0.1.0" CACHE STRING "" FORCE) - SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/pslib.tar.gz" CACHE STRING "" FORCE) + SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE) + SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/${PSLIB_NAME}.tar.gz" CACHE STRING "" FORCE) ENDIF() -MESSAGE(STATUS "PSLIB_VER: ${PSLIB_VER}, PSLIB_URL: ${PSLIB_URL}") +MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}") SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") SET(PSLIB_DOWNLOAD_DIR "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}") SET(PSLIB_DST_DIR "pslib") @@ -50,7 +51,7 @@ INCLUDE_DIRECTORIES(${PSLIB_INC_DIR}) FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt "PROJECT(PSLIB)\n" "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${PSLIB_VER}/include ${PSLIB_VER}/lib \n" + "install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n" " DESTINATION ${PSLIB_DST_DIR})\n") ExternalProject_Add( @@ -58,8 +59,8 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PSLIB_SOURCE_DIR} DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_VER}.tar.gz - && tar zxvf ${PSLIB_VER}.tar.gz + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz + && tar zxvf ${PSLIB_NAME}.tar.gz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index c62d62a5dc4..8231aff1429 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -50,7 +50,6 @@ void AsyncExecutor::CreateThreads( worker->BindingDataFeedMemory(); worker->SetPSlibPtr(_pslib_ptr); worker->SetPullDenseThread(_pull_dense_thread); - worker->BindingSlotVariableMemory(); worker->SetParamConfig(&_param_config); } @@ -79,7 +78,7 @@ void AsyncExecutor::InitWorker(const std::string& dist_desc, _pslib_ptr = std::shared_ptr( new paddle::distributed::PSlib()); _pslib_ptr->init_worker( - dist_desc, host_sign_list.data(), node_num, index); + dist_desc, (uint64_t*)(host_sign_list.data()), node_num, index); InitParamConfig(); } @@ -93,8 +92,8 @@ void AsyncExecutor::StopServer() { } void AsyncExecutor::GatherServers( - std::vector& host_sign_list, int node_num) { - _pslib_ptr->gather_servers(host_sign_list.data(), node_num); + const std::vector& host_sign_list, int node_num) { + _pslib_ptr->gather_servers((uint64_t*)(host_sign_list.data()), node_num); } void AsyncExecutor::InitParamConfig() { diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 184566dd39e..16540c2df2b 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -43,9 +43,9 @@ inline std::default_random_engine& local_random_engine() { struct engine_wrapper_t { std::default_random_engine engine; engine_wrapper_t() { - static std::atomic x(0); + static std::atomic x(0); std::seed_seq sseq = {x++, x++, x++, - static_cast(current_realtime() * 1000)}; + static_cast(current_realtime() * 1000)}; engine.seed(sseq); } }; diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 851c7eda89e..54a00f8ccf2 100644 --- a/paddle/fluid/framework/data_feed.cc +++ 
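A small observation on the (uint64_t*)(host_sign_list.data()) casts introduced below in async_executor.cc: data() on a const vector yields a const pointer, so the C-style cast is silently a const_cast to satisfy pslib's mutable C interface. Spelled explicitly, and assuming, as the call sites do, that pslib never writes through the pointer:

    #include <cstdint>
    #include <vector>

    // Make the const-stripping visible instead of hiding it in a C cast.
    uint64_t* HostListPtr(const std::vector<uint64_t>& host_sign_list) {
      return const_cast<uint64_t*>(host_sign_list.data());
    }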
b/paddle/fluid/framework/data_feed.cc @@ -68,6 +68,7 @@ bool DataFeed::PickOneFile(std::string* filename) { return false; } *filename = filelist_[file_idx_++]; + LOG(ERROR) << "pick file:" << *filename; return true; } diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 412f4a2b6ed..df15a4d293a 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -637,7 +637,7 @@ void AsyncExecutorThreadWorker::collect_feasign_info( } void AsyncExecutorThreadWorker::check_pull_push_memory( - std::vector& features, + const std::vector& features, std::vector>& push_g, int dim) { push_g.resize(features.size() + 1); @@ -647,7 +647,7 @@ void AsyncExecutorThreadWorker::check_pull_push_memory( } void AsyncExecutorThreadWorker::check_pull_push_memory( - std::vector& features, + const std::vector& features, std::vector& push_g, int dim) { if (features.size() > push_g.size()) { diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index b6c4f950ecc..93373b1d2eb 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -155,7 +155,7 @@ class ExecutorThreadWorker { // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); virtual void SetPSlibPtr( - std::shared_ptr pslib_ptr); + std::shared_ptr pslib_ptr) {}; virtual void SetPullDenseThread( std::shared_ptr dpt) {} virtual void SetParamConfig( -- GitLab From 23eb8c4299ce9908d07505df413c4a2b79f14d32 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 14:02:15 +0800 Subject: [PATCH 0182/2367] fix ci test=develop --- .../framework/details/multi_devices_graph_pass.cc | 10 +++++++--- paddle/fluid/operators/reader/ctr_reader.h | 2 +- paddle/fluid/pybind/pybind.cc | 13 ++++++++++++- .../unittests/test_parallel_executor_dry_run.py | 10 ++++++---- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index e264906b57f..6c4e0e9168a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -386,12 +386,16 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - // if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { - // insert synchronous ops at the backpropagation; and - // insert synchronous ops if the graph contains mutilple places. +// insert synchronous ops at the backpropagation; and +// insert synchronous ops if the graph contains mutilple places. + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (!is_forwarding && (places_.size() > 1 || num_trainers > 1 || (nccl_ctxs_ && nccl_ctxs_->contexts_.size() > 1))) { +#else + if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { +#endif // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. 
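The guard added above exists because the nccl_ctxs_ member only compiles in CUDA, non-Windows builds, so the synchronization predicate gains its third clause under the same preprocessor condition. A reduced form:

    #include <cstddef>

    // Sync ops are needed with multiple places, multiple trainers, or
    // (CUDA builds only) multiple NCCL contexts.
    bool NeedSyncOps(size_t num_places, size_t num_trainers,
                     size_t num_nccl_contexts) {
    #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      return num_places > 1 || num_trainers > 1 || num_nccl_contexts > 1;
    #else
      (void)num_nccl_contexts;  // unused without NCCL
      return num_places > 1 || num_trainers > 1;
    #endif
    }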
if (static_cast(boost::get(node->Op()->GetAttr( diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 517d6697443..635483158fc 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -95,7 +95,7 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread( std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, thread_id, &read_thread_status_, queue_))); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 9cebdda6938..3beb93e7b3e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -789,7 +789,18 @@ All parameter, weight, gradient are variables in Paddle. [](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) { self.type_ = type; }, - R"DOC()DOC"); + R"DOC(The type is ExecutorType, an enum ranging over Default, +ParallelGraph and Experimental: + +Default: Compile the main_program into a multi-devices graph, + and execute this graph on multi-devices with multiple threads, whose + number is specified by build_strategy.num_threads. +ParallelGraph: Compile the main_program into multiple graphs, and execute each of the graphs on one + device with one thread. Please note, this mode only supports all-reduce mode and use_cuda=True. + This approach can achieve better performance in some scenarios. +Experimental: Compile the main_program into a multi-devices graph, + and execute this graph with a faster execution mode than the Default; + this approach is still experimental.)DOC"); py::class_ build_strategy(pe, "BuildStrategy", R"DOC( BuildStrategy allows the user to more precisely control how to diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index 18d95c94ad3..eff76ce0d49 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -17,6 +17,8 @@ import unittest import logging import six +ExecutorType = fluid.ExecutionStrategy().ExecutorType + class TestBase(unittest.TestCase): def main(self, @@ -24,7 +26,7 @@ class TestBase(unittest.TestCase): iter=10, iter_per_pe=10, use_gpu=True, - use_experimental_executor=False): + exec_type=ExecutorType.Default): if use_gpu and not fluid.core.is_compiled_with_cuda(): logging.warning( "Paddle is not compiled with CUDA, skip GPU unittests") @@ -43,7 +45,7 @@ class TestBase(unittest.TestCase): for _ in six.moves.xrange(iter): exe_strategy = fluid.ExecutionStrategy() exe_strategy._dry_run = True - exe_strategy.use_experimental_executor = use_experimental_executor + exe_strategy.executor_type = exec_type pe = fluid.ParallelExecutor( use_cuda=use_gpu, loss_name=loss.name, @@ -56,11 +58,11 @@ class TestBase(unittest.TestCase): class TestMNISTDryRun(TestBase): def test_mnist_dry_run(self): for use_gpu in (False, True): - for use_experimental_executor in (False, True): + for exec_type in (ExecutorType.Default, ExecutorType.Experimental): self.main( network_func=TestMNISTDryRun.network_func, use_gpu=use_gpu, - use_experimental_executor=use_experimental_executor) + exec_type=exec_type) @staticmethod def network_func(): -- GitLab
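The pybind hunk above only sets the docstring; what makes the enum reachable from Python as fluid.ExecutionStrategy().ExecutorType, the lookup the dry-run test relies on, is a pybind11 enum nested inside the class binding. Below is a minimal, self-contained sketch of that pattern; it assumes only pybind11's public API, and the trimmed-down ExecutionStrategy struct and module name are illustrative stand-ins, not Paddle's actual registration code:

#include <pybind11/pybind11.h>

namespace py = pybind11;

enum class ExecutorType { Default, ParallelGraph, Experimental };

struct ExecutionStrategy {
  ExecutorType type_{ExecutorType::Default};
};

PYBIND11_MODULE(core, m) {
  // Bind the class first so it can serve as the enum's scope.
  py::class_<ExecutionStrategy> exec_strategy(m, "ExecutionStrategy");

  // Nesting the enum under the class is what makes it reachable as
  // ExecutionStrategy.ExecutorType on the Python side.
  py::enum_<ExecutorType>(exec_strategy, "ExecutorType")
      .value("Default", ExecutorType::Default)
      .value("ParallelGraph", ExecutorType::ParallelGraph)
      .value("Experimental", ExecutorType::Experimental);

  exec_strategy.def(py::init<>())
      .def_property(
          "executor_type",
          [](const ExecutionStrategy &self) { return self.type_; },
          [](ExecutionStrategy &self, ExecutorType type) {
            self.type_ = type;
          });
}

Because the enum is registered with the class as its scope, it shows up as a class attribute, so both ExecutionStrategy.ExecutorType and instance lookups such as fluid.ExecutionStrategy().ExecutorType resolve to the same object.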
From fa1f77e20ca2134f52ab01049a7070a2f0a9a3c8 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 12 Dec 2018 18:14:03 +0800 Subject: [PATCH 0183/2367] enable ci test=develop --- .../fluid/tests/unittests/test_dist_base.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 26fa20291b5..84566512667 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -224,6 +224,7 @@ class TestDistBase(unittest.TestCase): def setUp(self): self._trainers = 2 self._pservers = 2 + self._port_set = set() self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( self._find_free_port(), self._find_free_port()) self._python_interp = sys.executable @@ -238,9 +239,17 @@ class TestDistBase(unittest.TestCase): self._after_setup_config() def _find_free_port(self): - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: - s.bind(('', 0)) - return s.getsockname()[1] + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port def start_pserver(self, model_file, check_error_log, required_envs): ps0_ep, ps1_ep = self._ps_endpoints.split(",") -- GitLab From deb0d41cea15db2b24aff269e2f84bd68eeaa919 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 13:33:30 +0000 Subject: [PATCH 0184/2367] fix cmake fix cmake again test=develop --- paddle/fluid/operators/CMakeLists.txt | 9 +++++--- paddle/fluid/operators/py_func_op.cc | 4 ++-- paddle/fluid/operators/py_func_op.h | 2 +- paddle/fluid/pybind/CMakeLists.txt | 3 +++ python/paddle/fluid/layers/nn.py | 31 ++++++++++----------------- 5 files changed, 23 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9379122faf3..23508ebe7c6 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -42,8 +42,7 @@ if (WITH_DISTRIBUTE) SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) endif() -register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) - +register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) # warpctc_op needs cudnn 7 above if (WITH_GPU AND NOT WIN32) @@ -82,7 +81,7 @@ endif() # op_library(unstack_op DEPS stack_op) # op_library(tensor_array_to_tensor_op DEPS concat_op) -set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS} python pybind) +set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(gather_test SRCS gather_test.cc DEPS tensor) @@ -94,4 +93,8 @@ cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) +if (WITH_PYTHON) + cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) +endif() + set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 32c44c3bc22..90a64333668 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ 
b/paddle/fluid/operators/py_func_op.cc @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -namespace py = pybind11; +namespace py = ::pybind11; static std::vector g_py_callables; @@ -30,7 +30,7 @@ const char kForwardPythonCallableId[] = "forward_callable_id"; const char kBackwardPythonCallableId[] = "backward_callable_id"; const char kPyFuncBackwardSkipVars[] = "backward_skip_vars"; -size_t AppendPythonCallableObjectAndReturnId(py::object py_obj) { +size_t AppendPythonCallableObjectAndReturnId(const py::object &py_obj) { g_py_callables.emplace_back(py_obj); return g_py_callables.size() - 1; } diff --git a/paddle/fluid/operators/py_func_op.h b/paddle/fluid/operators/py_func_op.h index e85fa6b5bc3..4ba06bf5985 100644 --- a/paddle/fluid/operators/py_func_op.h +++ b/paddle/fluid/operators/py_func_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -size_t AppendPythonCallableObjectAndReturnId(pybind11::object py_obj); +size_t AppendPythonCallableObjectAndReturnId(const ::pybind11::object &py_obj); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b8954cb1262..b75790e4fe5 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,8 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +if(WITH_PYTHON) + list(APPEND PYBIND_DEPS py_func_op) +endif() set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) if(WITH_PYTHON) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d71368644d8..debe0ff0c94 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9173,31 +9173,22 @@ class PyFuncWrapper(object): kwargs[arg] = args[idx] idx += 1 - ret0 = self._func(*args[idx:], **kwargs) - if ret0 is None: - return None - - if not isinstance(ret0, (list, tuple)): - ret0 = (ret0, ) + func_ret = self._func(*args[idx:], **kwargs) + if not isinstance(func_ret, (list, tuple)): + func_ret = (func_ret, ) ret = [] - for i in six.moves.range(len(ret0)): - if ret0[i] is None: - ret.append(None) - continue - - if isinstance(ret0[i], core.LoDTensor): - ret.append(ret0[i]) + for each_ret in func_ret: + if each_ret is None or isinstance(each_ret, core.LoDTensor): + ret.append(each_ret) continue - if isinstance(ret0[i], np.ndarray): - r = ret0[i] - else: - r = np.array(ret0[i]) + if not isinstance(each_ret, np.ndarray): + each_ret = np.array(each_ret) - t = core.LoDTensor() - t.set(r, core.CPUPlace()) - ret.append(t) + tensor = core.LoDTensor() + tensor.set(each_ret, core.CPUPlace()) + ret.append(tensor) return tuple(ret) -- GitLab From affdd70976f1ad2a2de959ceb082b95791961d91 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 14:26:52 +0800 Subject: [PATCH 0185/2367] fix api.spec test=develop --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 2722ea078eb..db7b0d34a30 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -27,6 +27,7 @@ paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) 
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None +paddle.fluid.ExecutionStrategy.ExecutorType.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy.ExecutorType, arg0: int) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None -- GitLab From 2328bee1cc835d789b83cd4da9bef6b588bc87c5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 13 Dec 2018 06:34:09 +0000 Subject: [PATCH 0186/2367] fix Windows compile bug test=develop --- .../framework/details/eager_deletion_op_handle.cc | 6 +++--- paddle/fluid/framework/executor.cc | 10 ++++++---- paddle/fluid/framework/tensor.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 3b27415e431..abacb11e3b0 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -77,14 +77,14 @@ void EagerDeletionOpHandle::RunImpl() { VLOG(2) << "Erase variable " << name; if (var->IsType()) { - garbages.emplace_back(var->GetMutable()->MoveMemory()); + garbages.emplace_back(var->GetMutable()->MoveMemoryHolder()); } else if (var->IsType()) { garbages.emplace_back( - var->GetMutable()->mutable_value()->MoveMemory()); + var->GetMutable()->mutable_value()->MoveMemoryHolder()); } else if (var->IsType()) { auto *tensor_arr = var->GetMutable(); for (auto &t : *tensor_arr) { - garbages.emplace_back(t.MoveMemory()); + garbages.emplace_back(t.MoveMemoryHolder()); } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 16c4552a5f0..0c4bd336c5b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -106,14 +106,16 @@ static void DeleteUnusedTensors( VLOG(2) << "Erase variable " << name; if (var->IsType()) { - garbages.emplace_back(var->GetMutable()->MoveMemory()); - } else if (var->IsType()) { garbages.emplace_back( - var->GetMutable()->mutable_value()->MoveMemory()); + var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages.emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); } else if (var->IsType()) { auto* lod_tensor_arr = var->GetMutable(); for (auto& t : *lod_tensor_arr) { - garbages.emplace_back(t.MoveMemory()); + garbages.emplace_back(t.MoveMemoryHolder()); } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 9f7027f5ae8..153222506af 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -158,7 +158,7 @@ class Tensor { const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } - std::shared_ptr MoveMemory() { + std::shared_ptr MoveMemoryHolder() { return 
std::move(holder_); } -- GitLab From 15550a27536012a92e2e7badaee7b41afff31f3e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Dec 2018 14:46:22 +0800 Subject: [PATCH 0187/2367] Polish code --- cmake/external/python.cmake | 6 +- .../fluid/operators/math/matrix_bit_code.cc | 486 ++++++++++++------ paddle/fluid/operators/math/matrix_bit_code.h | 63 +-- 3 files changed, 340 insertions(+), 215 deletions(-) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index a3599dd798c..52ad02a3551 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -18,8 +18,8 @@ ENDIF() INCLUDE(python_module) -FIND_PACKAGE(PythonInterp ${PY_VERSION}) -FIND_PACKAGE(PythonLibs ${PY_VERSION}) +FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED) +FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED) if(WIN32) execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" @@ -79,6 +79,6 @@ IF(PYTHONINTERP_FOUND) "please use pip to upgrade protobuf. pip install -U protobuf") ENDIF() ENDIF(PYTHONINTERP_FOUND) - +message(STATUS ${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 92affa0e4ed..d55e832cc2d 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -15,225 +15,379 @@ limitations under the License. */ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include #include + namespace paddle { namespace operators { namespace math { template -void MatrixBitCodeFunctor::Add(const framework::Tensor& vec, - framework::Tensor* tmat) { - size_t batch_size = tmat->dims()[0]; - size_t width = tmat->dims()[1]; - auto* tmat_data = tmat->data(); - auto* vec_data = vec.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - tmat_data[i * width + j] += vec_data[index]; +struct MatrixBitCodeFunctorAdd : public boost::static_visitor { + const framework::Tensor &vec_; + framework::Tensor *tmat_; + + MatrixBitCodeFunctorAdd(const framework::Tensor &vec, framework::Tensor *tmat) + : vec_(vec), tmat_(tmat) {} + + template + void operator()(const CodeTable &code_table) { + size_t batch_size = tmat_->dims()[0]; + size_t width = tmat_->dims()[1]; + auto *tmat_data = tmat_->data(); + auto *vec_data = vec_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + tmat_data[i * width + j] += vec_data[index]; + } } } +}; + +template +void MatrixBitCodeFunctor::Add(const framework::Tensor &vec, + framework::Tensor *tmat) { + MatrixBitCodeFunctorAdd func(vec, tmat); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::Tensor* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - auto* vec_data = vec->data(); - auto* tmat_data = tmat.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - vec_data[index] += tmat_data[i * width + j]; +struct MatrixBitCodeFunctorAddGrad : public boost::static_visitor { + const framework::Tensor 
&tmat_; + framework::Tensor *vec_; + MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat, + framework::Tensor *vec) + : tmat_(tmat), vec_(vec) {} + + template + void operator()(const CodeTable &table) { + size_t batch_size = tmat_.dims()[0]; + size_t width = tmat_.dims()[1]; + auto *vec_data = vec_->data(); + auto *tmat_data = tmat_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + vec_data[index] += tmat_data[i * width + j]; + } } } +}; + +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, + framework::Tensor *vec) { + MatrixBitCodeFunctorAddGrad func(tmat, vec); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::SelectedRows* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - auto* vec_data = vec->mutable_value()->data(); - auto* tmat_data = tmat.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - int64_t row_index = vec->GetIndexFromId(static_cast(index)); - vec_data[row_index] += tmat_data[i * width + j]; +struct MatrixBitCodeFunctorSelectedRowsAddGrad + : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::SelectedRows *vec_; + + MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat, + framework::SelectedRows *vec) + : tmat_(tmat), vec_(vec) {} + + template + void operator()(const CodeTable &code_table) { + size_t batch_size = tmat_.dims()[0]; + size_t width = tmat_.dims()[1]; + auto *vec_data = vec_->mutable_value()->template data(); + auto *tmat_data = tmat_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + int64_t row_index = vec_->GetIndexFromId(static_cast(index)); + vec_data[row_index] += tmat_data[i * width + j]; + } } } +}; + +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, + framework::SelectedRows *vec) { + MatrixBitCodeFunctorSelectedRowsAddGrad func(tmat, vec); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, - framework::Tensor* sum, T scale_sum) { - size_t num_samples = tmat.dims()[0]; - size_t o_width = tmat.dims()[1]; - auto* tmat_data = tmat.data(); - auto* sum_data = sum->data(); - for (size_t i = 0; i < num_samples; ++i) { - T sm = static_cast(0.0); - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - if (code->calc_bit(j)) { - // calc_bit starts from right most bit, while data in tmat[i] is in the - // reverse order. 
- sm += tmat_data[i * o_width + j]; +struct MatrixBitCodeFunctorSum : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *sum_; + T scale_sum_; + + MatrixBitCodeFunctorSum(const framework::Tensor &tmat, framework::Tensor *sum, + T scale_sum) + : tmat_(tmat), sum_(sum), scale_sum_(scale_sum) {} + + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_.dims()[0]; + size_t o_width = tmat_.dims()[1]; + auto *tmat_data = tmat_.data(); + auto *sum_data = sum_->data(); + for (size_t i = 0; i < num_samples; ++i) { + T sm = static_cast(0.0); + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + // calc_bit starts from right most bit, while data in tmat[i] is in + // the + // reverse order. + sm += tmat_data[i * o_width + j]; + } } + sum_data[i] = scale_sum_ * sm; } - sum_data[i] = scale_sum * sm; } +}; + +template +void MatrixBitCodeFunctor::Sum(const framework::Tensor &tmat, + framework::Tensor *sum, T scale_sum) { + MatrixBitCodeFunctorSum func(tmat, sum, scale_sum); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, - const framework::Tensor& weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat->dims()[0]; - size_t tmat_width = tmat->dims()[1]; - size_t input_width = input.dims()[1]; - size_t weight_width = weight.dims()[1]; - auto tmat_value = tmat->data(); - auto weight_value = weight.data(); - auto input_value = input.data(); - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_row = input_value + input_width * i; - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - const T* weight_row = weight_value + weight_width * index; - T sum = static_cast(0.0); - sum = blas.DOT(input_width, weight_row, input_row); - tmat_value[i * tmat_width + j] += sum; +struct MatrixBitCodeFunctorMul : public boost::static_visitor { + framework::Tensor *tmat_; + const framework::Tensor &weight_; + const framework::Tensor &input_; + + MatrixBitCodeFunctorMul(framework::Tensor *tmat, + const framework::Tensor &weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_->dims()[0]; + size_t tmat_width = tmat_->dims()[1]; + size_t input_width = input_.dims()[1]; + size_t weight_width = weight_.dims()[1]; + auto tmat_value = tmat_->data(); + auto weight_value = weight_.data(); + auto input_value = input_.data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_row = input_value + input_width * i; + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + const T *weight_row = weight_value + weight_width * index; + T sum = blas.DOT(input_width, weight_row, input_row); + tmat_value[i * tmat_width + j] += sum; + } } } +}; + +template +void MatrixBitCodeFunctor::Mul(framework::Tensor *tmat, + const framework::Tensor &weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMul func(tmat, weight, input); + code_table_.apply_visitor(func); } +template +class ReservedVector : public std::vector { + public: + ReservedVector() { 
this->reserve(N); } +}; + template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, - framework::Tensor* weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat.dims()[0]; - size_t input_width = input.dims()[1]; - size_t tmat_width = tmat.dims()[1]; - size_t weight_width = weight->dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight->data(); - auto input_value = input.data(); - - std::map>> ops; - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_value_row = input_value + input_width * i; - const T* tmat_row = tmat_value + i * tmat_width; - for (int j = 0; j < code_length; ++j) { - ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); +struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *weight_; + const framework::Tensor &input_; + MatrixBitCodeFunctorMulGradWeight(const framework::Tensor &tmat, + framework::Tensor *weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_.dims()[0]; + size_t input_width = input_.dims()[1]; + size_t tmat_width = tmat_.dims()[1]; + size_t weight_width = weight_->dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = weight_->data(); + auto input_value = input_.data(); + + std::map, 8u>> ops; + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_value_row = input_value + input_width * i; + const T *tmat_row = tmat_value + i * tmat_width; + for (int j = 0; j < code_length; ++j) { + ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } } - } - for (auto& op : ops) { - auto& op_in_row = op.second; - for (auto& pair : op_in_row) { - auto& scale = pair.first; - auto* input_row = pair.second; - T* weight_row = weight_value + op.first * weight_width; - blas.AXPY(input_width, scale, input_row, weight_row); + for (auto &op : ops) { + auto &op_in_row = op.second; + for (auto &pair : op_in_row) { + auto &scale = pair.first; + auto *input_row = pair.second; + T *weight_row = weight_value + op.first * weight_width; + blas.AXPY(input_width, scale, input_row, weight_row); + } } } +}; + +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, + framework::Tensor *weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMulGradWeight func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, - framework::SelectedRows* weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat.dims()[0]; - size_t input_width = input.dims()[1]; - size_t tmat_width = tmat.dims()[1]; - size_t weight_width = weight->value().dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight->mutable_value()->data(); - auto input_value = input.data(); - - std::unordered_map>> ops; - ops.reserve(weight->rows().size()); - - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_value_row = input_value + input_width * i; - const T* tmat_row = 
tmat_value + i * tmat_width; - for (int j = 0; j < code_length; ++j) { - ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); +struct MatrixBitCodeFunctorMulGradWeightSR + : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::SelectedRows *weight_; + const framework::Tensor &input_; + + MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat, + framework::SelectedRows *weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_.dims()[0]; + size_t input_width = input_.dims()[1]; + size_t tmat_width = tmat_.dims()[1]; + size_t weight_width = weight_->value().dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = weight_->mutable_value()->data(); + auto input_value = input_.data(); + + std::unordered_map>> ops; + ops.reserve(weight_->rows().size()); + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_value_row = input_value + input_width * i; + const T *tmat_row = tmat_value + i * tmat_width; + for (int j = 0; j < code_length; ++j) { + ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } } - } - for (auto& row : weight->rows()) { - auto& op_in_row = ops[row]; - for (auto& pair : op_in_row) { - auto& scale = pair.first; - auto* input_row = pair.second; - blas.AXPY(input_width, scale, input_row, weight_value); + for (auto &row : weight_->rows()) { + auto &op_in_row = ops[row]; + for (auto &pair : op_in_row) { + auto &scale = pair.first; + auto *input_row = pair.second; + blas.AXPY(input_width, scale, input_row, weight_value); + } + weight_value += weight_width; } - weight_value += weight_width; } +}; + +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, + framework::SelectedRows *weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMulGradWeightSR func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, - const framework::Tensor& weight, - framework::Tensor* input) { - size_t num_samples = tmat.dims()[0]; - size_t tmat_width = tmat.dims()[1]; - size_t input_width = input->dims()[1]; - size_t weight_width = weight.dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight.data(); - auto input_value = input->data(); - - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - - for (size_t k = 0; k < input_width; ++k) { - input_value[input_width * i + k] += - tmat_value[i * tmat_width + j] * - weight_value[weight_width * index + k]; +struct MatrixBitCodeFunctorMulGradError : public boost::static_visitor { + const framework::Tensor &tmat_; + const framework::Tensor &weight_; + framework::Tensor *input_; + + MatrixBitCodeFunctorMulGradError(const framework::Tensor &tmat, + const framework::Tensor &weight, + framework::Tensor *input) + : tmat_(tmat), weight_(weight), input_(input) {} + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_.dims()[0]; + size_t tmat_width = tmat_.dims()[1]; + size_t input_width = input_->dims()[1]; + size_t weight_width = weight_.dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = 
weight_.data(); + auto input_value = input_->data(); + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + + for (size_t k = 0; k < input_width; ++k) { + input_value[input_width * i + k] += + tmat_value[i * tmat_width + j] * + weight_value[weight_width * index + k]; + } } } } +}; + +template +void MatrixBitCodeFunctor::MulGradError(const framework::Tensor &tmat, + const framework::Tensor &weight, + framework::Tensor *input) { + MatrixBitCodeFunctorMulGradError func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { - size_t num_samples = tmat->dims()[0]; - size_t o_width = tmat->dims()[1]; - auto* tmat_data = tmat->data(); - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - if (code->calc_bit(j)) { - tmat_data[i * o_width + j] -= 1; +struct MatrixBitCodeFunctorSub : public boost::static_visitor { + framework::Tensor *tmat_; + + explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {} + + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_->dims()[0]; + size_t o_width = tmat_->dims()[1]; + auto *tmat_data = tmat_->data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + tmat_data[i * o_width + j] -= 1; + } } } } +}; + +template +void MatrixBitCodeFunctor::Sub(framework::Tensor *tmat) { + MatrixBitCodeFunctorSub func(tmat); + code_table_.apply_visitor(func); } template class MatrixBitCodeFunctor; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index cf43ad9d449..01e4889d34a 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/variant.h" #if defined(_WIN32) #include @@ -99,24 +100,7 @@ inline int clz(const T& value) { inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } #endif // !_WIN32 -// set a code interface to create multiple code -class Code { - public: - virtual ~Code() {} - virtual size_t calc_index(int bit) const = 0; - virtual bool calc_bit(int bit) const = 0; - virtual int get_length() const = 0; -}; -// set a CodeTable interface to create multiple code table -class CodeTable { - public: - virtual Code* get_code(int64_t code) const = 0; - virtual size_t size() const = 0; - virtual int get_max_code_length() const = 0; - virtual ~CodeTable() {} -}; - -class SimpleCode : public Code { +class SimpleCode { public: SimpleCode(size_t code, size_t num_classes, const int64_t* ids) : c_(static_cast(ids[code]) + num_classes) {} @@ -138,7 +122,7 @@ class SimpleCode : public Code { }; template -class CustomCode : public Code { +class CustomCode { public: CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids, int index) { @@ -155,11 +139,11 @@ class CustomCode : public Code { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const override { return ptable_data_[bit]; } - bool calc_bit(int bit) const override { return pcode_data_[bit]; } + size_t calc_index(int bit) const { return ptable_data_[bit]; } + bool calc_bit(int bit) const { return pcode_data_[bit]; } // NOTE: this function is not thread-safe. - int get_length() const override { + int get_length() const { if (length_ < 0) { auto len = seq_len_; length_ = @@ -177,46 +161,32 @@ class CustomCode : public Code { mutable int length_{-1}; }; -class SimpleCodeTable : public CodeTable { +class SimpleCodeTable { public: SimpleCodeTable(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids) {} - Code* get_code(int64_t code) const { - auto it = codes_.find(code); - if (it != codes_.end()) { - return it->second.get(); - } - auto* result = new SimpleCode(code, num_classes_, ids_); - codes_.emplace(code, std::unique_ptr(result)); - return result; + SimpleCode get_code(int64_t code) const { + return SimpleCode(code, num_classes_, ids_); } size_t size() const { return num_classes_; } int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } private: - mutable std::map> codes_; - size_t num_classes_; const int64_t* ids_; }; template -class CustomCodeTable : public CodeTable { +class CustomCodeTable { public: CustomCodeTable(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids) : ptable_(ptable), pcode_(pcode), ids_(ids) {} - Code* get_code(int64_t code) const { - auto it = codes_.find(code); - if (it != codes_.end()) { - return it->second.get(); - } - auto* result = new CustomCode(ptable_, pcode_, ids_, code); - codes_.emplace(code, std::unique_ptr(result)); - return result; + CustomCode get_code(int64_t code) const { + return CustomCode(ptable_, pcode_, ids_, code); } size_t size() const { return static_cast(ptable_.dims()[1]); } @@ -225,25 +195,26 @@ class CustomCodeTable : public CodeTable { } private: - mutable std::unordered_map> codes_; const framework::Tensor& ptable_; const framework::Tensor& pcode_; const int64_t* ids_; }; +using CodeTable = boost::variant>; + template class 
MatrixBitCodeFunctor { public: MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids), - code_table_(new SimpleCodeTable(num_classes, ids)) {} + code_table_(SimpleCodeTable(num_classes, ids)) {} MatrixBitCodeFunctor(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids) : num_classes_(static_cast(ptable.dims()[1])), ids_(ids), - code_table_(new CustomCodeTable(ptable, pcode, ids)) {} + code_table_(CustomCodeTable(ptable, pcode, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -293,7 +264,7 @@ class MatrixBitCodeFunctor { size_t num_classes_; const int64_t* ids_; - std::unique_ptr code_table_; + CodeTable code_table_; }; } // namespace math } // namespace operators -- GitLab From bf951fa737d62b27f1e50b2b3019815b1a6efda9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Dec 2018 06:48:34 +0000 Subject: [PATCH 0188/2367] add refer vrelu, videntity, vexp, vsigmoid, vtanh and test and benchmark --- paddle/fluid/operators/jit/benchmark.cc | 77 ++++++++++++++ paddle/fluid/operators/jit/helper.cc | 30 +++--- paddle/fluid/operators/jit/kernel_base.h | 13 ++- .../fluid/operators/jit/refer/CMakeLists.txt | 5 + paddle/fluid/operators/jit/refer/refer.cc | 6 ++ paddle/fluid/operators/jit/refer/refer.h | 51 +++++++++ paddle/fluid/operators/jit/test.cc | 100 ++++++++++++++++++ .../fluid/operators/math/jit_kernel_refer.h | 40 ------- 8 files changed, 267 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 2ad87e414bd..a7e5eb6cf4a 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -201,6 +201,77 @@ void BenchAXYNKernel() { } } +// return this function avg time +template +double BenchXYNFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, + std::vector& y) { // NOLINT + const T* x_data = x.data(); + T* y_data = y.data(); + const int d = y.size(); + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(x_data, y_data, d); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(x_data, y_data, d); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; +} + +template +void BenchXYNKernel() { + namespace jit = paddle::operators::jit; + for (int d : TestSizes()) { + std::vector> infos; + std::vector x(d), y(d); + RandomVec(d, x.data()); + // test refer + auto refer = jit::GetRefer>(); + if (refer) { + auto res = BenchXYNFunc>(refer, x, y); + infos.push_back(std::make_pair("Refer", res)); + } + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(d); + if (jitcode) { + auto res = BenchXYNFunc>(jitcode, x, y); + infos.push_back(std::make_pair("JitCode", res)); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = + dynamic_cast>*>(impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + auto res = BenchXYNFunc>(more, x, y); + infos.push_back(std::make_pair("More", res)); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(d); + if (!tgt) { + LOG(ERROR) << "Target can not be empty!"; + } + auto res = BenchXYNFunc>(tgt, x, y); + infos.push_back(std::make_pair("Target", res)); + // print + std::ostringstream loginfos; + loginfos << "Kernel Type: " << jit::to_string(KT) << ", size 
" << d << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); + } +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: @@ -222,4 +293,10 @@ int main(int argc, char* argv[]) { BenchAXYNKernel(); BenchAXYNKernel(); + + BenchXYNKernel(); + BenchXYNKernel(); + BenchXYNKernel(); + BenchXYNKernel(); + BenchXYNKernel(); } diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index c9aaffb8b8d..c010b64c9cb 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -19,28 +19,30 @@ namespace paddle { namespace operators { namespace jit { +#define ONE_CASE(key) \ + case key: \ + return #key + const char* to_string(KernelType kt) { switch (kt) { - case vmul: - return "vmul"; - case vadd: - return "vadd"; - case vaddrelu: - return "vaddrelu"; - case vsub: - return "vsub"; - case vscal: - return "vscal"; - case vexp: - return "vexp"; - case vaddbias: - return "vaddbias"; + ONE_CASE(vmul); + ONE_CASE(vadd); + ONE_CASE(vaddrelu); + ONE_CASE(vsub); + ONE_CASE(vscal); + ONE_CASE(vaddbias); + ONE_CASE(vrelu); + ONE_CASE(videntity); + ONE_CASE(vexp); + ONE_CASE(vsigmoid); + ONE_CASE(vtanh); default: PADDLE_THROW("Not support type: %d", kt); return "NOT JITKernel"; } return nullptr; } +#undef ONE_CASE } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 74ecf3dade5..29b881b7540 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -26,7 +26,11 @@ typedef enum { vsub, vscal, vaddbias, - vexp + vrelu, + videntity, + vexp, + vsigmoid, + vtanh } KernelType; template @@ -39,6 +43,13 @@ struct XYZNTuples { template struct AXYNTuples : public XYZNTuples {}; +template +struct XYNTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, T*, int); +}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index afe3f6ca0f4..dc07ddb914b 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -13,3 +13,8 @@ USE_JITKERNEL_REFER(vaddrelu) USE_JITKERNEL_REFER(vsub) USE_JITKERNEL_REFER(vscal) USE_JITKERNEL_REFER(vaddbias) +USE_JITKERNEL_REFER(vrelu) +USE_JITKERNEL_REFER(videntity) +USE_JITKERNEL_REFER(vexp) +USE_JITKERNEL_REFER(vsigmoid) +USE_JITKERNEL_REFER(vtanh) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 4e9c530344b..f716ca89c58 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -29,4 +29,10 @@ REGISTER_REFER_KERNEL(vsub, VSub); REGISTER_REFER_KERNEL(vscal, VScal); REGISTER_REFER_KERNEL(vaddbias, VAddBias); +REGISTER_REFER_KERNEL(vrelu, VRelu); +REGISTER_REFER_KERNEL(videntity, VIdentity); +REGISTER_REFER_KERNEL(vexp, VExp); +REGISTER_REFER_KERNEL(vsigmoid, VSigmoid); +REGISTER_REFER_KERNEL(vtanh, VTanh); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 32ac5bf2d78..7ef60a2d539 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -66,6 +66,50 @@ void VAddBias(const T* a, 
const T* x, T* y, int n) { } } +template +void VRelu(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template +inline void VIdentity(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i]; + } +} + +template +void VExp(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +void VSigmoid(const T* x, T* y, int n) { + // y = 1 / (1 + e^-x) + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); + } +} + +template +void VTanh(const T* x, T* y, int n) { + // y = 2 * sigmoid(2x) - 1 + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -83,6 +127,13 @@ DECLARE_REFER_KERNEL(VSub, XYZNTuples); DECLARE_REFER_KERNEL(VScal, AXYNTuples); DECLARE_REFER_KERNEL(VAddBias, AXYNTuples); +// const T* x, T* y, int n +DECLARE_REFER_KERNEL(VRelu, XYNTuples); +DECLARE_REFER_KERNEL(VIdentity, XYNTuples); +DECLARE_REFER_KERNEL(VExp, XYNTuples); +DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); +DECLARE_REFER_KERNEL(VTanh, XYNTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index ea2cb7b7a42..4c9b853b6e6 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -250,6 +250,106 @@ TEST(JITKernel, vaddbias) { TestAXYNKernel(); } +template +void TestXYNFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); +} + +template +void TestXYNKernel() { + namespace jit = paddle::operators::jit; + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + for (int d : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + + std::vector x(d), yref(d); + std::vector xinp(d); // inplace test + RandomVec(d, x.data()); + std::copy(x.begin(), x.end(), xinp.begin()); + + const T* x_data = x.data(); + T* yref_data = yref.data(); + T* xinp_data = xinp.data(); + // test refer code inplace + ref(x_data, yref_data, d); + ref(xinp_data, xinp_data, d); + ExpectEQ(xinp_data, yref_data, d); + + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(d); + if (jitcode) { + VLOG(10) << "Test Jitcode Kernel, size: " << d; + TestXYNFunc>(jitcode, x, yref); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = + dynamic_cast>*>(impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel, size: " << d; + TestXYNFunc>(more, x, yref); + } + } + } + // Test result from Get 
function + VLOG(10) << "Test Get function, size: " << d; + auto tgt = jit::Get, PlaceType>(d); + TestXYNFunc>(tgt, x, yref); + } +} + +TEST(JITKernel, vrelu) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, videntity) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vexp) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vsigmoid) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vtanh) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + TEST(JITKernel, pool) { // TODO(TJ): add some test } diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index b5ee07e7488..a03e851de56 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -24,46 +24,6 @@ namespace math { namespace jitkernel { namespace refer { -template -void VRelu(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; - } -} - -template -inline void VIdentity(const T* x, T* y, int n) {} - -template -void VExp(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - -template -void VSigmoid(const T* x, T* y, int n) { - // y = 1 / (1 + e^-x) - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); - y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); - } -} - -template -void VTanh(const T* x, T* y, int n) { - // y = 2 * sigmoid(2x) - 1 - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoid(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - template void (*getActFunc(const std::string& type))(const T*, T*, int) { // NOLINT if (type == "sigmoid") { -- GitLab From 1bac8f918c9fca90db7e95dd2f27d7946ffebc40 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 15:00:20 +0800 Subject: [PATCH 0189/2367] fix api.spec test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index db7b0d34a30..6c6026911b3 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -26,8 +26,8 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.ExecutionStrategy.ExecutorType.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy.ExecutorType, arg0: int) -> None +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: 
paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None -- GitLab From f81957a7531d7cdb4e4f0a96c0d0f5f8752c92b7 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 13 Dec 2018 15:12:37 +0800 Subject: [PATCH 0190/2367] refine cmake for pslib & pre_define --- CMakeLists.txt | 2 +- cmake/configure.cmake | 4 ++++ cmake/external/libmct.cmake | 13 +++++++------ cmake/external/pslib_brpc.cmake | 15 ++++++++------- paddle/fluid/framework/CMakeLists.txt | 7 ++++++- paddle/fluid/framework/async_executor.cc | 14 ++++++++++++++ paddle/fluid/framework/async_executor.h | 11 +++++++---- paddle/fluid/framework/executor_thread_worker.cc | 6 +++++- paddle/fluid/framework/executor_thread_worker.h | 12 ++++++++++-- paddle/fluid/pybind/async_executor_py.cc | 11 +++++++++++ 10 files changed, 73 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c3b4349c8c3..68eb8718ee8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -222,7 +222,7 @@ if(WITH_PSLIB) include(external/libmct) include(external/pslib_brpc) include(external/pslib) -endif() +endif(WITH_PSLIB) if(WITH_DISTRIBUTE) if(WITH_GRPC) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4e17ddee739..03076c44c3b 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -84,6 +84,10 @@ if(NOT WITH_GOLANG) add_definitions(-DPADDLE_WITHOUT_GOLANG) endif(NOT WITH_GOLANG) +if(WITH_PSLIB) + add_definitions(-DPADDLE_WITH_PSLIB) +endif() + if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake index 239183cb6d0..27cff8cfb63 100644 --- a/cmake/external/libmct.cmake +++ b/cmake/external/libmct.cmake @@ -29,10 +29,11 @@ INCLUDE(ExternalProject) SET(LIBMCT_PROJECT "extern_libmct") IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL)) MESSAGE(STATUS "use pre defined download url") - SET(LIBMCT_VER "libmct" CACHE STRING "" FORCE) #todo libmct version - SET(LIBMCT_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${LIBMCT_VER}.tar.gz" CACHE STRING "" FORCE) #todo libmct url + SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE) + SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE) + SET(LIBMCT_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${LIBMCT_VER}/${LIBMCT_NAME}.tar.gz" CACHE STRING "" FORCE) ENDIF() -MESSAGE(STATUS "LIBMCT_VER: ${LIBMCT_VER}, LIBMCT_URL: ${LIBMCT_URL}") +MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}") SET(LIBMCT_SOURCE_DIR "${THIRD_PARTY_PATH}/libmct") SET(LIBMCT_DOWNLOAD_DIR "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}") SET(LIBMCT_DST_DIR "libmct") @@ -47,7 +48,7 @@ INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR}) FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt "PROJECT(LIBMCT)\n" "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${LIBMCT_VER}/include ${LIBMCT_VER}/lib \n" + "install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n" " DESTINATION ${LIBMCT_DST_DIR})\n") ExternalProject_Add( @@ -55,8 +56,8 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${LIBMCT_SOURCE_DIR} DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_VER}.tar.gz - && tar zxvf ${LIBMCT_VER}.tar.gz + DOWNLOAD_COMMAND wget --no-check-certificate 
${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz + && tar zxvf ${LIBMCT_NAME}.tar.gz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake index 92019eef26f..7ff5a8aca18 100644 --- a/cmake/external/pslib_brpc.cmake +++ b/cmake/external/pslib_brpc.cmake @@ -27,12 +27,13 @@ ENDIF() INCLUDE(ExternalProject) SET(PSLIB_BRPC_PROJECT "extern_pslib_brpc") -IF((NOT DEFINED PSLIB_BRPC_VER) OR (NOT DEFINED PSLIB_BRPC_URL)) +IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL)) MESSAGE(STATUS "use pre defined download url") - SET(PSLIB_BRPC_VER "pslib_brpc" CACHE STRING "" FORCE) #todo pslib version - SET(PSLIB_BRPC_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${PSLIB_BRPC_VER}.tar.gz" CACHE STRING "" FORCE) #todo pslib_brpc url + SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE) + SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE) + SET(PSLIB_BRPC_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_BRPC_VER}/${PSLIB_BRPC_NAME}.tar.gz" CACHE STRING "" FORCE) ENDIF() -MESSAGE(STATUS "PSLIB_BRPC_VER: ${PSLIB_BRPC_VER}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") +MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") SET(PSLIB_BRPC_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib_brpc") SET(PSLIB_BRPC_DOWNLOAD_DIR "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}") SET(PSLIB_BRPC_DST_DIR "pslib_brpc") @@ -50,7 +51,7 @@ INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR}) FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt "PROJECT(PSLIB_BRPC)\n" "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${PSLIB_BRPC_VER}/include ${PSLIB_BRPC_VER}/lib \n" + "install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n" " DESTINATION ${PSLIB_BRPC_DST_DIR})\n") ExternalProject_Add( @@ -58,8 +59,8 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PSLIB_BRPC_SOURCE_DIR} DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_VER}.tar.gz - && tar zxvf ${PSLIB_BRPC_VER}.tar.gz + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz + && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT} diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6fdc73e93ae..f3d66cd8830 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -180,7 +180,12 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) +if(WITH_PSLIB) + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) +else() + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table 
feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +endif(WITH_PSLIB) + cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 8231aff1429..fe6488f4b6f 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -29,7 +29,9 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" +#ifdef PADDLE_WITH_PSLIB #include "pslib.h" +#endif namespace paddle { namespace framework { @@ -48,9 +50,11 @@ void AsyncExecutor::CreateThreads( worker->SetDataFeed(reader); worker->SetFetchVarNames(fetch_var_names); worker->BindingDataFeedMemory(); +#ifdef PADDLE_WITH_PSLIB worker->SetPSlibPtr(_pslib_ptr); worker->SetPullDenseThread(_pull_dense_thread); worker->SetParamConfig(&_param_config); +#endif } void PrepareReaders(std::vector>& readers, // NOLINT @@ -64,6 +68,7 @@ void PrepareReaders(std::vector>& readers, // NOLINT readers[0]->SetFileList(filelist); } +#ifdef PADDLE_WITH_PSLIB void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { _pslib_ptr = std::shared_ptr( @@ -231,6 +236,7 @@ void AsyncExecutor::PrepareDenseThread(const std::string& mode) { _pull_dense_thread->start(); } } +#endif void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::string& data_feed_desc_str, @@ -279,15 +285,21 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // todo: should be factory method for creating datafeed std::vector> readers; PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); +#ifdef PADDLE_WITH_PSLIB PrepareDenseThread(mode); +#endif std::vector> workers; workers.resize(actual_thread_num); for (auto& worker : workers) { +#ifdef PADDLE_WITH_PSLIB if (mode == "mpi") { worker.reset(new AsyncExecutorThreadWorker); } else { worker.reset(new ExecutorThreadWorker); } +#else + worker.reset(new ExecutorThreadWorker); +#endif } // prepare thread resource here @@ -306,9 +318,11 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& th : threads) { th.join(); } +#ifdef PADDLE_WITH_PSLIB if (mode == "mpi") { _pull_dense_thread->stop(); } +#endif root_scope_->DropKids(); return; diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 16540c2df2b..d6f16d91338 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -64,6 +64,7 @@ class AsyncExecutor { const std::vector& fetch_names, const std::string& mode, const bool debug = false); +#ifdef PADDLE_WITH_PSLIB void InitServer(const std::string& dist_desc, int index); void InitWorker( const std::string& dist_desc, @@ -75,7 +76,7 @@ class AsyncExecutor { void InitModel(); void SaveModel(const std::string& path); void InitParamConfig(); - +#endif private: void CreateThreads(ExecutorThreadWorker* worker, const ProgramDesc& main_program, @@ -83,16 +84,18 @@ class AsyncExecutor { const std::vector& fetch_var_names, Scope* root_scope, const int thread_index, const bool debug); +#ifdef PADDLE_WITH_PSLIB void PrepareDenseThread(const std::string& mode); - +#endif public: +#ifdef PADDLE_WITH_PSLIB std::shared_ptr _pslib_ptr; std::shared_ptr _pull_dense_thread; + AsyncWorkerParamConfig _param_config; +#endif Scope* root_scope_; platform::Place place_; - 
AsyncWorkerParamConfig _param_config; - private: int actual_thread_num; diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index df15a4d293a..a58c2692204 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -31,6 +31,7 @@ limitations under the License. */ namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB int DensePullThread::start() { _running = true; _t = std::thread(&DensePullThread::run, this); @@ -112,7 +113,8 @@ void DensePullThread::increase_thread_version( std::lock_guard lock(_mutex_for_version); _training_versions[table_id][thread_id]++; } - +#endif + void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { auto& block = program.Block(0); op_names_.clear(); @@ -302,6 +304,7 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { root_scope_ = g_scope; } +#ifdef PADDLE_WITH_PSLIB // AsyncExecutor void AsyncExecutorThreadWorker::TrainFiles() { SetDevice(); @@ -659,6 +662,7 @@ void AsyncExecutorThreadWorker::check_pull_push_memory( } } } +#endif } // einit_modelnd namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 93373b1d2eb..c23eb09470d 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -25,14 +25,16 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#ifdef PADDLE_WITH_PSLIB #include "pslib.h" +#endif namespace paddle { namespace framework { -const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; - void CreateTensor(Variable* var, proto::VarType::Type var_type); +#ifdef PADDLE_WITH_PSLIB +const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; struct AsyncWorkerParamConfig { int slot_dim; @@ -130,6 +132,8 @@ class DensePullThread { float _total_batch_num = 0; }; +#endif + class ExecutorThreadWorker { public: ExecutorThreadWorker() @@ -154,12 +158,14 @@ class ExecutorThreadWorker { virtual void TrainFiles(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); +#ifdef PADDLE_WITH_PSLIB virtual void SetPSlibPtr( std::shared_ptr pslib_ptr) {}; virtual void SetPullDenseThread( std::shared_ptr dpt) {} virtual void SetParamConfig( AsyncWorkerParamConfig * param_config) {} +#endif private: void CreateThreadScope(const framework::ProgramDesc& program); @@ -188,6 +194,7 @@ class ExecutorThreadWorker { bool debug_; }; +#ifdef PADDLE_WITH_PSLIB class AsyncExecutorThreadWorker: public ExecutorThreadWorker { public: AsyncExecutorThreadWorker() {} @@ -238,6 +245,7 @@ class AsyncExecutorThreadWorker: public ExecutorThreadWorker { AsyncWorkerParamConfig* _param_config; }; +#endif } // namespace framework } // namespace paddle diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index 8dfba0d2694..71a0e256e43 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -41,6 +41,7 @@ namespace pd = paddle::framework; namespace paddle { namespace pybind { using set_name_func = void (pd::DataFeedDesc::*)(const std::string&); +#ifdef PADDLE_WITH_PSLIB void BindAsyncExecutor(py::module* m) { py::class_(*m, "AsyncExecutor") .def(py::init([](framework::Scope* scope, const 
platform::Place& place) { @@ -56,5 +57,15 @@ void BindAsyncExecutor(py::module* m) { .def("init_model", &framework::AsyncExecutor::InitModel) .def("save_model", &framework::AsyncExecutor::SaveModel); } // end BindAsyncExecutor +#else +void BindAsyncExecutor(py::module* m) { + py::class_(*m, "AsyncExecutor") + .def(py::init([](framework::Scope* scope, const platform::Place& place) { + return std::unique_ptr( + new framework::AsyncExecutor(scope, place)); + })) + .def("run_from_files", &framework::AsyncExecutor::RunFromFile) +} // end BindAsyncExecutor +#endif } // end namespace pybind } // end namespace paddle -- GitLab From aa38fc4ce5cb73e01b614ff57fae9553dcf30abf Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Dec 2018 15:20:40 +0800 Subject: [PATCH 0191/2367] Fix compile test=develop --- paddle/fluid/inference/tests/api/tester_helper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 8209a049f46..4c8bce4600a 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -373,7 +373,7 @@ static bool CompareTensorData(const framework::LoDTensor &a, } for (size_t i = 0; i < a_size; i++) { - if (a.type() == typeid(float)) { + if (a.type() == framework::proto::VarType::FP32) { const auto *a_data = a.data(); const auto *b_data = b.data(); if (std::abs(a_data[i] - b_data[i]) > 1e-3) { @@ -382,7 +382,7 @@ static bool CompareTensorData(const framework::LoDTensor &a, b_data[i]); return false; } - } else if (a.type() == typeid(int64_t)) { + } else if (a.type() == framework::proto::VarType::INT64) { const auto *a_data = a.data(); const auto *b_data = b.data(); if (std::abs(a_data[i] - b_data[i]) > 1e-3) { -- GitLab From 95b887c4f26c794e2b01daa5c97b32582de7c56a Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 13 Dec 2018 15:30:31 +0800 Subject: [PATCH 0192/2367] remove commit --- paddle/fluid/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index d980b36d9be..6b526f0103a 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,7 +1,6 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) -#add_subdirectory(distributed) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) -- GitLab From c9b799896e6b78a4248cd8c9288ab6adacf628ad Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 15:33:45 +0800 Subject: [PATCH 0193/2367] fix tag in async_executor --- paddle/fluid/framework/async_executor.cc | 238 ++++++++--------- paddle/fluid/framework/async_executor.h | 5 +- .../fluid/framework/executor_thread_worker.cc | 249 +++++++++--------- .../fluid/framework/executor_thread_worker.h | 178 ++++++------- 4 files changed, 336 insertions(+), 334 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index fe6488f4b6f..0fe7f3bd5c0 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -102,139 +102,139 @@ void AsyncExecutor::GatherServers( } void AsyncExecutor::InitParamConfig() { - for (int i = 0; i < - _pslib_ptr->get_param()->server_param().\ - downpour_server_param().\ - downpour_table_param_size(); - ++i) { - if (_pslib_ptr->get_param()->server_param().\ - downpour_server_param().downpour_table_param(i).\ - table_class().find("SparseTable") != -1) { - 
_param_config.fea_dim = _pslib_ptr->get_param()->server_param().\ - downpour_server_param().\ - downpour_table_param(i).\ - accessor().fea_dim(); - break; - } + for (int i = 0; i < + _pslib_ptr->get_param()->server_param(). \ + downpour_server_param(). \ + downpour_table_param_size(); + ++i) { + if (_pslib_ptr->get_param()->server_param(). \ + downpour_server_param().downpour_table_param(i). \ + table_class().find("SparseTable") != -1) { + _param_config.fea_dim = _pslib_ptr->get_param()->server_param(). \ + downpour_server_param(). \ + downpour_table_param(i). \ + accessor().fea_dim(); + break; } - _param_config.slot_dim = _param_config.fea_dim - 2; - _param_config.tmp_push_dense_wait_times = static_cast( - _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); - _param_config.tmp_push_sparse_wait_times = static_cast( - _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); - - for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); - ++t) { - _param_config.skip_op.push_back( - _pslib_ptr->get_param()->trainer_param().skip_op(t)); + } + _param_config.slot_dim = _param_config.fea_dim - 2; + _param_config.tmp_push_dense_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + _param_config.tmp_push_sparse_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); + ++t) { + _param_config.skip_op.push_back( + _pslib_ptr->get_param()->trainer_param().skip_op(t)); + } + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); + ++t) { + auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); + std::vector tmp_sparse_variable_name; + for (int i = 0u; i < table.slot_value_size(); ++i) { + tmp_sparse_variable_name.push_back(table.slot_value(i)); + _param_config.slot_alias_to_table[table.slot_key(i)] = + table.table_id(); } - - for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); - ++t) { - auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); - std::vector tmp_sparse_variable_name; - for (int i = 0u; i < table.slot_value_size(); ++i) { - tmp_sparse_variable_name.push_back(table.slot_value(i)); - _param_config.slot_alias_to_table[table.slot_key(i)] = - table.table_id(); - } - std::vector tmp_sparse_gradient_variable_name; - for (auto i = 0u; i < table.slot_gradient_size(); ++i) { - tmp_sparse_gradient_variable_name.push_back( - table.slot_gradient(i)); - } - _param_config.slot_input_vec[table.table_id()] = - std::move(tmp_sparse_variable_name); - _param_config.gradient_var[table.table_id()] = - std::move(tmp_sparse_gradient_variable_name); - _param_config.sparse_table_id.push_back(table.table_id()); + std::vector tmp_sparse_gradient_variable_name; + for (auto i = 0u; i < table.slot_gradient_size(); ++i) { + tmp_sparse_gradient_variable_name.push_back( + table.slot_gradient(i)); } - - for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); - ++t) { - auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); - std::vector tmp_dense_variable_name; - for (int i = 0u; i < table.dense_variable_name_size(); ++i) { - tmp_dense_variable_name.push_back(table.dense_variable_name(i)); - } - std::vector tmp_dense_gradient_variable_name; - for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) { - tmp_dense_gradient_variable_name.push_back( - 
table.dense_gradient_variable_name(i)); - } - _param_config.dense_variable_name[table.table_id()] = - std::move(tmp_dense_variable_name); - _param_config.dense_gradient_variable_name[table.table_id()] = - std::move(tmp_dense_gradient_variable_name); - _param_config.dense_table_id.push_back(table.table_id()); - _param_config.dense_table_size.push_back(table.fea_dim()); + _param_config.slot_input_vec[table.table_id()] = + std::move(tmp_sparse_variable_name); + _param_config.gradient_var[table.table_id()] = + std::move(tmp_sparse_gradient_variable_name); + _param_config.sparse_table_id.push_back(table.table_id()); + } + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); + ++t) { + auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); + std::vector tmp_dense_variable_name; + for (int i = 0u; i < table.dense_variable_name_size(); ++i) { + tmp_dense_variable_name.push_back(table.dense_variable_name(i)); + } + std::vector tmp_dense_gradient_variable_name; + for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) { + tmp_dense_gradient_variable_name.push_back( + table.dense_gradient_variable_name(i)); } + _param_config.dense_variable_name[table.table_id()] = + std::move(tmp_dense_variable_name); + _param_config.dense_gradient_variable_name[table.table_id()] = + std::move(tmp_dense_gradient_variable_name); + _param_config.dense_table_id.push_back(table.table_id()); + _param_config.dense_table_size.push_back(table.fea_dim()); + } } void AsyncExecutor::InitModel() { - for (auto table_id : _param_config.dense_table_id) { - std::vector regions; - for (auto& t : _param_config.dense_variable_name[table_id]) { - Variable* var = root_scope_->FindVar(t); - CHECK(var != nullptr) << "var[" << t << "] not found"; - LoDTensor* tensor = var->GetMutable(); - - float* g = tensor->data(); - CHECK(g != nullptr) << "var[" << t << "] value not initialized"; - - float init_range = 0.2; - int rown = tensor->dims()[0]; - init_range /= sqrt(rown); - - std::normal_distribution ndistr(0.0, 1.0); - for (auto i = 0u; i < tensor->numel(); ++i) { - g[i] = ndistr(local_random_engine()) * init_range; - } - - paddle::ps::Region reg(g, tensor->numel()); - regions.emplace_back(std::move(reg)); - } - - auto push_status = - _pslib_ptr->_worker_ptr->push_dense_param( - regions.data(), regions.size(), table_id); - push_status.wait(); - auto status = push_status.get(); - if (status != 0) { - LOG(FATAL) << "push dense param failed, status[" << status << "]"; - exit(-1); - } + for (auto table_id : _param_config.dense_table_id) { + std::vector regions; + for (auto& t : _param_config.dense_variable_name[table_id]) { + Variable* var = root_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + + float* g = tensor->data(); + CHECK(g != nullptr) << "var[" << t << "] value not initialized"; + + float init_range = 0.2; + int rown = tensor->dims()[0]; + init_range /= sqrt(rown); + + std::normal_distribution ndistr(0.0, 1.0); + for (auto i = 0u; i < tensor->numel(); ++i) { + g[i] = ndistr(local_random_engine()) * init_range; + } + + paddle::ps::Region reg(g, tensor->numel()); + regions.emplace_back(std::move(reg)); } + + auto push_status = + _pslib_ptr->_worker_ptr->push_dense_param( + regions.data(), regions.size(), table_id); + push_status.wait(); + auto status = push_status.get(); + if (status != 0) { + LOG(FATAL) << "push dense param failed, status[" << status << "]"; + exit(-1); + } + } } void 
AsyncExecutor::SaveModel(const std::string& path) { - auto ret = _pslib_ptr->_worker_ptr->flush(); - ret.wait(); - ret = _pslib_ptr->_worker_ptr->save(path, 0); - ret.wait(); - int32_t feasign_cnt = ret.get(); - if (feasign_cnt == -1) { // (colourful-tree) TODO should be feasign_cnt < 0 - LOG(FATAL) << "save model failed"; - exit(-1); - } + auto ret = _pslib_ptr->_worker_ptr->flush(); + ret.wait(); + ret = _pslib_ptr->_worker_ptr->save(path, 0); + ret.wait(); + int32_t feasign_cnt = ret.get(); + if (feasign_cnt == -1) { // (colourful-tree) TODO should be feasign_cnt < 0 + LOG(FATAL) << "save model failed"; + exit(-1); + } } void AsyncExecutor::PrepareDenseThread(const std::string& mode) { - if (mode == "mpi") { - DensePullThreadParam param; - param.ps_client = _pslib_ptr->_worker_ptr;; - param.threshold = 1; - param.training_thread_num = actual_thread_num; - param.root_scope = root_scope_; - param.dense_params = &_param_config.dense_variable_name; - - _pull_dense_thread = std::shared_ptr( - new DensePullThread(param)); - _pull_dense_thread->start(); - } + if (mode == "mpi") { + DensePullThreadParam param; + param.ps_client = _pslib_ptr->_worker_ptr;; + param.threshold = 1; + param.training_thread_num = actual_thread_num; + param.root_scope = root_scope_; + param.dense_params = &_param_config.dense_variable_name; + + _pull_dense_thread = std::shared_ptr( + new DensePullThread(param)); + _pull_dense_thread->start(); + } } #endif diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index d6f16d91338..12642126411 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -45,7 +45,8 @@ inline std::default_random_engine& local_random_engine() { engine_wrapper_t() { static std::atomic x(0); std::seed_seq sseq = {x++, x++, x++, - static_cast(current_realtime() * 1000)}; + static_cast( + current_realtime() * 1000)}; engine.seed(sseq); } }; @@ -77,6 +78,7 @@ class AsyncExecutor { void SaveModel(const std::string& path); void InitParamConfig(); #endif + private: void CreateThreads(ExecutorThreadWorker* worker, const ProgramDesc& main_program, @@ -87,6 +89,7 @@ class AsyncExecutor { #ifdef PADDLE_WITH_PSLIB void PrepareDenseThread(const std::string& mode); #endif + public: #ifdef PADDLE_WITH_PSLIB std::shared_ptr _pslib_ptr; diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index a58c2692204..59679842bc1 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -33,87 +33,87 @@ namespace framework { #ifdef PADDLE_WITH_PSLIB int DensePullThread::start() { - _running = true; - _t = std::thread(&DensePullThread::run, this); - return 0; + _running = true; + _t = std::thread(&DensePullThread::run, this); + return 0; } void DensePullThread::run() { - while (_running) { - _pull_dense_status.resize(0); - for (auto& t : _dense_variable_name) { - if (check_update_param(t.first)) { - auto status = pull_dense(t.first); - _pull_dense_status.emplace_back(std::move(status)); - reset_thread_version(t.first); - } - } - if (_pull_dense_status.size() != 0) { - wait_all(); - } - - usleep(_sleep_time_ms * 1000); + while (_running) { + _pull_dense_status.resize(0); + for (auto& t : _dense_variable_name) { + if (check_update_param(t.first)) { + auto status = pull_dense(t.first); + _pull_dense_status.emplace_back(std::move(status)); + reset_thread_version(t.first); + } + } + if (_pull_dense_status.size() != 0) { + 
wait_all(); } + + usleep(_sleep_time_ms * 1000); + } } bool DensePullThread::check_update_param(uint64_t table_id) { - { - std::lock_guard lock(_mutex_for_version); - auto& version = _training_versions[table_id]; - _current_version[table_id] = - *(std::min_element(version.begin(), version.end())); - } - if (_current_version[table_id] - _last_versions[table_id] < _threshold) { - return false; - } - return true; + { + std::lock_guard lock(_mutex_for_version); + auto& version = _training_versions[table_id]; + _current_version[table_id] = + *(std::min_element(version.begin(), version.end())); + } + if (_current_version[table_id] - _last_versions[table_id] < _threshold) { + return false; + } + return true; } void DensePullThread::reset_thread_version(uint64_t table_id) { - std::lock_guard lock(_mutex_for_version); - _last_versions[table_id] = _current_version[table_id]; + std::lock_guard lock(_mutex_for_version); + _last_versions[table_id] = _current_version[table_id]; } std::future DensePullThread::pull_dense(uint64_t table_id) { - auto& regions = _regions[table_id]; - regions.clear(); - auto& variables = _dense_variable_name[table_id]; - regions.resize(variables.size()); - - for (auto i = 0u; i < variables.size(); ++i) { - auto& t = variables[i]; - Variable* var = _root_scope->FindVar(t); - LoDTensor* tensor = var->GetMutable(); - - float* w = tensor->data(); - paddle::ps::Region reg(w, tensor->numel()); - regions[i] = std::move(reg); - } - return _ps_client->pull_dense(regions.data(), regions.size(), table_id); + auto& regions = _regions[table_id]; + regions.clear(); + auto& variables = _dense_variable_name[table_id]; + regions.resize(variables.size()); + + for (auto i = 0u; i < variables.size(); ++i) { + auto& t = variables[i]; + Variable* var = _root_scope->FindVar(t); + LoDTensor* tensor = var->GetMutable(); + + float* w = tensor->data(); + paddle::ps::Region reg(w, tensor->numel()); + regions[i] = std::move(reg); + } + return _ps_client->pull_dense(regions.data(), regions.size(), table_id); } void DensePullThread::wait_all() { - for (auto& t : _pull_dense_status) { - t.wait(); - auto status = t.get(); - if (status != 0) { - LOG(WARNING) << "pull dense failed times:" << - ++_pull_dense_fail_times; - } + for (auto& t : _pull_dense_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(WARNING) << "pull dense failed times:" << + ++_pull_dense_fail_times; } - - if (_pull_dense_fail_times > 20) { - LOG(FATAL) << "pull dense failed times more than 20 times"; - exit(-1); - } - - _pull_dense_status.resize(0); + } + + if (_pull_dense_fail_times > 20) { + LOG(FATAL) << "pull dense failed times more than 20 times"; + exit(-1); + } + + _pull_dense_status.resize(0); } void DensePullThread::increase_thread_version( int thread_id, uint64_t table_id) { - std::lock_guard lock(_mutex_for_version); - _training_versions[table_id][thread_id]++; + std::lock_guard lock(_mutex_for_version); + _training_versions[table_id][thread_id]++; } -#endif +#endif void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { auto& block = program.Block(0); @@ -336,56 +336,56 @@ void AsyncExecutorThreadWorker::TrainFiles() { void AsyncExecutorThreadWorker::SetPSlibPtr( std::shared_ptr pslib_ptr) { - _pslib_ptr = pslib_ptr; + _pslib_ptr = pslib_ptr; } void AsyncExecutorThreadWorker::SetPullDenseThread( std::shared_ptr dpt) { - _pull_dense_thread = dpt; + _pull_dense_thread = dpt; } void AsyncExecutorThreadWorker::TrainOneNetwork() { - PrepareParams(); - - for (auto& op : ops_) { - if 
(op->Type().find("sgd") != std::string::npos) { - continue; - } - bool need_skip = false; - for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { - if (op->Type().find(_param_config->skip_op[t]) != - std::string::npos) { - need_skip = true; - break; - } - } - if (!need_skip) { - op->Run(*thread_scope_, place_); - } + PrepareParams(); + + for (auto& op : ops_) { + if (op->Type().find("sgd") != std::string::npos) { + continue; + } + bool need_skip = false; + for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { + if (op->Type().find(_param_config->skip_op[t]) != + std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*thread_scope_, place_); } - UpdateParams(); + } + UpdateParams(); } void AsyncExecutorThreadWorker::SetParamConfig( AsyncWorkerParamConfig* param_config) { - _param_config = param_config; + _param_config = param_config; } void AsyncExecutorThreadWorker::PrepareParams() { - for (auto table_id : _param_config->sparse_table_id) { - PullSparse(table_id); - for (auto& t : _pull_sparse_status) { - t.wait(); - auto status = t.get(); - if (status != 0) { - LOG(ERROR) << "pull sparse failed, status[" << status << "]"; - exit(-1); - } - } + for (auto table_id : _param_config->sparse_table_id) { + PullSparse(table_id); + for (auto& t : _pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "pull sparse failed, status[" << status << "]"; + exit(-1); + } } - _pull_sparse_status.resize(0); + } + _pull_sparse_status.resize(0); - for (auto table_id : _param_config->sparse_table_id) { - FillSparse(table_id); - } + for (auto table_id : _param_config->sparse_table_id) { + FillSparse(table_id); + } } void AsyncExecutorThreadWorker::UpdateParams() { @@ -426,21 +426,20 @@ void AsyncExecutorThreadWorker::UpdateParams() { } void AsyncExecutorThreadWorker::PushDense(int table_id) { - std::vector regions; - for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { - Variable* var = thread_scope_->FindVar(t); - CHECK(var != nullptr) << "var[" << t << "] not found"; - LoDTensor* tensor = var->GetMutable(); - int count = tensor->numel(); - float* g = tensor->data(); - paddle::ps::Region reg(g, count); - regions.emplace_back(std::move(reg)); - } - - auto status = _pslib_ptr->_worker_ptr->push_dense( - regions.data(), regions.size(), table_id); - _push_dense_status.push_back(std::move(status)); - + std::vector regions; + for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { + Variable* var = thread_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g = tensor->data(); + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + + auto status = _pslib_ptr->_worker_ptr->push_dense( + regions.data(), regions.size(), table_id); + _push_dense_status.push_back(std::move(status)); } void AsyncExecutorThreadWorker::PullSparse(int table_id) { @@ -643,24 +642,24 @@ void AsyncExecutorThreadWorker::check_pull_push_memory( const std::vector& features, std::vector>& push_g, int dim) { - push_g.resize(features.size() + 1); - for (auto& t : push_g) { - t.resize(dim); - } + push_g.resize(features.size() + 1); + for (auto& t : push_g) { + t.resize(dim); + } } void AsyncExecutorThreadWorker::check_pull_push_memory( - const std::vector& features, - std::vector& push_g, - int dim) { - if (features.size() > push_g.size()) { - push_g.reserve(features.size() + 1); - auto size = 
features.size() - push_g.size() + 1; - for (auto i = 0u; i < size; ++i) { - float* ptr = new float[dim]; - push_g.push_back(ptr); - } + const std::vector& features, + std::vector& push_g, + int dim) { + if (features.size() > push_g.size()) { + push_g.reserve(features.size() + 1); + auto size = features.size() - push_g.size() + 1; + for (auto i = 0u; i < size; ++i) { + float* ptr = new float[dim]; + push_g.push_back(ptr); } + } } #endif diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index c23eb09470d..20410b4c069 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -67,79 +67,79 @@ struct DensePullThreadParam { class DensePullThread { public: explicit DensePullThread(const DensePullThreadParam& param) : - _running(false) { - _ps_client = param.ps_client; - _threshold = param.threshold; - _thread_num = param.training_thread_num; - _root_scope = param.root_scope; - _sleep_time_ms = param.sleep_time_ms; - - for (auto& t : *param.dense_params) { - _dense_variable_name[t.first].insert( - _dense_variable_name[t.first].end(), - t.second.begin(), t.second.end()); - _training_versions[t.first].resize(_thread_num, 0); - _last_versions[t.first] = 0; - _current_version[t.first] = 0; - } + _running(false) { + _ps_client = param.ps_client; + _threshold = param.threshold; + _thread_num = param.training_thread_num; + _root_scope = param.root_scope; + _sleep_time_ms = param.sleep_time_ms; + + for (auto& t : *param.dense_params) { + _dense_variable_name[t.first].insert( + _dense_variable_name[t.first].end(), + t.second.begin(), t.second.end()); + _training_versions[t.first].resize(_thread_num, 0); + _last_versions[t.first] = 0; + _current_version[t.first] = 0; } - - int start(); - - void stop() { - if (_running) { - _running = false; - _t.join(); - } + } + + int start(); + + void stop() { + if (_running) { + _running = false; + _t.join(); } - - void increase_thread_version(int thread_id, uint64_t table_id); - void reset_thread_version(uint64_t table_id); - std::future pull_dense(uint64_t table_id); - void pull_dense2(uint64_t table_id); - void wait_all(); - + } + + void increase_thread_version(int thread_id, uint64_t table_id); + void reset_thread_version(uint64_t table_id); + std::future pull_dense(uint64_t table_id); + void pull_dense2(uint64_t table_id); + void wait_all(); + private: - void run(); - bool check_update_param(uint64_t table_id); - + void run(); + bool check_update_param(uint64_t table_id); + private: - std::shared_ptr _ps_client; - int _thread_num; - int _threshold; - int _sleep_time_ms; - Scope* _root_scope; - bool _running; - - std::map _last_versions; - std::map _current_version; - std::mutex _mutex_for_version; - std::map> _training_versions; - std::map> _dense_variable_name; - - std::thread _t; - - std::vector<::std::future> _pull_dense_status; - - std::map> _regions; - uint32_t _pull_dense_fail_times = 0; - - std::vector _base_norm_param; - std::vector _mean; - std::vector _scale; - float _squared_sum_epsilon = 1e-4; - std::mutex _mutex_for_mean_scale; - - float _total_batch_num = 0; + std::shared_ptr _ps_client; + int _thread_num; + int _threshold; + int _sleep_time_ms; + Scope* _root_scope; + bool _running; + + std::map _last_versions; + std::map _current_version; + std::mutex _mutex_for_version; + std::map> _training_versions; + std::map> _dense_variable_name; + + std::thread _t; + + std::vector<::std::future> _pull_dense_status; + + std::map> _regions; + 
uint32_t _pull_dense_fail_times = 0; + + std::vector _base_norm_param; + std::vector _mean; + std::vector _scale; + float _squared_sum_epsilon = 1e-4; + std::mutex _mutex_for_mean_scale; + + float _total_batch_num = 0; }; #endif class ExecutorThreadWorker { public: - ExecutorThreadWorker() - : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} +ExecutorThreadWorker() + : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} virtual ~ExecutorThreadWorker() {} - + void CreateThreadResource(const framework::ProgramDesc& program, const paddle::platform::Place& place); void SetThreadId(int tid); @@ -160,7 +160,7 @@ class ExecutorThreadWorker { void SetFetchVarNames(const std::vector& fetch_var_names); #ifdef PADDLE_WITH_PSLIB virtual void SetPSlibPtr( - std::shared_ptr pslib_ptr) {}; + std::shared_ptr pslib_ptr) {} virtual void SetPullDenseThread( std::shared_ptr dpt) {} virtual void SetParamConfig( @@ -218,32 +218,32 @@ class AsyncExecutorThreadWorker: public ExecutorThreadWorker { void check_pull_push_memory(const std::vector& features, std::vector>& push_g, int dim); - void collect_feasign_info(int table_id); - + void collect_feasign_info(int table_id); + private: - struct FeasignInfo { - uint32_t slot; - uint32_t ins; - int64_t label; - }; - - std::map> _features; - std::map> _fea_info; - std::map>> _feature_value; - std::map>> _feature_push_value; - - - std::shared_ptr _pslib_ptr; - - std::shared_ptr _pull_dense_thread; - - std::vector<::std::future> _pull_sparse_status; - std::vector<::std::future> _pull_dense_status; - std::vector<::std::future> _push_sparse_status; - std::vector<::std::future> _push_dense_status; - - AsyncWorkerParamConfig* _param_config; - + struct FeasignInfo { + uint32_t slot; + uint32_t ins; + int64_t label; + }; + + std::map> _features; + std::map> _fea_info; + std::map>> _feature_value; + std::map>> _feature_push_value; + + + std::shared_ptr _pslib_ptr; + + std::shared_ptr _pull_dense_thread; + + std::vector<::std::future> _pull_sparse_status; + std::vector<::std::future> _pull_dense_status; + std::vector<::std::future> _push_sparse_status; + std::vector<::std::future> _push_dense_status; + + AsyncWorkerParamConfig* _param_config; + }; #endif -- GitLab From e3c4b0dacee78d49a4701db788375b02d0916d6a Mon Sep 17 00:00:00 2001 From: SunGaofeng Date: Thu, 13 Dec 2018 15:46:12 +0800 Subject: [PATCH 0194/2367] this is for psroi_pool op, test=develop (#14796) * Add psroi_pool operator. 
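For reference, the position-sensitive pooling arithmetic that the CPU and GPU kernels in this patch implement can be summarized with a minimal NumPy sketch (single image, one ROI; psroi_pool_ref and its parameter names are illustrative and not part of the patch):

import numpy as np

def psroi_pool_ref(x, roi, out_channels, scale, ph_n, pw_n):
    # x: (C, H, W) score maps with C == out_channels * ph_n * pw_n
    # roi: (x1, y1, x2, y2) in input-image coordinates
    C, H, W = x.shape
    x1, y1, x2, y2 = roi
    start_w, start_h = round(x1) * scale, round(y1) * scale
    end_w, end_h = (round(x2) + 1.) * scale, (round(y2) + 1.) * scale
    bin_h = max(end_h - start_h, 0.1) / ph_n  # the 0.1 clamp avoids 0-sized ROIs
    bin_w = max(end_w - start_w, 0.1) / pw_n
    out = np.zeros((out_channels, ph_n, pw_n), dtype=x.dtype)
    for c in range(out_channels):
        for ph in range(ph_n):
            for pw in range(pw_n):
                # bin boundaries, clipped to the feature-map extent
                hs = min(max(int(np.floor(ph * bin_h + start_h)), 0), H)
                he = min(max(int(np.ceil((ph + 1) * bin_h + start_h)), 0), H)
                ws = min(max(int(np.floor(pw * bin_w + start_w)), 0), W)
                we = min(max(int(np.ceil((pw + 1) * bin_w + start_w)), 0), W)
                # position-sensitive: every (c, ph, pw) averages its own score map
                ch = (c * ph_n + ph) * pw_n + pw
                region = x[ch, hs:he, ws:we]
                out[c, ph, pw] = region.mean() if region.size else 0.
    return out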
--- paddle/fluid/API.spec | 1 + paddle/fluid/operators/psroi_pool_op.cc | 173 +++++++++++ paddle/fluid/operators/psroi_pool_op.cu | 294 ++++++++++++++++++ paddle/fluid/operators/psroi_pool_op.h | 253 +++++++++++++++ python/paddle/fluid/layers/nn.py | 55 ++++ .../fluid/tests/unittests/test_layers.py | 10 + .../tests/unittests/test_psroi_pool_op.py | 134 ++++++++ 7 files changed, 920 insertions(+) create mode 100644 paddle/fluid/operators/psroi_pool_op.cc create mode 100644 paddle/fluid/operators/psroi_pool_op.cu create mode 100644 paddle/fluid/operators/psroi_pool_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_psroi_pool_op.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index fd4cf92d85d..8e6482ca981 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -198,6 +198,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) +paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc new file mode 100644 index 00000000000..6978d9c5dc5 --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/psroi_pool_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), " + "the input of PSROIPoolOp. " + "The format of input tensor is NCHW. Where N is the batch size, " + "C is the number of input channels, " + "H is the height of the input feature map, and " + "W is the width."); + AddInput("ROIs", + "(LoDTensor), " + "ROIs (Regions of Interest) to pool over. 
" + "should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]. " + "where (x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates. " + "The roi batch index can be calculated from LoD."); + AddOutput("Out", + "(Tensor), " + "the output of PSROIPoolOp is a 4-D Tensor with shape " + "(num_rois, output_channels, pooled_h, pooled_w)."); + AddAttr( + "output_channels", + "(int), " + "the number of channels of the output feature map. " + "For a task of C classes of objects, output_channels should be " + "(C + 1) for classification only."); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "the pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "the pooled output width.") + .SetDefault(1); + AddComment(R"Doc( +**PSROIPool Operator** + +Position sensitive region of interest pooling (also known as PSROIPooling) is to perform +position-sensitive average pooling on regions of interest specified by input, takes as +input N position-sensitive score maps and a list of num_rois regions of interest. + +PSROIPooling for R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details. + )Doc"); + } +}; + +class PSROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PSROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of PSROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PSROIPoolOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + + PADDLE_ENFORCE(input_dims.size() == 4, + "The format of input tensor is NCHW"); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]"); + PADDLE_ENFORCE(rois_dims[1] == 4, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]"); + + int pooled_height = ctx->Attrs().Get("pooled_height"); + int pooled_width = ctx->Attrs().Get("pooled_width"); + int output_channels = ctx->Attrs().Get("output_channels"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE( + input_dims[1] == output_channels * pooled_height * pooled_width, + "the channel of X(%d) should be equal to the product of " + "output_channels(%d), pooled_height(%d) and pooled_width(%d)", + input_dims[1], output_channels, pooled_height, pooled_width); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must be greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must be greater than 0"); + PADDLE_ENFORCE_GT(output_channels, 1, + "The pooled output channels must greater than 1"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0."); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = + output_channels; // input_dims[1] / (pooled_height * pooled_width); + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + ctx->SetOutputDim("Out", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const 
framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class PSROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); +REGISTER_OP_CPU_KERNEL( + psroi_pool, + ops::CPUPSROIPoolOpKernel, + ops::CPUPSROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL( + psroi_pool_grad, + ops::CPUPSROIPoolGradOpKernel, + ops::CPUPSROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu new file mode 100644 index 00000000000..22fec3244fa --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -0,0 +1,294 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/psroi_pool_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void GPUPSROIPoolForward( + const int nthreads, const T* input_data, const T* input_rois, + const float spatial_scale, const int input_channels, const int height, + const int width, const int output_channels, const int pooled_height, + const int pooled_width, const int* rois_batch_id_data, T* output_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + const T* offset_input_data = + input_data + + (roi_batch_id * input_channels + input_channel) * height * width; + T outsum = 0; + + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + outsum += offset_input_data[input_index]; + } + } + + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + output_data[i] = is_empty ? 0. 
: outsum / bin_area; + } +} + +template +__global__ void GPUPSROIPoolBackward( + const int nthreads, const T* input_rois, const T* output_grad_data, + const float spatial_scale, const int input_channels, const int height, + const int width, const int output_channels, const int pooled_height, + const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_input_grad_data = input_grad_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); + } + } + } +} + +template +class GPUPSROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + int rois_num = rois->dims()[0]; + if (rois_num == 0) return; + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "The rois_batch_size and input(X) batch_size must be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + + // set rois batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(platform::CPUPlace()); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + framework::Tensor rois_batch_id_list_gpu; + framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), + ctx.device_context(), &rois_batch_id_list_gpu); + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + // call cuda kernel function + GPUPSROIPoolForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, + input_channels, height, width, output_channels, pooled_height, + pooled_width, rois_batch_id_list_gpu.data(), + out->mutable_data(ctx.GetPlace())); + } +}; + +template +class GPUPSROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + int rois_num = rois->dims()[0]; + int input_channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (input_grad) { + // set roi batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(platform::CPUPlace()); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + framework::Tensor rois_batch_id_list_gpu; + 
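+    // stage the per-ROI batch ids on the device; the backward kernel indexes
+    // them to locate each ROI's source image in the input gradient buffer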
framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), + ctx.device_context(), &rois_batch_id_list_gpu); + + input_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); + + int output_grad_size = output_grad->numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPUPSROIPoolBackward< + T><<>>( + output_grad_size, rois->data(), output_grad->data(), + spatial_scale, input_channels, height, width, output_channels, + pooled_height, pooled_width, rois_batch_id_list_gpu.data(), + input_grad->mutable_data(ctx.GetPlace())); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + psroi_pool, + ops::GPUPSROIPoolOpKernel, + ops::GPUPSROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( + psroi_pool_grad, + ops::GPUPSROIPoolGradOpKernel, + ops::GPUPSROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h new file mode 100644 index 00000000000..1a424728f7f --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -0,0 +1,253 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class CPUPSROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_channels = ctx.Attr("output_channels"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto in_stride = framework::stride(in_dims); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out->dims()); + + const T* input_data = in->data(); + + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(ctx.GetPlace()); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "the rois_batch_size and input(X) batch_size should be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num, + "the rois_num from input and lod must be the same"); + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + T* output_data = out->mutable_data(ctx.GetPlace()); + const T* input_rois = rois->data(); + + // calculate psroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num; ++n) { + // set roi batch id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small rois to be 1 x 1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute bin size w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // calculate each pixel of the output feature map. 
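+      // each output position (c, ph, pw) averages its own dedicated input
+      // channel below, which is what makes the pooling position-sensitive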
+ int out_roi_offset = n * out_stride[0]; + for (int c = 0; c < output_channels; ++c) { + // per category + int out_plane_offset = out_roi_offset + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + int out_row_offset = out_plane_offset + ph * out_stride[2]; + for (int pw = 0; pw < pooled_width; ++pw) { + // calculate w and h at input feature map + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + wstart = std::min(std::max(wstart, 0), width); + hend = std::min(std::max(hend, 0), height); + wend = std::min(std::max(wend, 0), width); + + int output_index = out_row_offset + pw; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_plane_offset = + roi_batch_id * in_stride[0] + input_channel * in_stride[1]; + const T* offset_input_data = input_data + input_plane_offset; + T out_sum = 0.; + bool is_empty = (hend <= hstart) || (wend <= wstart); + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * in_stride[2] + iw; + out_sum += offset_input_data[input_index]; + } + } + T bin_area = (hend - hstart) * (wend - wstart); + output_data[output_index] = is_empty ? 0. : out_sum / bin_area; + } + } + } + } + return; + } +}; + +template +class CPUPSROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + if (input_grad) { + auto in_dims = in->dims(); + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + // set roi batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(ctx.GetPlace()); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + const T* input_rois = rois->data(); + const T* output_grad_data = output_grad->data(); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + + // set gradient of X to be 0. before backpropagate. 
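+      // Gradients must be accumulated, not assigned: ROIs (and their bins)
+      // may overlap on the input feature map. input_grad is therefore
+      // zero-initialized here, and each output gradient is later spread
+      // uniformly over its bin (divided by bin_area) with "+=".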
+ math::SetConstant set_zero; + set_zero(ctx.template device_context(), input_grad, + static_cast(0)); + + // backpropagate gradient per output pixel + int output_grad_size = output_grad->numel(); + for (int i = 0; i < output_grad_size; ++i) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_input_grad_data = input_grad_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + offset_input_grad_data[input_index] += diff_val; + } + } + } + } + return; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e25eaaa9fda..3832cae8c35 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -173,6 +173,7 @@ __all__ = [ 'merge_selected_rows', 'get_tensor_from_selected_rows', 'lstm', + 'psroi_pool', ] kIgnoreIndex = -100 @@ -9122,3 +9123,57 @@ def get_tensor_from_selected_rows(x, name=None): outputs={'Out': out}, attrs={}) return out + + +@templatedoc() +def psroi_pool(input, + rois, + output_channels, + spatial_scale, + pooled_height, + pooled_width, + name=None): + """ + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + output_channels (integer): ${output_channels_comment} + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + name (str, default None): The name of this layer. + + Returns: + Variable: ${out_comment}. 
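+
+    Note:
+        The channel number of the input feature map must equal
+        output_channels * pooled_height * pooled_width, otherwise
+        the operator raises an error at runtime.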
+
+    Examples:
+        .. code-block:: python
+
+            pool_out = fluid.layers.psroi_pool(input=x, rois=rois,
+                                               output_channels=490,
+                                               spatial_scale=1.0,
+                                               pooled_height=7,
+                                               pooled_width=7)
+    """
+    helper = LayerHelper('psroi_pool', **locals())
+    # check attrs
+    if not isinstance(output_channels, int):
+        raise TypeError("output_channels must be int type")
+    if not isinstance(spatial_scale, float):
+        raise TypeError("spatial_scale must be float type")
+    if not isinstance(pooled_height, int):
+        raise TypeError("pooled_height must be int type")
+    if not isinstance(pooled_width, int):
+        raise TypeError("pooled_width must be int type")
+    dtype = helper.input_dtype()
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='psroi_pool',
+        inputs={'X': input,
+                'ROIs': rois},
+        outputs={'Out': out},
+        attrs={
+            'output_channels': output_channels,
+            'spatial_scale': spatial_scale,
+            'pooled_height': pooled_height,
+            'pooled_width': pooled_width
+        })
+    return out
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 10e8bb5a866..fb3e4da1efd 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -511,6 +511,16 @@ class TestBook(unittest.TestCase):
         self.assertIsNotNone(output)
         print(str(program))
 
+    def test_psroi_pool(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[245, 30, 30], dtype="float32")
+            rois = layers.data(
+                name="rois", shape=[4], dtype="float32", lod_level=1)
+            output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7)
+            self.assertIsNotNone(output)
+            print(str(program))
+
     def test_roi_align(self):
         program = Program()
         with program_guard(program):
diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
new file mode 100644
index 00000000000..abe014a38c6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
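+
+# This test builds a plain NumPy reference implementation of PSROI pooling:
+# calc_psroi_pool below mirrors the bin and channel arithmetic of the C++
+# kernel, and OpTest compares the operator's output and the gradient of X
+# against that reference.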
+ +from __future__ import print_function + +import math +import numpy as np +import unittest +from op_test import OpTest + + +class TestPSROIPoolOp(OpTest): + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_psroi_pool() + self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)} + self.attrs = { + 'output_channels': self.output_channels, + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width + } + self.outputs = {'Out': self.outs} + + def init_test_case(self): + self.batch_size = 3 + self.channels = 3 * 2 * 2 + self.height = 6 + self.width = 4 + + self.x_dim = [self.batch_size, self.channels, self.height, self.width] + + self.spatial_scale = 1.0 / 4.0 + self.output_channels = 3 + self.pooled_height = 2 + self.pooled_width = 2 + + self.x = np.random.random(self.x_dim).astype('float32') + + def make_rois(self): + rois = [] + self.rois_lod = [[]] + for bno in range(self.batch_size): + self.rois_lod[0].append(bno + 1) + for i in range(bno + 1): + x1 = np.random.random_integers( + 0, self.width // self.spatial_scale - self.pooled_width) + y1 = np.random.random_integers( + 0, self.height // self.spatial_scale - self.pooled_height) + + x2 = np.random.random_integers(x1 + self.pooled_width, + self.width // self.spatial_scale) + y2 = np.random.random_integers( + y1 + self.pooled_height, self.height // self.spatial_scale) + roi = [bno, x1, y1, x2, y2] + rois.append(roi) + self.rois_num = len(rois) + self.rois = np.array(rois).astype('float32') + + def calc_psroi_pool(self): + output_shape = (self.rois_num, self.output_channels, self.pooled_height, + self.pooled_width) + out_data = np.zeros(output_shape) + for i in range(self.rois_num): + roi = self.rois[i] + roi_batch_id = int(roi[0]) + roi_start_w = round(roi[1]) * self.spatial_scale + roi_start_h = round(roi[2]) * self.spatial_scale + roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale + roi_end_h = (round(roi[4]) + 1.) * self.spatial_scale + + roi_height = max(roi_end_h - roi_start_h, 0.1) + roi_width = max(roi_end_w - roi_start_w, 0.1) + + bin_size_h = roi_height / float(self.pooled_height) + bin_size_w = roi_width / float(self.pooled_width) + + x_i = self.x[roi_batch_id] + + for c in range(self.output_channels): + for ph in range(self.pooled_height): + for pw in range(self.pooled_width): + hstart = int( + math.floor(float(ph) * bin_size_h + roi_start_h)) + wstart = int( + math.floor(float(pw) * bin_size_w + roi_start_w)) + hend = int( + math.ceil( + float(ph + 1) * bin_size_h + roi_start_h)) + wend = int( + math.ceil( + float(pw + 1) * bin_size_w + roi_start_w)) + hstart = min(max(hstart, 0), self.height) + hend = min(max(hend, 0), self.height) + wstart = min(max(wstart, 0), self.width) + wend = min(max(wend, 0), self.width) + + c_in = (c * self.pooled_height + ph + ) * self.pooled_width + pw + is_empty = (hend <= hstart) or (wend <= wstart) + out_sum = 0. + for ih in range(hstart, hend): + for iw in range(wstart, wend): + out_sum += x_i[c_in, ih, iw] + bin_area = (hend - hstart) * (wend - wstart) + out_data[i, c, ph, pw] = 0. 
if is_empty else ( + out_sum / float(bin_area)) + self.outs = out_data.astype('float32') + + def setUp(self): + self.op_type = 'psroi_pool' + self.set_data() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() -- GitLab From 7b10bf0e60e9ac0f56ff532fe58cbf5c538a81b6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Dec 2018 16:40:51 +0800 Subject: [PATCH 0195/2367] Use mkl --- .../fluid/operators/hierarchical_sigmoid_op.h | 28 ++++++++++++------- paddle/fluid/operators/math/blas.h | 8 ++++++ paddle/fluid/operators/math/blas_impl.h | 21 ++++++++++++++ paddle/fluid/platform/dynload/mklml.h | 2 ++ 4 files changed, 49 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index b73a32af89e..d212e6f8437 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -150,19 +150,27 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { label.data())); } - auto& place = *ctx.template device_context().eigen_device(); - auto pre_out_mat = EigenMatrix::From(pre_out); - auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); - auto out_grad_mat = EigenMatrix::From(out_grad); + // softrelu derivative - Eigen::array bcast{1, static_cast(pre_out_grad.dims()[1])}; + auto blas = math::GetBlas(ctx); - // softrelu derivative - pre_out_grad_mat.device(place) = - static_cast(1.0) - static_cast(1.0) / pre_out_mat.exp(); + auto* pre_out_grad_data = pre_out_grad.data(); + auto* pre_out_data = pre_out.data(); + auto n = pre_out.numel(); + blas.VEXP(n, pre_out_data, pre_out_grad_data); + blas.VINV(n, pre_out_grad_data, pre_out_grad_data); + for (int64_t i = 0; i < n; ++i) { + pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; + } bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) - pre_out_grad_mat.device(place) = - pre_out_grad_mat * out_grad_mat.broadcast(bcast); + auto* out_grad_data = out_grad.data(); + + int64_t dim0 = pre_out_grad.dims()[0]; + int64_t dim1 = pre_out_grad.dims()[1]; + for (int64_t i = 0; i < dim0; ++i) { + T tmp = out_grad_data[i]; + blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); + } // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 9f3a81f22cc..f67f57827bc 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -181,6 +181,9 @@ class Blas { const framework::Tensor& mat_b, const MatDescriptor& dim_b, T alpha, framework::Tensor* mat_out, T beta) const; + template + void VINV(int n, const T* a, T* y) const; + private: const DeviceContext& context_; }; @@ -282,6 +285,11 @@ class BlasT : private Blas { Base()->template BatchedGEMM(args...); } + template + void VINV(ARGS... args) const { + Base()->template VINV(args...); + } + private: const Blas* Base() const { return static_cast*>(this); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index c84087bb1e4..972366bc093 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -118,6 +118,11 @@ struct CBlas { static void VPOW(ARGS... args) { platform::dynload::vsPowx(args...); } + + template + static void VINV(ARGS... 
args) { + platform::dynload::vsInv(args...); + } }; template <> @@ -213,6 +218,11 @@ struct CBlas { static void VPOW(ARGS... args) { platform::dynload::vdPowx(args...); } + + template + static void VINV(ARGS... args) { + platform::dynload::vdInv(args...); + } }; #else @@ -603,6 +613,17 @@ void Blas::MatMul(const framework::Tensor &mat_a, dim_a.stride_, dim_b.stride_); } } +template +template +void Blas::VINV(int n, const T *a, T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VINV(n, a, y); +#else + for (int i = 0; i < n; ++i) { + y[i] = 1.0 / a[i]; + } +#endif +} } // namespace math } // namespace operators diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index f0a97366236..c3f9433503a 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -82,6 +82,8 @@ extern void* mklml_dso_handle; __macro(vdSqr); \ __macro(vsPowx); \ __macro(vdPowx); \ + __macro(vsInv); \ + __macro(vdInv); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); -- GitLab From 47ea2534fb9cac31f1b5c15c54112e6105810cb1 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 13 Dec 2018 17:12:38 +0800 Subject: [PATCH 0196/2367] clean parallel do test=develop --- .../operators/controlflow/parallel_do_op.cc | 426 ------------------ python/paddle/fluid/backward.py | 79 +--- python/paddle/fluid/framework.py | 4 +- python/paddle/fluid/layers/control_flow.py | 152 +------ .../tests/book/notest_understand_sentiment.py | 18 +- .../fluid/tests/book/test_recognize_digits.py | 15 +- .../paddle/fluid/tests/book/test_word2vec.py | 14 +- .../test_memopt_fit_a_line.py | 87 ---- .../fluid/tests/unittests/test_parallel_op.py | 235 ---------- .../memory_optimization_transpiler.py | 5 +- 10 files changed, 10 insertions(+), 1025 deletions(-) delete mode 100644 paddle/fluid/operators/controlflow/parallel_do_op.cc delete mode 100644 python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py delete mode 100644 python/paddle/fluid/tests/unittests/test_parallel_op.py diff --git a/paddle/fluid/operators/controlflow/parallel_do_op.cc b/paddle/fluid/operators/controlflow/parallel_do_op.cc deleted file mode 100644 index ab25628d456..00000000000 --- a/paddle/fluid/operators/controlflow/parallel_do_op.cc +++ /dev/null @@ -1,426 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/safe_ref.h" - -namespace paddle { -namespace operators { - -static constexpr char kInputs[] = "inputs"; -static constexpr char kParameters[] = "parameters"; -static constexpr char kPlaces[] = "places"; - -static constexpr char kOutputs[] = "outputs"; -static constexpr char kParallelScopes[] = "parallel_scopes"; - -static constexpr char kParallelBlock[] = "sub_block"; -static constexpr char kUseNCCL[] = "use_nccl"; - -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; - -static void SplitTensorAndMoveTensorToScopes( - const framework::Scope &scope, std::vector *sub_scopes, - const std::vector &places, - const std::vector &names) { - size_t num_sub_scopes = 0; - for (auto &argu : names) { - const auto &tensor = - detail::Ref(scope.FindVar(argu), - "Cannot find variable %s in the parent scope", argu) - .Get(); - auto lod_tensors = tensor.SplitLoDTensor(places); - - for (auto &lod : lod_tensors) { - VLOG(3) << lod.dims(); - } - if (num_sub_scopes == 0) { - num_sub_scopes = lod_tensors.size(); - } else { - PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size()); - } - PADDLE_ENFORCE_NE(num_sub_scopes, 0); - if (sub_scopes->size() == 0) { - sub_scopes->reserve(num_sub_scopes); - for (size_t i = 0; i < num_sub_scopes; ++i) { - sub_scopes->emplace_back(&scope.NewScope()); - } - } - - for (size_t i = 0; i < lod_tensors.size(); ++i) { - *detail::Ref(sub_scopes->at(i)->Var(argu), - "Cannot find variable in the sub-scope", argu) - .GetMutable() = lod_tensors[i]; - } - } -} - -inline void CopyOrShare(const framework::Variable &src, - const platform::Place &dst_place, - framework::Variable *dst) { - if (src.IsType()) { - if (src.Get().place() == dst_place) { - dst->GetMutable()->ShareDataWith(src.Get()); - dst->GetMutable()->set_lod(src.Get().lod()); - } else { - TensorCopy(src.Get(), dst_place, dst->GetMutable()); - } - } else if (src.IsType()) { - auto &src_sr = src.Get(); - auto *dst_sr = dst->GetMutable(); - dst_sr->set_height(src_sr.height()); - if (src_sr.value().place() == dst_place) { - dst_sr->mutable_value()->ShareDataWith(src_sr.value()); - dst_sr->set_rows(src_sr.rows()); - } else { - TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value()); - } - } else { - PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); - } -} - -void WaitOnPlace(const platform::Place place) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - dev_ctx.Wait(); -} - -void WaitOnPlaces(const std::vector places) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - - for (auto &place : places) { - auto &dev_ctx = *pool.Get(place); - dev_ctx.Wait(); - } -} - -class ParallelDoOp : public framework::OperatorBase { - public: - ParallelDoOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - auto *block = Attr(kParallelBlock); - auto *program = 
block->Program(); - - auto &places = scope.FindVar(Input(kPlaces))->Get(); - - auto &sub_scopes = *scope.FindVar(Output(kParallelScopes)) - ->GetMutable>(); - - // split input - SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places, - Inputs(kInputs)); - - // copy parameter - for (auto ¶m : Inputs(kParameters)) { - PADDLE_ENFORCE(scope.FindVar(param)->IsType(), - "Only support parameter type as LoDTensor"); - auto &src = scope.FindVar(param)->Get(); - - auto *sub_scope0 = sub_scopes[0]; - auto *dst0 = sub_scope0->Var(param)->GetMutable(); - dst0->ShareDataWith(src); - - for (size_t i = 1; i < sub_scopes.size(); ++i) { - auto &place = places[i]; - auto *sub_scope = sub_scopes[i]; - auto *dst = sub_scope->Var(param)->GetMutable(); - framework::TensorCopy(src, place, dst); - } - } - WaitOnPlaces(places); - - std::vector> workers; - workers.reserve(places.size()); - for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) { - auto &place = places[place_idx]; - auto *cur_scope = sub_scopes[place_idx]; - - workers.emplace_back(framework::Async([program, cur_scope, place, block] { - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); - } - for (auto &worker : workers) { - worker.wait(); - } - WaitOnPlaces(places); - - // merge output - for (auto &o_name : Outputs(kOutputs)) { - std::vector lod_tensors; - lod_tensors.reserve(sub_scopes.size()); - for (auto *sub_scope : sub_scopes) { - lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get()); - } - - auto *lod_tensor_to_be_merged = - scope.FindVar(o_name)->GetMutable(); - lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace()); - } - WaitOnPlaces(places); - } -}; - -class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kInputs, "").AsDuplicable(); - AddInput(kParameters, "").AsDuplicable(); - AddInput(kPlaces, ""); - AddOutput(kOutputs, "").AsDuplicable(); - AddOutput(kParallelScopes, ""); - AddAttr(kParallelBlock, ""); - AddAttr(kUseNCCL, "true if we use nccl on backward") - .SetDefault(false); - AddComment(R"DOC( -ParallelDo Operator. 
-)DOC"); - } -}; - -class ParallelDoGradOp : public framework::OperatorBase { - public: - ParallelDoGradOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto *block = Attr(kParallelBlock); - auto *program = block->Program(); - - auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) - ->Get>(); - auto &places = scope.FindVar(Input(kPlaces))->Get(); - - // feed output@grad - SplitTensorAndMoveTensorToScopes( - scope, const_cast *>(&sub_scopes), - places, Inputs(framework::GradVarName(kOutputs))); - WaitOnPlaces(places); - - // exe run - std::vector> workers; - for (size_t i = 0; i < sub_scopes.size(); ++i) { - auto &place = places[i]; - auto *cur_scope = sub_scopes[i]; - - // execute - workers.emplace_back(framework::Async([program, cur_scope, place, block] { - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); - } - for (auto &worker : workers) { - worker.wait(); - } - WaitOnPlaces(places); - - // NCCL allreduce op will be added by backward, - // so no need to explicitly accumulate grad - if (!(Attr(kUseNCCL))) { - AccumulateGrad(scope, place, sub_scopes, places); - } else { - for (auto &place : places) { - PADDLE_ENFORCE(platform::is_gpu_place(place), - "NCCL only supports cuda place"); - } - } - for (auto &s : Outputs(framework::GradVarName(kParameters))) { - if (s == framework::kEmptyVarName) { - continue; - } - VLOG(3) << "Moving " << s; - CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); - } - WaitOnPlaces(places); - } - - void AccumulateGrad(const framework::Scope &scope, - const platform::Place &place, - const std::vector &sub_scopes, - const platform::PlaceList &places) const { - for (auto &s : Outputs(framework::GradVarName(kParameters))) { - if (s == framework::kEmptyVarName) { - continue; - } - VLOG(3) << "Accumulating " << s; - if (s == framework::kEmptyVarName) continue; - std::string tmp_name; - auto *tmp = sub_scopes[0]->Var(&tmp_name); - - for (size_t i = 1; i < sub_scopes.size(); ++i) { - CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp); - WaitOnPlaces(places); - - auto sum_op = framework::OpRegistry::CreateOp( - "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, - framework::AttributeMap{{"use_mkldnn", {false}}}); - VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); - sum_op->Run(*sub_scopes[0], places[0]); - WaitOnPlace(places[0]); - } - - CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); - } - WaitOnPlaces(places); - } -}; - -std::ostream &operator<<(std::ostream &sout, - const std::vector &strs) { - std::copy(strs.begin(), strs.end(), - std::ostream_iterator(sout, ",")); - return sout; -} - -class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - virtual std::unique_ptr Apply() const { - auto *grad = new framework::OpDesc(); - grad->SetType("parallel_do_grad"); - for (auto &input_param : this->InputNames()) { - VLOG(3) << input_param; - grad->SetInput(input_param, this->Input(input_param)); - if (input_param != kPlaces) { - grad->SetOutput(framework::GradVarName(input_param), - this->InputGrad(input_param, false)); - } - } - auto *g_block = this->grad_block_[0]; - - // All 
variable name that needed by gradient operators - std::unordered_set all_inputs_in_grad_blocks; - - for (size_t i = 0; i < g_block->OpSize(); ++i) { - auto *op = g_block->Op(i); - for (auto &var_name : op->InputArgumentNames()) { - all_inputs_in_grad_blocks.insert(var_name); - } - } - - for (auto &output_param : this->OutputNames()) { - if (output_param == kParallelScopes) { - grad->SetInput(output_param, this->Output(output_param)); - grad->SetInput(framework::GradVarName(output_param), - this->Output(output_param)); - } else { - grad->SetInput(output_param, this->Output(output_param)); - std::vector og_names; - for (auto &og_name : this->OutputGrad(output_param)) { - if (all_inputs_in_grad_blocks.count(og_name) != 0) { - // there are some gradient operators who need the OG. So make this - // OG as an input of parallel.do - og_names.push_back(og_name); - } - // else, there is no operator who need the OG. Do not use this OG as - // an input - } - grad->SetInput(framework::GradVarName(output_param), og_names); - } - } - grad->SetInput("Communicator", {"nccl_com__do_not_change_"}); - grad->SetAttrMap(this->Attrs()); - grad->SetBlockAttr(kParallelBlock, grad_block_[0]); - - return std::unique_ptr(grad); - } -}; - -class ParallelDoGradOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInputs(kParameters)); - PADDLE_ENFORCE(ctx->HasInputs(kInputs)); - PADDLE_ENFORCE(ctx->HasInputs(kOutputs)); - - ctx->SetOutputsDim(framework::GradVarName(kParameters), - ctx->GetInputsDim(kParameters)); - - auto i_dims = ctx->GetInputsDim(kInputs); - auto ig_names = ctx->Outputs(framework::GradVarName(kInputs)); - - for (size_t i = 0; i < ig_names.size(); ++i) { - auto &ig_name = ig_names[i]; - if (ig_name == framework::kEmptyVarName) { - continue; - } - - ctx->SetDims({ig_name}, {i_dims[i]}); - } - - auto p_dims = ctx->GetInputsDim(kParameters); - auto pg_names = ctx->Outputs(framework::GradVarName(kParameters)); - for (size_t i = 0; i < pg_names.size(); ++i) { - auto &pg_name = pg_names[i]; - if (pg_name == framework::kEmptyVarName) { - continue; - } - ctx->SetDims({pg_name}, {p_dims[i]}); - } - } -}; - -class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - framework::BlockDesc *sub_block = - boost::get(op_desc.GetAttr(kParallelBlock)); - for (auto &out_vars : op_desc.Outputs()) { - for (auto &out_var : out_vars.second) { - auto &var = block->FindRecursiveOrCreateVar(out_var); - auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var); - if (sub_var.GetType() != var.GetType()) { - var.SetType(sub_var.GetType()); - } - } - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, - paddle::operators::ParallelDoOpProtoMaker, - paddle::operators::ParallelDoGradOpDescMaker); -REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, - paddle::operators::ParallelDoGradOpShapeInference, - paddle::operators::ParallelDoGradOpVarTypeInference); diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 17fe8dc3c8a..b2c3e7c989c 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -249,69 +249,6 @@ def serialize_op_decs(op_desc): return proto.__str__() -def _callback_lookup_(op): - """ - Only used in _append_backward_ops_ - Build and 
returns a callback function for certain op. For example - - parallel_do: AllReduce - - :param op: - :return: callback function - """ - if op.type == 'parallel_do' and op.attr('use_nccl'): - all_vars = op.block.vars - param_names = set(op.input('parameters')) - param_names = [ - name for name in param_names - if all_vars[name].stop_gradient is False - ] - param_grad_names = [n + "@GRAD" for n in param_names] - - class ParallelDoCallBack(object): - def __init__(self, param_grad_names, parallel_scopes_name): - self.has_inserted_nccl_init = False - self.param_grad_names = param_grad_names - self.parallel_scopes_name = parallel_scopes_name - - def __call__(self, block, context): - if not self.has_inserted_nccl_init: - op_desc = _create_op_desc_( - "ncclInit", - {"parallel_scopes": self.parallel_scopes_name}, - {"Communicator": ['nccl_com__do_not_change_']}, {}) - block.program.global_block().desc.append_op().copy_from( - op_desc) - self.has_inserted_nccl_init = True - - current_op_desc = context["__current_op_desc__"] - for o_param in current_op_desc.output_names(): - for o_argu in current_op_desc.output(o_param): - if o_argu in self.param_grad_names: - allreduce_out_name = o_argu + "__nccl_all_reduce__" - op_desc = _create_op_desc_( - "ncclReduce", - { - "X": [o_argu], - "Communicator": - ['nccl_com__do_not_change_'] - }, - {"Out": [allreduce_out_name]}, - {"reduction": "ncclSum", - "root": 0}, ) - block.desc.append_op().copy_from(op_desc) - - op_desc = _create_op_desc_( - "assign", {"X": [allreduce_out_name]}, - {"Out": [o_argu]}, {}) - block.desc.append_op().copy_from(op_desc) - - return ParallelDoCallBack(param_grad_names, - op.output("parallel_scopes")) - else: - return None - - def _append_backward_ops_(block, ops, target_block, @@ -349,17 +286,8 @@ def _append_backward_ops_(block, sub_block = program.block(op._block_attr_id("sub_block")) grad_sub_block = program._create_block() grad_sub_block._set_forward_block_idx(sub_block.idx) - cb = _callback_lookup_(op) - if cb is not None: - if callbacks is None: - new_callbacks = [cb] - else: - new_callbacks = callbacks + [_callback_lookup_(op)] - _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, - no_grad_dict, grad_to_var, new_callbacks) - else: - _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, - no_grad_dict, grad_to_var, callbacks) + _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, + no_grad_dict, grad_to_var, callbacks) program._rollback() grad_sub_block_list.append(grad_sub_block.desc) @@ -424,9 +352,6 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): # infer_shape and infer_type op_desc.infer_var_type(block.desc) op_desc.infer_shape(block.desc) - # ncclInit dones't need to set data_type - if op_desc.type() == 'ncclInit': - continue for arg in op_desc.output_arg_names(): if arg in new_vars: _infer_var_data_type_(arg, block) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 08979205946..d0bd78454db 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -563,8 +563,8 @@ class Operator(object): OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', - 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', - 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' + 'listen_and_serv', 'save_combine', 'load_combine', 'ncclInit', 'select', + 'checkpoint_notify', 'gen_nccl_id' } def __init__(self, diff 
--git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index b7e39685691..21454370dd2 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -226,156 +226,6 @@ class BlockGuard(object): return True -class ParallelDo(object): - """ - ParallelDo is used to represent multi-thread data parallel processing. - - Its vanilla implementation can be shown as the following (:math:`|` means - single thread and :math:`||||` means multiple threads) - - .. code-block:: text - - In the forward pass - | Split input onto different devices - | Copy parameter onto different devices - |||| Compute forward pass in parallel - | Merge output from different devices - - In the backward pass - | Split output@grad onto different devices - |||| Compute backward pass in parallel - | accumulate param@grad from different devices to the first device - | Merge input@grad from different devices - | Copy param@grad to the place of parallel_do_op - - Examples: - - .. code-block:: python - - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # ParallelDo version & Single-thread version - if thread_num > 1: - places = fluid.layers.get_places(thread_num) - pd = fluid.layers.control_flow.ParallelDo(places) - with pd.do(): - images = pd.read_input(images) - label = pd.read_input(label) - predict = cnn_model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - - avg_cost = fluid.layers.mean(x=cost) - pd.write_output(avg_cost) - - avg_cost = pd() - avg_cost = fluid.layers.mean(avg_cost) - else: - predict = cnn_model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - .. warning:: - - It will be soon deprecated, please use ParallelExecutor instead. - """ - - def __init__(self, places, use_nccl=False, name=None): - warnings.warn( - "API ParallelDo is deprecated since 0.15.0. 
Please use ParallelExecutor instead.", - Warning) - self.helper = LayerHelper("parallel_do", name=name) - self.inputs = [] - self.places = places - self.outputs = [] - self.status = StaticRNN.BEFORE_RNN_BLOCK - self.use_nccl = use_nccl - - def do(self): - return BlockGuardWithCompletion(self) - - def parent_block(self): - prog = self.helper.main_program - parent_idx = prog.current_block().parent_idx - assert parent_idx >= 0 - parent_block = prog.block(parent_idx) - return parent_block - - def __call__(self, *args, **kwargs): - if self.status != StaticRNN.AFTER_RNN_BLOCK: - raise ValueError("RNN output can only be retrieved after rnn block") - if len(self.outputs) == 0: - raise ValueError("RNN has no output") - elif len(self.outputs) == 1: - return self.outputs[0] - else: - return self.outputs - - def read_input(self, var): - self.inputs.append(var) - return var - - def write_output(self, var): - self.outputs.append(var) - - def get_parameters(self): - main_program = self.helper.main_program - current_block = main_program.current_block() - parent_block = self.parent_block() - - local_inputs = set() - params = list() - for var in self.inputs: - local_inputs.add(var.name) - - for op in current_block.ops: - for iname in op.input_names: - for in_var_name in op.input(iname): - if in_var_name not in local_inputs: - params.append(in_var_name) - - for oname in op.output_names: - for out_var_name in op.output(oname): - local_inputs.add(out_var_name) - - params = list(set(params)) - - return [parent_block.var(name) for name in params] - - def _complete_op(self): - main_program = self.helper.main_program - current_block = main_program.current_block() - parent_block = self.parent_block() - - step_scope = parent_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) - - self.outputs = [ - parent_block.create_var( - name=o.name, - shape=o.shape, - dtype=o.dtype, - lod_level=o.lod_level, - persistable=o.persistable, - stop_gradient=o.stop_gradient) for o in self.outputs - ] - - inputs = [parent_block.var(i.name) for i in self.inputs] - outputs = [parent_block.var(o.name) for o in self.outputs] - - parent_block.append_op( - type='parallel_do', - inputs={ - 'inputs': inputs, - 'parameters': self.get_parameters(), - 'places': self.places - }, - outputs={'outputs': outputs, - 'parallel_scopes': [step_scope]}, - attrs={'sub_block': current_block, - 'use_nccl': self.use_nccl}) - - class BlockGuardWithCompletion(BlockGuard): """ BlockGuardWithCompletion class. 
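The hunks above delete ParallelDo outright; its deprecation warning already pointed users at ParallelExecutor, which performs the split/compute/merge cycle internally. A minimal migration sketch, assuming the Fluid 1.x ParallelExecutor interface (the network and names here are illustrative, not part of this patch):

import numpy
import paddle.fluid as fluid

# Build the network once; ParallelExecutor replicates it per device.
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
prediction = fluid.layers.fc(input=img, size=10, act='softmax')
avg_cost = fluid.layers.mean(
    fluid.layers.cross_entropy(input=prediction, label=label))
fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)

use_cuda = fluid.core.is_compiled_with_cuda()
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# ParallelExecutor takes over what ParallelDo expressed manually:
# splitting the batch, running replicas, and merging gradients.
pe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=avg_cost.name)
feed = {
    'img': numpy.random.random((32, 784)).astype('float32'),
    'label': numpy.random.randint(0, 10, (32, 1)).astype('int64'),
}
loss_value, = pe.run(fetch_list=[avg_cost.name], feed=feed)

pe.run returns the fetched values already merged across devices, which replaces ParallelDo's explicit read_input/write_output/mean pattern.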
@@ -384,7 +234,7 @@ class BlockGuardWithCompletion(BlockGuard): """ def __init__(self, rnn): - if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)): + if not isinstance(rnn, StaticRNN): raise TypeError( "BlockGuardWithCompletion takes a StaticRNN or ParallelDo") super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program) diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index a666507bd9a..5658bb4ec44 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -15,7 +15,6 @@ from __future__ import print_function from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo import unittest import paddle.fluid as fluid import paddle @@ -147,22 +146,7 @@ def train(word_dict, cost, acc_out, prediction = net_method( data, label, input_dim=dict_dim, class_dim=class_dim) else: - places = get_places() - pd = ParallelDo(places) - with pd.do(): - cost, acc, _ = net_method( - pd.read_input(data), - pd.read_input(label), - input_dim=dict_dim, - class_dim=class_dim) - pd.write_output(cost) - pd.write_output(acc) - - cost, acc = pd() - cost = fluid.layers.mean(cost) - acc_out = fluid.layers.mean(acc) - prediction = None - assert save_dirname is None + raise NotImplementedError() adagrad = fluid.optimizer.Adagrad(learning_rate=0.002) adagrad.minimize(cost) diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 4a70976a483..54936519ce0 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -25,7 +25,6 @@ import numpy import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo BATCH_SIZE = 64 @@ -82,19 +81,7 @@ def train(nn_type, net_conf = conv_net if parallel: - places = get_places() - pd = ParallelDo(places) - with pd.do(): - img_ = pd.read_input(img) - label_ = pd.read_input(label) - prediction, avg_loss, acc = net_conf(img_, label_) - for o in [avg_loss, acc]: - pd.write_output(o) - - avg_loss, acc = pd() - # get mean loss and acc through every devices. 
- avg_loss = fluid.layers.mean(avg_loss) - acc = fluid.layers.mean(acc) + raise NotImplementedError() else: prediction, avg_loss, acc = net_conf(img, label) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 9191f0fc203..08f70c9cabc 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -17,7 +17,6 @@ from __future__ import print_function import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo import unittest import os import numpy as np @@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: - places = get_places() - pd = ParallelDo(places) - with pd.do(): - avg_cost, predict_word = __network__( - list( - map(pd.read_input, [ - first_word, second_word, third_word, forth_word, - next_word - ]))) - pd.write_output(avg_cost) - - avg_cost = fluid.layers.mean(pd()) + raise NotImplementedError() sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py deleted file mode 100644 index dab2a52bc90..00000000000 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import math -import sys - -import paddle -import paddle.fluid as fluid -from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo - -# need to fix random seed and training data to compare the loss -# value accurately calculated by the default and the memory optimization -# version. 
-fluid.default_startup_program().random_seed = 111 - -x = fluid.layers.data(name='x', shape=[13], dtype='float32') -y = fluid.layers.data(name='y', shape=[1], dtype='float32') - -device_type = 'CPU' -use_nccl = False -place = fluid.CPUPlace() -if fluid.core.is_compiled_with_cuda(): - device_type = 'CUDA' - use_nccl = False - place = fluid.CUDAPlace(0) - -places = get_places(device_count=0, device_type=device_type) -pd = ParallelDo(places, use_nccl=use_nccl) -with pd.do(): - x_ = pd.read_input(x) - y_ = pd.read_input(y) - y_predict = fluid.layers.fc(input=x_, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y_) - avg_cost = fluid.layers.mean(x=cost) - pd.write_output(avg_cost) - -cost = pd() -avg_cost = fluid.layers.mean(x=cost) -sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) -sgd_optimizer.minimize(avg_cost) - -fluid.memory_optimize(fluid.default_main_program(), print_log=True) -# fluid.release_memory(fluid.default_main_program()) - -BATCH_SIZE = 200 - -# fix the order of training data -train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False) - -# train_reader = paddle.batch( -# paddle.reader.shuffle( -# paddle.dataset.uci_housing.train(), buf_size=500), -# batch_size=BATCH_SIZE) - -feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) -exe = fluid.Executor(place) - -exe.run(fluid.default_startup_program()) - -PASS_NUM = 100 -for pass_id in range(PASS_NUM): - for data in train_reader(): - avg_loss_value, = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - - if avg_loss_value[0] < 10.0: - exit(0) # if avg cost less than 10.0, we think our code is good. - print(avg_loss_value[0]) - if math.isnan(float(avg_loss_value)): - sys.exit("got NaN loss, training failed.") -exit(1) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py deleted file mode 100644 index 380e1728442..00000000000 --- a/python/paddle/fluid/tests/unittests/test_parallel_op.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest - -import paddle.fluid as fluid -from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo -import paddle.fluid.profiler as profiler -import numpy -import six - - -class BaseParallelForTest(unittest.TestCase): - def run_test(self, callback, feed, fetch): - """ - Run the unittest for parallel.for - Args: - callback(callable): A callable function returns a generator. There - are two yields in the generator function. The first yield - returns the data layers, and the second yield returns the loss. - The modified data variables will be sent back during the first - yield. - - feed(dict): The executor feeding dictionary. - fetch(list|basestr): The fetch name lists. 
- - Returns: - None - - Raises: - AssertionError when the computation of cpu, parallel.for in cpu, - gpu, parallel.for in gpu are different. - - """ - cpu = fluid.CPUPlace() - result_cpu = self._run_test_impl_( - callback=callback, - feed=feed, - fetch=fetch, - place=cpu, - use_parallel=False) - result_cpu_parallel = self._run_test_impl_( - callback=callback, - feed=feed, - fetch=fetch, - place=cpu, - use_parallel=True) - if fluid.core.is_compiled_with_cuda(): - gpu = fluid.CUDAPlace(0) - result_gpu = self._run_test_impl_( - callback=callback, - feed=feed, - fetch=fetch, - place=gpu, - use_parallel=False, - use_gpu=True) - result_gpu_parallel = self._run_test_impl_( - callback=callback, - feed=feed, - fetch=fetch, - place=gpu, - use_parallel=True, - use_gpu=True) - result_gpu_nccl = self._run_test_impl_( - callback=callback, - feed=feed, - fetch=fetch, - place=gpu, - use_parallel=True, - use_nccl=True, - use_gpu=True) - self._assert_same_(fetch, result_cpu, result_cpu_parallel, - result_gpu, result_gpu_parallel, result_gpu_nccl) - else: - self._assert_same_(fetch, result_cpu, result_cpu_parallel) - - def _run_test_impl_(self, - callback, - feed, - fetch, - place, - use_parallel=False, - use_nccl=False, - use_gpu=False): - """ - Run a single test, returns the fetch values - Args: - place(Place): the computation place. - use_parallel(bool): Whether use parallel.for or not. - - Returns: - Fetched numpy arrays. - - """ - if isinstance(fetch, six.string_types): - fetch = [fetch] - main = fluid.Program() - startup = fluid.Program() - # Fix seed - main.random_seed = 10 - startup.random_seed = 10 - - with fluid.program_guard(main, startup): - generator = callback() - # Automatically insert parallel do if use_parallel = True - if use_parallel: - thread_num = fluid.core.get_cuda_device_count( - ) if use_gpu else 8 - places = get_places(thread_num) - pd = ParallelDo(places, use_nccl=use_nccl) - data = next(generator) - - if isinstance(data, fluid.framework.Variable): - data = [data] - - with pd.do(): - ins = list(map(pd.read_input, data)) - if len(ins) == 1: - ins = ins[0] - loss = generator.send(ins) # patch input - pd.write_output(loss) - - loss = pd() - else: - data = next(generator) - loss = generator.send(data) - self.assertIsNotNone(loss) - avg_loss = fluid.layers.mean(loss) - fluid.backward.append_backward(loss=avg_loss) - - exe = fluid.Executor(place) - exe.run(startup) - if use_gpu: - profile_type = 'GPU' - else: - profile_type = 'CPU' - with profiler.profiler(profile_type, 'total', '/tmp/profiler'): - return exe.run(main, feed=feed, fetch_list=fetch) - - def _assert_same_(self, fetch, *args): - """ - Assert the return values of `run_test` are same. - Args: - fetch: Fetch list. Used for print error message - *args: The fetch result lists of each situations. 
- - Returns: - None - - Raises: - AssertionError - - """ - - def _impl_(a, b, fetch_id, item_id): - item_str = [ - 'CPU', 'ParallelCPU', 'GPU', 'ParallelGPU', 'ParallelGPUNCCL' - ] - flag = numpy.allclose(a, b, rtol=0.1, atol=1e-3) - self.assertTrue(flag, - "The {0} are different in {1}, {2} vs {3}".format( - fetch[fetch_id], item_str[item_id], a, b)) - - for i, items in enumerate(zip(*args)): - self.assertGreater(len(items), 0) - for j in range(1, len(items)): - _impl_(items[0], items[j], fetch_id=i, item_id=j) - - -class ParallelOpTest(BaseParallelForTest): - @staticmethod - def __network__(): - x = fluid.layers.data(shape=[784], dtype='float32', name='img') - x = yield x - hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') - hidden = fluid.layers.batch_norm(input=hidden) - loss = fluid.layers.mean(hidden) - yield loss - - def test_simple_fc(self): - self.run_test( - callback=self.__network__, - feed={ - 'img': numpy.random.random(size=(51, 784)).astype('float32') - }, - fetch=['fc1.w@GRAD']) - - def test_fc_with_tiny_data(self): - self.run_test( - callback=self.__network__, - feed={'img': numpy.random.random(size=(1, 784)).astype('float32')}, - fetch=['fc1.w@GRAD']) - - -class ParallelOpTestMultipleInput(BaseParallelForTest): - @staticmethod - def __network__(): - x = fluid.layers.data( - shape=[784], dtype='float32', name='img1', stop_gradient=False) - y = fluid.layers.data( - shape=[784], dtype='float32', name='img2', stop_gradient=False) - yield [x, y] - x = x + y - hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') - hidden2 = fluid.layers.fc(input=hidden1, size=200, param_attr='fc2.w') - hidden3 = fluid.layers.fc(input=hidden2, size=200, param_attr='fc3.w') - loss = fluid.layers.mean(hidden3) - yield loss - - def test_simple_fc(self): - self.run_test( - callback=self.__network__, - feed={ - 'img1': numpy.random.random(size=(51, 784)).astype('float32'), - 'img2': numpy.random.random(size=(51, 784)).astype('float32') - }, - fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD']) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 95aafec0536..d10ea4e472f 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -35,11 +35,10 @@ dtype_to_size = { } SUB_BLOCK_OPS = [ - "while", "while_grad", "parallel_do", "parallel_do_grad", - "conditional_block", "conditional_block_grad" + "while", "while_grad", "conditional_block", "conditional_block_grad" ] -SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), +SUB_BLOCK_PAIR = [("while", "while_grad"), ("conditional_block", "conditional_block_grad")] PRINT_LOG = False -- GitLab From 36da940bc1ec69f1bdcb1d83c473136dc070fd87 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 13 Dec 2018 17:14:53 +0800 Subject: [PATCH 0197/2367] clean more test=develop --- python/paddle/fluid/layers/control_flow.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 21454370dd2..9d98e8333ba 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -235,8 +235,7 @@ class BlockGuardWithCompletion(BlockGuard): def __init__(self, rnn): if not isinstance(rnn, StaticRNN): - raise TypeError( - "BlockGuardWithCompletion takes a 
StaticRNN or ParallelDo") + raise TypeError("BlockGuardWithCompletion takes a StaticRNN") super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program) self.rnn = rnn -- GitLab From fc6ec6bd1425b01a130cefe7411422e8eb62a95d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 13 Dec 2018 17:43:53 +0800 Subject: [PATCH 0198/2367] add sparse mode adam --- paddle/fluid/operators/optimizers/adam_op.cc | 5 +++ paddle/fluid/operators/optimizers/adam_op.h | 41 +++++++++++++------ python/paddle/fluid/optimizer.py | 7 +++- .../fluid/tests/unittests/test_adam_op.py | 20 +++++---- 4 files changed, 51 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 5710cda39ac..b2c2e5c3254 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -110,6 +110,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { "(float, default 1.0e-8) " "Constant for numerical stability") .SetDefault(1.0e-8f); + AddAttr( + "sparse_mode", + "(bool, default false) " + "only update the parameter that has gradient in sparse update") + .SetDefault(false); AddComment(R"DOC( Adam Optimizer. diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 3455d1ee54e..ca5454ef040 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -177,12 +177,13 @@ struct SparseAdamFunctor { const int64_t* rows_; int64_t row_numel_; int64_t row_count_; + bool sparse_mode_; SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, T* mom2_out, const T* lr, const T* grad, const T* param, T* param_out, const int64_t* rows, - int64_t row_numel, int64_t row_count) + int64_t row_numel, int64_t row_count, bool sparse_mode) : beta1_(beta1), beta2_(beta2), epsilon_(epsilon), @@ -198,13 +199,10 @@ struct SparseAdamFunctor { param_out_(param_out), rows_(rows), row_numel_(row_numel), - row_count_(row_count) {} - - inline HOSTDEVICE void operator()(size_t i) const { - auto row_idx = - math::BinarySearch(rows_, row_count_, i / row_numel_); - T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; + row_count_(row_count), + sparse_mode_(sparse_mode) {} + inline HOSTDEVICE void sparse_update(size_t i, T g) const { // The following code is the same as dense T mom1 = moment1_[i]; T mom2 = moment2_[i]; @@ -225,6 +223,13 @@ struct SparseAdamFunctor { moment2_out_[i] = mom2; param_out_[i] = p; } + + inline HOSTDEVICE void operator()(size_t i) const { + auto row_idx = + math::BinarySearch(rows_, row_count_, i / row_numel_); + T g = row_idx >= 0 ? 
grad_[row_idx * row_numel_ + i % row_numel_] : 0; + sparse_update(i, g); + } }; template @@ -240,6 +245,7 @@ class AdamOpKernel : public framework::OpKernel { using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; + bool sparse_mode = ctx.Attr("sparse_mode"); T beta1 = static_cast(ctx.Attr("beta1")); T beta2 = static_cast(ctx.Attr("beta2")); T epsilon = static_cast(ctx.Attr("epsilon")); @@ -351,11 +357,22 @@ class AdamOpKernel : public framework::OpKernel { mom2_out.template mutable_data(ctx.GetPlace()), lr.template data(), grad_data, param.template data(), param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, - grad_merge.rows().size()); - platform::ForRange for_range( - static_cast(ctx.device_context()), - param.numel()); - for_range(functor); + grad_merge.rows().size(), sparse_mode); + if (sparse_mode) { + size_t row_count = grad_merge.rows().size(); + for (size_t row_index = 0; row_index < row_count; ++row_index) { + for (size_t offset = 0; offset < row_numel; ++offset) { + size_t i = rows[row_index] * row_numel + offset; + T g = grad_data[row_index * row_numel + offset]; + functor.sparse_update(i, g); + } + } + } else { + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } } else { PADDLE_THROW("Variable type not supported by adam_op"); } diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index da92826d410..9c7482bc40d 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -663,7 +663,8 @@ class AdamOptimizer(Optimizer): beta2=0.999, epsilon=1e-8, regularization=None, - name=None): + name=None, + sparse_mode=False): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -676,6 +677,7 @@ class AdamOptimizer(Optimizer): self._beta1 = beta1 self._beta2 = beta2 self._epsilon = epsilon + self._sparse_mode = sparse_mode def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -729,7 +731,8 @@ class AdamOptimizer(Optimizer): attrs={ "beta1": self._beta1, "beta2": self._beta2, - "epsilon": self._epsilon + "epsilon": self._epsilon, + "sparse_mode": self._sparse_mode }) return adam_op diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 5318d2f9766..da91875a145 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -194,7 +194,8 @@ def adam_step(inputs, attributes): return param_out, moment1_out, moment2_out -def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): +def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, + sparse_mode): ''' Simulate one step of the adam optimizer :param inputs: dict of inputs @@ -230,7 +231,7 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): class TestSparseAdamOp(unittest.TestCase): - def setup(self, scope, place): + def setup(self, scope, place, sparse_mode): beta1 = 0.78 beta2 = 0.836 epsilon = 1e-4 @@ -262,19 +263,21 @@ class TestSparseAdamOp(unittest.TestCase): self.sparse_inputs = ["Grad"] - param_out, mom1, mom2 = adam_step_sparse( - self.dense_inputs, self.attrs, height, rows, row_numel, np_array) + param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs, + height, rows, row_numel, + np_array, sparse_mode) self.outputs = { "ParamOut": param_out, "Moment1Out": mom1, "Moment2Out": mom2 } - def check_with_place(self, 
place):
+    def check_with_place(self, place, sparse_mode):
         scope = core.Scope()
-        self.setup(scope, place)
+        self.setup(scope, place, sparse_mode)
 
         op_args = dict()
+        op_args['sparse_mode'] = sparse_mode
         for key, np_array in self.dense_inputs.items():
             var = scope.var(key).get_tensor()
             var.set(np_array, place)
@@ -305,12 +308,13 @@ class TestSparseAdamOp(unittest.TestCase):
                                 0.00001)
                 j += 1
 
-    def test_sparse_sgd(self):
+    def test_sparse_adam(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
         for place in places:
-            self.check_with_place(place)
+            for sparse_mode in (True, False):
+                self.check_with_place(place, sparse_mode)
 
 
 if __name__ == "__main__":
-- 
GitLab


From f0df62f136396794556a121344a719e4c6fb62ef Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Thu, 13 Dec 2018 09:44:40 +0000
Subject: [PATCH 0199/2367] add more unittest case test=develop

---
 paddle/fluid/operators/py_func_op.cc          | 33 +++++++++++-------
 paddle/fluid/pybind/pybind.cc                 |  2 +-
 python/paddle/fluid/layers/nn.py              | 34 +++++++++++++------
 .../fluid/tests/unittests/test_py_func_op.py  | 17 +++++++++-
 4 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc
index 5d1aa7d7e65..1bee3d9351b 100644
--- a/paddle/fluid/operators/py_func_op.cc
+++ b/paddle/fluid/operators/py_func_op.cc
@@ -35,6 +35,9 @@ size_t AppendPythonCallableObjectAndReturnId(const py::object &py_obj) {
   return g_py_callables.size() - 1;
 }
 
+// Return py::object* instead of py::object.
+// Returning py::object would increase the reference count, and without
+// the GIL held, changing Python reference counts may not be safe.
 static py::object *GetPythonCallableObject(size_t i) {
   PADDLE_ENFORCE_LT(i, g_py_callables.size(), "Invalid python callable id");
   return &g_py_callables[i];
 }
 
@@ -47,7 +50,7 @@ static std::string PythonObjectToString(const py::object &py_callable) {
 
 static void CallPythonFunc(py::object *callable,
                            const std::vector<framework::LoDTensor> &ins,
-                           std::vector<framework::LoDTensor *> *out) {
+                           std::vector<framework::LoDTensor *> *outs) {
   py::gil_scoped_acquire guard;
   py::tuple in_args(ins.size());
   for (size_t i = 0; i < ins.size(); ++i) {
@@ -57,8 +60,8 @@ static void CallPythonFunc(py::object *callable,
   auto ret = (*callable)(*in_args);
   auto ret_tuple = py::cast<py::tuple>(ret);
   size_t ret_num = py::len(ret_tuple);
-  size_t out_num = out->size();
-  if (ret_num != out_num) {
+  size_t out_num = outs->size();
+  if (UNLIKELY(ret_num != out_num)) {
     // Python function has no return values or returns None
     // In this case, ret_num = 1 && ret[0] == None && out_num should be 0
     // Otherwise, ret_num must be equal to out_num
@@ -69,17 +72,18 @@
   }
 
   for (size_t i = 0; i < out_num; ++i) {
-    if ((*out)[i] == nullptr) {
+    auto *out = (*outs)[i];
+    if (out == nullptr) {
       continue;
     }
     try {
-      auto *out_tensor = py::cast<framework::LoDTensor *>(ret_tuple[i]);
-      PADDLE_ENFORCE_NOT_NULL(out_tensor,
+      auto *py_out_tensor = py::cast<framework::LoDTensor *>(ret_tuple[i]);
+      PADDLE_ENFORCE_NOT_NULL(py_out_tensor,
                               "Output tensor %d should not be nullptr", i);
-      (*out)[i]->set_lod(out_tensor->lod());
-      (*out)[i]->ShareDataWith(*out_tensor);
+      out->set_lod(py_out_tensor->lod());
+      out->ShareDataWith(*py_out_tensor);
     } catch (py::cast_error &) {
-      PADDLE_THROW("Output %d is not LoDTensor", i);
+      PADDLE_THROW("The %d-th output must be LoDTensor", i);
     }
   }
 }
@@ -94,6 +98,10 @@ class PyFuncOpShapeInference : public framework::InferShapeBase {
     PADDLE_ENFORCE_GE(ctx->Attrs().Get<int>(kForwardPythonCallableId), 0,
                       "Function id cannot be less than 0");
 
+
+    // Traverse
all outputs
+    // If name of any output ends with @GRAD,
+    // set its shape, dtype, lod_level, type to be the same as
+    // the corresponding forward variable
     auto *op = boost::get(ctx->GetOp());
     auto *block = op->Block();
     const std::string kGradVarSuffix = framework::kGradVarSuffix;
@@ -115,7 +123,7 @@ class PyFuncOpShapeInference : public framework::InferShapeBase {
       auto *in_var_desc = block->FindVarRecursive(fwd_var_name);
       PADDLE_ENFORCE_NOT_NULL(in_var_desc, "Forward variable %s not found",
                               fwd_var_name);
-      VLOG(10) << "Infer shape of Out(" << out_name << ") as Input("
+      VLOG(10) << "Infer shape of Output(" << out_name << ") as Input("
               << in_var_desc->Name() << ")";
       out_var_desc->SetShape(in_var_desc->GetShape());
       out_var_desc->SetDataType(in_var_desc->GetDataType());
@@ -135,7 +143,7 @@ class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker {
             "Index of registered forward Python function.")
        .SetDefault(0);
     AddAttr<int>(kBackwardPythonCallableId,
-            "Index of registered backward Python function")
+            "Index of registered backward Python function.")
        .SetDefault(-1);
     AddAttr<std::vector<std::string>>(kPyFuncBackwardSkipVars,
             "Unused forward in/out in backward op")
@@ -170,8 +178,7 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase {
     auto fwd_outs = Output("Out");
 
     // For memory reuse, some forward inputs/outputs may not be needed
-    // in backward part
-    // Just skip these vars
+    // in the backward part. Skipping these vars helps to save memory
     auto &backward_skip_var_list = boost::get<std::vector<std::string>>(
         fwd_attrs.at(kPyFuncBackwardSkipVars));
     std::unordered_set<std::string> backward_skip_var_set(
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 348a0739152..208efbea4a5 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -104,7 +104,7 @@ PYBIND11_MODULE(core, m) {
   BindException(&m);
 
   m.def(
-      "append_python_callable_object_and_return_id",
+      "_append_python_callable_object_and_return_id",
      [](py::object py_obj) -> size_t {
        return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj);
      });
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index db7ec9d021f..3cd0a2887e5 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -9137,8 +9137,13 @@ class PyFuncRegistry(object):
         self._func = func
 
         # find named args using reflection
-        self._named_args = inspect.getargspec(self._func)[0]
-        self._id = core.append_python_callable_object_and_return_id(self)
+        args = inspect.getargspec(self._func)
+        if len(args[0]) == 0 and args[1] is None and args[2] is None:
+            # Function with no inputs
+            self._named_args = None
+        else:
+            self._named_args = args[0]
+        self._id = core._append_python_callable_object_and_return_id(self)
         '''
         Why record self here?
@@ -9168,13 +9173,16 @@ class PyFuncRegistry(object):
         return self._id
 
     def __call__(self, *args):
-        kwargs = dict()
-        idx = 0
-        for arg in self._named_args:
-            kwargs[arg] = args[idx]
-            idx += 1
+        if self._named_args is None:
+            func_ret = self._func()
+        else:
+            kwargs = dict()
+            idx = 0
+            for arg in self._named_args:
+                kwargs[arg] = args[idx]
+                idx += 1
+            func_ret = self._func(*args[idx:], **kwargs)
 
-        func_ret = self._func(*args[idx:], **kwargs)
         if not isinstance(func_ret, (list, tuple)):
             func_ret = (func_ret, )
@@ -9207,14 +9215,18 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
     User should set the right data type and shape of :code:`out` before
     calling this function.
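    A minimal sketch of that setup (the names and shapes here are only
    illustrative; the pattern follows this op's unit test):

    .. code-block:: python

        import numpy as np
        import paddle.fluid as fluid

        def my_tanh(x):
            # executed in Python each time the op runs
            return np.tanh(x)

        x = fluid.layers.data(name='x', shape=[32], dtype='float32')
        # out must be pre-created with the dtype and shape the
        # Python function will return
        out = fluid.default_main_program().current_block().create_var(
            name='my_out', dtype='float32', shape=[-1, 32])
        fluid.layers.py_func(func=my_tanh, x=x, out=out)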
However, data types and shapes of gradients of - :code:`out` and :code:`x` would be infered automatically. + :code:`out` and :code:`x` would be inferred automatically. - The orders of inputs of :code:`backward_func` would be: forward input - :code:`x`, forward output :code:`out` and backward input gradient of + Input orders of :code:`backward_func` would be: forward inputs + :code:`x`, forward outputs :code:`out` and backward input gradients of :code:`out`. If some variables of :code:`out` have no gradient, the input tensor would be None in Python side. If some variables of :code:`in` have no gradient, users should return None. + This function can also be used to debug the running network. User can + add a :code:`py_func` operator without output, and print input + :code:`x` inside :code:`func`. + Args: func (callable): forward Python function. x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`. diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 491bbc21902..943ad3ed224 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -25,6 +25,14 @@ if fluid.core.is_compiled_with_cuda(): os.environ['CPU_NUM'] = str(dev_cnt) +def dummy_func_with_no_input(): + return float(1.0) + + +def dummy_func_with_no_output(x): + pass + + def tanh(x): return np.tanh(x) @@ -86,13 +94,20 @@ def simple_fc_net(img, label, use_py_func_op): else: loss = fluid.default_main_program().current_block().create_var( name='loss', dtype='float32', shape=[-1, 1]) - fluid.layers.py_func( + loss = fluid.layers.py_func( func=cross_entropy, x=[prediction, label], out=loss, backward_func=cross_entropy_grad, skip_vars_in_backward_input=loss) + dummy_var = fluid.default_main_program().current_block().create_var( + name='test_tmp_var', dtype='float32', shape=[1]) + fluid.layers.py_func( + func=dummy_func_with_no_input, x=None, out=dummy_var) + + fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None) + loss = fluid.layers.mean(loss) return loss -- GitLab From ad6ae0b071041c1f69c66c7c173733bfe7cb2752 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:39:46 +0800 Subject: [PATCH 0200/2367] 1. Add SpinLock 2. 
Separate the lock of kids and vars in Scope test=develop

---
 CMakeLists.txt                                     |  1 +
 cmake/external/robin_map.cmake                     | 31 +++++++
 .../framework/details/execution_strategy.h         |  2 +-
 .../scope_buffered_ssa_graph_executor.cc           |  9 +-
 paddle/fluid/framework/operator.cc                 |  6 +-
 paddle/fluid/framework/rw_lock.h                   | 91 +++++-------------
 paddle/fluid/framework/scope.cc                    | 58 ++++++------
 paddle/fluid/framework/scope.h                     | 15 ++-
 paddle/fluid/framework/spin_lock.h                 | 71 +++++++++++++++
 paddle/fluid/operators/optimizers/adam_op.h        | 17 ----
 paddle/fluid/pybind/pybind.cc                      |  2 +-
 python/paddle/fluid/optimizer.py                   | 43 +++++----
 12 files changed, 201 insertions(+), 145 deletions(-)
 create mode 100644 cmake/external/robin_map.cmake
 create mode 100644 paddle/fluid/framework/spin_lock.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e59aca2d93..2abbcef41a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -215,6 +215,7 @@ include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
+include(external/robin_map) # download tsl::robin_map
 
 if (NOT WIN32)
 # there is no official support of warpctc, nccl, cupti in windows
diff --git a/cmake/external/robin_map.cmake b/cmake/external/robin_map.cmake
new file mode 100644
index 00000000000..ddaf59536cb
--- /dev/null
+++ b/cmake/external/robin_map.cmake
@@ -0,0 +1,31 @@
+include(ExternalProject)
+
+set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map)
+set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include)
+
+include_directories(${ROBIN_MAP_INCLUDE_DIR})
+
+ExternalProject_Add(
+    extern_robin_map
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY "https://github.com/Tessil/robin-map.git"
+    GIT_TAG "v0.5.0"
+    PREFIX ${ROBIN_MAP_SOURCE_DIR}
+    UPDATE_COMMAND ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    INSTALL_COMMAND ""
+    TEST_COMMAND ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+    add_library(robin_map STATIC ${dummyfile})
+else()
+    add_library(robin_map INTERFACE)
+endif()
+
+add_dependencies(robin_map extern_robin_map)
+
+LIST(APPEND external_project_dependencies robin_map)
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index 15c496130c2..37b07e57363 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
   size_t num_threads_{0};
   bool use_cuda_{true};
   bool allow_op_delay_{false};
-  size_t num_iteration_per_drop_scope_{100};
+  size_t num_iteration_per_drop_scope_{1};
   ExecutorType type_{kDefault};
   bool dry_run_{false};
 };
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 499246a9856..9ded0266a9b 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -76,9 +76,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
                  : nullptr;
 #endif
 
-  if (!fetch_tensors.empty() ||
-      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
-    drop_scope_counter_ = 0;
+  if (!fetch_tensors.empty()) {
     // Wait All computational streams
     for (auto p : places_) {
       platform::DeviceContextPool::Instance().Get(p)->Wait();
@@ -91,12 +89,17 @@
FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } #endif } + } + + if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + drop_scope_counter_ = 0; for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } } + if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f7..58e5926f544 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -163,11 +163,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { } bool OperatorBase::HasInputs(const std::string& name) const { - if (inputs_.find(name) != inputs_.end()) { - return true; - } else { - return false; - } + return inputs_.find(name) != inputs_.end(); } std::string OperatorBase::Input(const std::string& name) const { diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index dd918fcdfa6..75e6bef9bf3 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -31,17 +31,17 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } - void RDLock() { + inline void RDLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); } - void WRLock() { + inline void WRLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); } - void UNLock() { + inline void UNLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); } @@ -54,86 +54,43 @@ struct RWLock { // In windows, rw_lock seems like a hack. Use empty object and do nothing. struct RWLock { // FIXME(minqiyang): use mutex here to do fake lock - void RDLock() { mutex_.lock(); } + inline void RDLock() { mutex_.lock(); } - void WRLock() { mutex_.lock(); } + inline void WRLock() { mutex_.lock(); } - void UNLock() { mutex_.unlock(); } + inline void UNLock() { mutex_.unlock(); } private: std::mutex mutex_; }; #endif -class RWLockGuard { +class AutoWRLock { public: - enum Status { kUnLock, kWRLock, kRDLock }; - - RWLockGuard(RWLock* rw_lock, Status init_status) - : lock_(rw_lock), status_(Status::kUnLock) { - switch (init_status) { - case Status::kRDLock: { - RDLock(); - break; - } - case Status::kWRLock: { - WRLock(); - break; - } - case Status::kUnLock: { - break; - } - } - } + explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - void WRLock() { - switch (status_) { - case Status::kUnLock: { - lock_->WRLock(); - status_ = Status::kWRLock; - break; - } - case Status::kWRLock: { - break; - } - case Status::kRDLock: { - PADDLE_THROW( - "Please unlock read lock first before invoking write lock."); - break; - } - } - } + inline void Lock() { lock_->WRLock(); } - void RDLock() { - switch (status_) { - case Status::kUnLock: { - lock_->RDLock(); - status_ = Status::kRDLock; - break; - } - case Status::kRDLock: { - break; - } - case Status::kWRLock: { - PADDLE_THROW( - "Please unlock write lock first before invoking read lock."); - break; - } - } - } + inline void UnLock() { lock_->UNLock(); } - void UnLock() { - if (status_ != Status::kUnLock) { - lock_->UNLock(); - status_ = Status::kUnLock; - } - } + ~AutoWRLock() { UnLock(); } + + private: + RWLock* lock_; +}; + +class AutoRDLock { + public: + explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + + inline void Lock() { lock_->RDLock(); } + + inline void UnLock() { lock_->UNLock(); } - ~RWLockGuard() { UnLock(); } + ~AutoRDLock() 
{ UnLock(); } private: RWLock* lock_; - Status status_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 190a057d9e4..f05208c5ec9 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include #include "glog/logging.h" -#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(benchmark, false, @@ -43,13 +42,15 @@ DEFINE_double( // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. #ifdef PADDLE_ON_INFERENCE -#define SCOPE_READER_LOCK -#define SCOPE_WRITER_LOCK +#define SCOPE_KIDS_READER_LOCK +#define SCOPE_KIDS_WRITER_LOCK +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK #else -// TODO(minqiyang): use rwlock in all platforms, now rwlock is a fake one -// in _WIN32 platform -#define SCOPE_READER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kRDLock); -#define SCOPE_WRITER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kWRLock); +#define SCOPE_KIDS_READER_LOCK AutoRDLock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock(&vars_lock_); #endif namespace paddle { @@ -65,64 +66,69 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - SCOPE_WRITER_LOCK - kids_.push_back(new Scope(this)); - return *kids_.back(); + Scope* child = new Scope(this); + { + SCOPE_KIDS_WRITER_LOCK + kids_.push_back(child); + } + return *child; } Variable* Scope::Var(const std::string& name) { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK return VarInternal(name); } Variable* Scope::Var(std::string* name) { - SCOPE_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; } + SCOPE_VARS_WRITER_LOCK return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_WRITER_LOCK + SCOPE_KIDS_WRITER_LOCK for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_READER_LOCK + SCOPE_KIDS_READER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - SCOPE_READER_LOCK std::vector known_vars; - known_vars.reserve(this->vars_.size()); - for (auto& p : vars_) { - known_vars.emplace_back(p.first); + { + SCOPE_VARS_READER_LOCK + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } } return known_vars; } void Scope::DeleteScope(Scope* scope) const { - SCOPE_WRITER_LOCK + SCOPE_KIDS_WRITER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -136,8 +142,8 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - SCOPE_WRITER_LOCK std::set var_set(var_names.begin(), var_names.end()); + SCOPE_VARS_WRITER_LOCK 
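+  // var_set is built from var_names above, before this write lock is
+  // taken, so the string copies stay outside the critical section and
+  // only the erase loop below runs while holding vars_lock_.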
for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { it = vars_.erase(it); @@ -149,12 +155,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; @@ -188,7 +194,7 @@ void Scope::RenameInternal(const std::string& origin_name, auto new_it = vars_.find(new_name); PADDLE_ENFORCE(new_it == vars_.end(), "The variable with name %s is already in the scope", new_name); - vars_[new_name].reset(origin_it->second.release()); + vars_[new_name].reset(origin_it.value().release()); vars_.erase(origin_it); } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index c140212c3e4..78ad8be500a 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,11 +14,15 @@ limitations under the License. */ #pragma once +#include #include +#include #include -#include +#include #include +#include // NOLINT + #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -94,7 +98,11 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map> vars_; + mutable tsl::robin_map< + std::string, std::unique_ptr, std::hash, + std::equal_to, + std::allocator>>, true> + vars_; private: // Call Scope::NewScope for a sub-scope. @@ -123,7 +131,8 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable RWLock rw_lock_; + mutable RWLock kids_lock_; + mutable RWLock vars_lock_; }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/spin_lock.h b/paddle/fluid/framework/spin_lock.h new file mode 100644 index 00000000000..11a763d655a --- /dev/null +++ b/paddle/fluid/framework/spin_lock.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#if !defined(_WIN32)
+#include <pthread.h>
+#else
+#include <mutex>  // NOLINT
+#endif  // !_WIN32
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+#if !defined(_WIN32)
+struct SpinLock {
+  SpinLock() { pthread_spin_init(&lock_, PTHREAD_PROCESS_PRIVATE); }
+
+  ~SpinLock() { pthread_spin_destroy(&lock_); }
+
+  void Lock() {
+    PADDLE_ENFORCE_EQ(pthread_spin_lock(&lock_), 0, "acquire spin lock failed");
+  }
+
+  void Unlock() {
+    PADDLE_ENFORCE_EQ(pthread_spin_unlock(&lock_), 0,
+                      "release spin lock failed");
+  }
+
+ private:
+  pthread_spinlock_t lock_;
+};
+#else
+// FIXME(minqiyang): use mutex here to do fake spin lock
+struct SpinLock {
+  void Lock() { mutex_.lock(); }
+
+  void Unlock() { mutex_.unlock(); }
+
+ private:
+  std::mutex mutex_;
+};
+#endif
+
+class AutoSpinLock {
+ public:
+  explicit AutoSpinLock(SpinLock* spin_lock) : lock_(spin_lock) {
+    lock_->Lock();
+  }
+
+  ~AutoSpinLock() { lock_->Unlock(); }
+
+ private:
+  SpinLock* lock_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 2205f473f23..3455d1ee54e 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -292,23 +292,6 @@ class AdamOpKernel : public framework::OpKernel<T> {
             static_cast<const DeviceContext &>(ctx.device_context()),
             param.numel());
         for_range(functor);
-
-        auto& dev =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-
-        const LoDTensor* beta1_pow_ptr = ctx.Input<LoDTensor>("Beta1Pow");
-        auto eigen_in_beta1_pow =
-            framework::EigenVector<T>::Flatten(*beta1_pow_ptr);
-        auto eigen_out_beta1_pow = framework::EigenVector<T>::Flatten(
-            *(const_cast<LoDTensor*>(beta1_pow_ptr)));
-        eigen_out_beta1_pow.device(dev) = beta1 * eigen_in_beta1_pow;
-
-        const LoDTensor* beta2_pow_ptr = ctx.Input<LoDTensor>("Beta2Pow");
-        auto eigen_in_beta2_pow =
-            framework::EigenVector<T>::Flatten(*beta2_pow_ptr);
-        auto eigen_out_beta2_pow = framework::EigenVector<T>::Flatten(
-            *(const_cast<LoDTensor*>(beta2_pow_ptr)));
-        eigen_out_beta2_pow.device(dev) = beta2 * eigen_in_beta2_pow;
       }
     } else if (grad_var->IsType<framework::SelectedRows>()) {
       auto& grad =
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 58ef3da0b23..f831f2313e4 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -765,7 +765,7 @@ All parameter, weight, gradient are variables in Paddle.
         R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
             many iterations to clean up the temp variables which
             is generated during execution. It may make the execution faster,
-            because the temp variable's shape maybe the same between two iterations. Default 100.
+            because the temp variable's shape may be the same between two iterations. Default 1.
 
             NOTES:
                 1. If you fetch data when calling the 'run', the ParallelExecutor
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 1930ac106b2..da92826d410 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -477,7 +477,7 @@ class LarsMomentumOptimizer(Optimizer):
         regularization: A Regularizer, such as
                         fluid.regularizer.L2DecayRegularizer.
         name: An optional name prefix.
-
+
         Examples:
             ..
code-block:: python @@ -739,27 +739,26 @@ class AdamOptimizer(Optimizer): """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - # for param, grad in param_and_grads: - - # if grad is None: - # continue - # with param.block.program._optimized_guard( - # [param, grad]), name_scope("optimizer"): - # beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - # param) - # beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - # param) - # main_block.append_op( - # type="scale", - # inputs={"X": beta1_pow_acc}, - # outputs={"Out": beta1_pow_acc}, - # attrs={"scale": self._beta1}) - - # main_block.append_op( - # type="scale", - # inputs={"X": beta2_pow_acc}, - # outputs={"Out": beta2_pow_acc}, - # attrs={"scale": self._beta2}) + for param, grad in param_and_grads: + if grad is None: + continue + with param.block.program._optimized_guard( + [param, grad]), name_scope("optimizer"): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param) + main_block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}) + + main_block.append_op( + type="scale", + inputs={"X": beta2_pow_acc}, + outputs={"Out": beta2_pow_acc}, + attrs={"scale": self._beta2}) class AdamaxOptimizer(Optimizer): -- GitLab From a81495d6f4a71980b51cc3099f8cd76885cdcb13 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:45:20 +0800 Subject: [PATCH 0201/2367] Fix code --- paddle/fluid/framework/scope.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index f05208c5ec9..d2856a07a16 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include #include "glog/logging.h" +#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(benchmark, false, @@ -47,10 +48,10 @@ DEFINE_double( #define SCOPE_VARS_READER_LOCK #define SCOPE_VARS_WRITER_LOCK #else -#define SCOPE_KIDS_READER_LOCK AutoRDLock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock(&vars_lock_); +#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); #endif namespace paddle { -- GitLab From 19a798018f82b9eaa31aa8d84f8aa4306bbf8973 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:51:28 +0800 Subject: [PATCH 0202/2367] Remove dup cmake test=develop --- CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bf724e8aa98..1b2e0ecf6c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,12 +81,6 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) -if (WITH_PROFILER) - find_package(Gperftools REQUIRED) - include_directories(${GPERFTOOLS_INCLUDE_DIR}) - add_definitions(-DWITH_GPERFTOOLS) -endif() - # PY_VERSION if(NOT PY_VERSION) set(PY_VERSION 2.7) -- GitLab From d839bd0dd4ffecaa061aed32684a1a0b09f28d30 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 19:14:35 +0800 Subject: [PATCH 0203/2367] simple commit --- paddle/fluid/framework/async_executor.h | 52 +++++++++++-------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 12642126411..a82e9415596 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -17,13 +17,13 @@ limitations under the License. 
*/
 
 #include
 #include
 #include
-#include  // NOLINT
+#include  // NOLINT
+#include  // local_random_engine
 #include
 #include
 #include  // NOLINT
 #include
 #include
-#include  // local_random_engine
 #include "paddle/fluid/framework/data_feed.pb.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/executor_thread_worker.h"
@@ -34,24 +34,23 @@
 namespace paddle {
 namespace framework {
 
 inline double current_realtime() {
-    struct timespec tp;
-    clock_gettime(CLOCK_REALTIME, &tp);
-    return tp.tv_sec + tp.tv_nsec * 1e-9;
+  struct timespec tp;
+  clock_gettime(CLOCK_REALTIME, &tp);
+  return tp.tv_sec + tp.tv_nsec * 1e-9;
 }
 
 inline std::default_random_engine& local_random_engine() {
-    struct engine_wrapper_t {
-        std::default_random_engine engine;
-        engine_wrapper_t() {
-            static std::atomic x(0);
-            std::seed_seq sseq = {x++, x++, x++,
-                                  static_cast(
-                                      current_realtime() * 1000)};
-            engine.seed(sseq);
-        }
-    };
-    thread_local engine_wrapper_t r;
-    return r.engine;
+  struct engine_wrapper_t {
+    std::default_random_engine engine;
+    engine_wrapper_t() {
+      static std::atomic x(0);
+      std::seed_seq sseq = {x++, x++, x++,
+                            static_cast(current_realtime() * 1000)};
+      engine.seed(sseq);
+    }
+  };
+  thread_local engine_wrapper_t r;
+  return r.engine;
 }
 
 class AsyncExecutor {
@@ -63,14 +62,12 @@ class AsyncExecutor {
                   const std::vector& filelist,
                   const int thread_num,
                   const std::vector& fetch_names,
-                  const std::string& mode,
-                  const bool debug = false);
+                  const std::string& mode, const bool debug = false);
 #ifdef PADDLE_WITH_PSLIB
   void InitServer(const std::string& dist_desc, int index);
-  void InitWorker(
-      const std::string& dist_desc,
-      const std::vector& host_sign_list,
-      int node_num, int index);
+  void InitWorker(const std::string& dist_desc,
+                  const std::vector& host_sign_list, int node_num,
+                  int index);
   uint64_t StartServer();
   void StopServer();
   void GatherServers(const std::vector& host_sign_list, int node_num);
@@ -92,19 +89,16 @@ class AsyncExecutor {
  public:
 #ifdef PADDLE_WITH_PSLIB
-  std::shared_ptr _pslib_ptr;
-  std::shared_ptr _pull_dense_thread;
+  std::shared_ptr _pslib_ptr;
+  std::shared_ptr _pull_dense_thread;
   AsyncWorkerParamConfig _param_config;
 #endif
   Scope* root_scope_;
   platform::Place place_;
-
+
 private:
   int actual_thread_num;
-
 };
-
-
 }  // namespace framework
 }  // namespace paddle
-- 
GitLab


From f6c30863295de7d3a989c21d7d8e1427c888c301 Mon Sep 17 00:00:00 2001
From: dongdaxiang
Date: Thu, 13 Dec 2018 19:44:46 +0800
Subject: [PATCH 0204/2367] add copyright checker to ps_pb2.py

---
 python/paddle/fluid/distributed/ps_pb2.py | 3490 +++++++++++++--------
 1 file changed, 2140 insertions(+), 1350 deletions(-)

diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py
index 978b18d0d5e..0d226c4d593 100644
--- a/python/paddle/fluid/distributed/ps_pb2.py
+++ b/python/paddle/fluid/distributed/ps_pb2.py
@@ -1,8 +1,21 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 # Generated by the protocol buffer compiler.
DO NOT EDIT! # source: ps.proto import sys -_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1')) from google.protobuf.internal import enum_type_wrapper from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message @@ -13,104 +25,115 @@ from google.protobuf import descriptor_pb2 _sym_db = _symbol_database.Default() - - - DESCRIPTOR = _descriptor.FileDescriptor( - name='ps.proto', - package='paddle', - syntax='proto2', - serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 
\x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') -) + name='ps.proto', + 
package='paddle', + syntax='proto2', + serialized_pb=_b( + '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 
\x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01' + )) _sym_db.RegisterFileDescriptor(DESCRIPTOR) _TABLETYPE = _descriptor.EnumDescriptor( - name='TableType', - full_name='paddle.TableType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='PS_SPARSE_TABLE', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_DENSE_TABLE', index=1, number=1, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=3286, - serialized_end=3338, -) + name='TableType', + full_name='paddle.TableType', + 
filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_SPARSE_TABLE', index=0, number=0, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='PS_DENSE_TABLE', index=1, number=1, options=None, type=None), + ], + containing_type=None, + options=None, + serialized_start=3286, + serialized_end=3338, ) _sym_db.RegisterEnumDescriptor(_TABLETYPE) TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE) _PSCMDID = _descriptor.EnumDescriptor( - name='PsCmdID', - full_name='paddle.PsCmdID', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='PS_PULL_DENSE_TABLE', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_PUSH_DENSE_TABLE', index=1, number=1, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_PULL_SPARSE_TABLE', index=2, number=2, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_PUSH_SPARSE_TABLE', index=3, number=3, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_SHRINK_TABLE', index=4, number=4, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_SAVE_ONE_TABLE', index=5, number=5, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_SAVE_ALL_TABLE', index=6, number=6, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_LOAD_ONE_TABLE', index=7, number=7, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_LOAD_ALL_TABLE', index=8, number=8, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_CLEAR_ONE_TABLE', index=9, number=9, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_CLEAR_ALL_TABLE', index=10, number=10, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_PUSH_DENSE_PARAM', index=11, number=11, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_STOP_SERVER', index=12, number=12, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=3341, - serialized_end=3658, -) + name='PsCmdID', + full_name='paddle.PsCmdID', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_PULL_DENSE_TABLE', + index=0, + number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_TABLE', + index=1, + number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PULL_SPARSE_TABLE', + index=2, + number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_SPARSE_TABLE', + index=3, + number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SHRINK_TABLE', index=4, number=4, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ONE_TABLE', + index=5, + number=5, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ALL_TABLE', + index=6, + number=6, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ONE_TABLE', + index=7, + number=7, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ALL_TABLE', + index=8, + number=8, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ONE_TABLE', + index=9, + number=9, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ALL_TABLE', + index=10, + number=10, + options=None, + type=None), + 
_descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_PARAM', + index=11, + number=11, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_STOP_SERVER', index=12, number=12, options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=3341, + serialized_end=3658, ) _sym_db.RegisterEnumDescriptor(_PSCMDID) PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID) @@ -130,1377 +153,2144 @@ PS_CLEAR_ALL_TABLE = 10 PS_PUSH_DENSE_PARAM = 11 PS_STOP_SERVER = 12 - _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( - name='FsApiType', - full_name='paddle.FsClientParameter.FsApiType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='HDFS', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='AFS', index=1, number=1, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=3254, - serialized_end=3284, -) + name='FsApiType', + full_name='paddle.FsClientParameter.FsApiType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='HDFS', index=0, number=0, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='AFS', index=1, number=1, options=None, type=None), + ], + containing_type=None, + options=None, + serialized_start=3254, + serialized_end=3284, ) _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) - _PSPARAMETER = _descriptor.Descriptor( - name='PSParameter', - full_name='paddle.PSParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='worker_class', full_name='paddle.PSParameter.worker_class', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='server_class', full_name='paddle.PSParameter.server_class', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='instance_class', full_name='paddle.PSParameter.instance_class', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='worker_param', full_name='paddle.PSParameter.worker_param', index=3, - number=101, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='server_param', full_name='paddle.PSParameter.server_param', index=4, - number=102, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='trainer_param', full_name='paddle.PSParameter.trainer_param', index=5, - number=301, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, 
-
 _PSPARAMETER = _descriptor.Descriptor(
-  name='PSParameter',
-  full_name='paddle.PSParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='worker_class', full_name='paddle.PSParameter.worker_class', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='server_class', full_name='paddle.PSParameter.server_class', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='instance_class', full_name='paddle.PSParameter.instance_class', index=2,
-      number=3, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='worker_param', full_name='paddle.PSParameter.worker_param', index=3,
-      number=101, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='server_param', full_name='paddle.PSParameter.server_param', index=4,
-      number=102, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='trainer_param', full_name='paddle.PSParameter.trainer_param', index=5,
-      number=301, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='fs_client_param', full_name='paddle.PSParameter.fs_client_param', index=6,
-      number=501, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=21,
-  serialized_end=307,
-)
-
+    name='PSParameter',
+    full_name='paddle.PSParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='worker_class',
+            full_name='paddle.PSParameter.worker_class',
+            index=0,
+            number=1,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='server_class',
+            full_name='paddle.PSParameter.server_class',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='instance_class',
+            full_name='paddle.PSParameter.instance_class',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='worker_param',
+            full_name='paddle.PSParameter.worker_param',
+            index=3,
+            number=101,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='server_param',
+            full_name='paddle.PSParameter.server_param',
+            index=4,
+            number=102,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='trainer_param',
+            full_name='paddle.PSParameter.trainer_param',
+            index=5,
+            number=301,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='fs_client_param',
+            full_name='paddle.PSParameter.fs_client_param',
+            index=6,
+            number=501,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=21,
+    serialized_end=307, )
 _WORKERPARAMETER = _descriptor.Descriptor(
-  name='WorkerParameter',
-  full_name='paddle.WorkerParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-
name='downpour_worker_param', full_name='paddle.WorkerParameter.downpour_worker_param', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=309, - serialized_end=390, -) - + name='WorkerParameter', + full_name='paddle.WorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_worker_param', + full_name='paddle.WorkerParameter.downpour_worker_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=309, + serialized_end=390, ) _SERVERPARAMETER = _descriptor.Descriptor( - name='ServerParameter', - full_name='paddle.ServerParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='downpour_server_param', full_name='paddle.ServerParameter.downpour_server_param', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=392, - serialized_end=473, -) - + name='ServerParameter', + full_name='paddle.ServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_server_param', + full_name='paddle.ServerParameter.downpour_server_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=392, + serialized_end=473, ) _DOWNPOURWORKERPARAMETER = _descriptor.Descriptor( - name='DownpourWorkerParameter', - full_name='paddle.DownpourWorkerParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='downpour_table_param', full_name='paddle.DownpourWorkerParameter.downpour_table_param', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=475, - serialized_end=554, -) - + name='DownpourWorkerParameter', + full_name='paddle.DownpourWorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + 
_descriptor.FieldDescriptor( + name='downpour_table_param', + full_name='paddle.DownpourWorkerParameter.downpour_table_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=475, + serialized_end=554, ) _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( - name='DownpourTrainerParameter', - full_name='paddle.DownpourTrainerParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='dense_table', full_name='paddle.DownpourTrainerParameter.dense_table', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='sparse_table', full_name='paddle.DownpourTrainerParameter.sparse_table', index=1, - number=2, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='push_sparse_per_batch', full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch', index=2, - number=3, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='push_dense_per_batch', full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', index=3, - number=4, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='skip_op', full_name='paddle.DownpourTrainerParameter.skip_op', index=4, - number=5, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=557, - serialized_end=763, -) - + name='DownpourTrainerParameter', + full_name='paddle.DownpourTrainerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='dense_table', + full_name='paddle.DownpourTrainerParameter.dense_table', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_table', + full_name='paddle.DownpourTrainerParameter.sparse_table', + index=1, + number=2, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_sparse_per_batch', + 
full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch', + index=2, + number=3, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_dense_per_batch', + full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='skip_op', + full_name='paddle.DownpourTrainerParameter.skip_op', + index=4, + number=5, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=557, + serialized_end=763, ) _DENSETABLEPARAMETER = _descriptor.Descriptor( - name='DenseTableParameter', - full_name='paddle.DenseTableParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='table_id', full_name='paddle.DenseTableParameter.table_id', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='dense_variable_name', full_name='paddle.DenseTableParameter.dense_variable_name', index=1, - number=2, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='dense_gradient_variable_name', full_name='paddle.DenseTableParameter.dense_gradient_variable_name', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='fea_dim', full_name='paddle.DenseTableParameter.fea_dim', index=3, - number=4, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=765, - serialized_end=888, -) - + name='DenseTableParameter', + full_name='paddle.DenseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.DenseTableParameter.table_id', + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_variable_name', + full_name='paddle.DenseTableParameter.dense_variable_name', + index=1, + number=2, + type=9, + cpp_type=9, + 
label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_gradient_variable_name', + full_name='paddle.DenseTableParameter.dense_gradient_variable_name', + index=2, + number=3, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', + full_name='paddle.DenseTableParameter.fea_dim', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=765, + serialized_end=888, ) _SPARSETABLEPARAMETER = _descriptor.Descriptor( - name='SparseTableParameter', - full_name='paddle.SparseTableParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='table_id', full_name='paddle.SparseTableParameter.table_id', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='feature_dim', full_name='paddle.SparseTableParameter.feature_dim', index=1, - number=2, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='slot_key', full_name='paddle.SparseTableParameter.slot_key', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='slot_value', full_name='paddle.SparseTableParameter.slot_value', index=3, - number=4, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='slot_gradient', full_name='paddle.SparseTableParameter.slot_gradient', index=4, - number=5, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=890, - serialized_end=1012, -) - + name='SparseTableParameter', + full_name='paddle.SparseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.SparseTableParameter.table_id', + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + 
options=None), + _descriptor.FieldDescriptor( + name='feature_dim', + full_name='paddle.SparseTableParameter.feature_dim', + index=1, + number=2, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_key', + full_name='paddle.SparseTableParameter.slot_key', + index=2, + number=3, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_value', + full_name='paddle.SparseTableParameter.slot_value', + index=3, + number=4, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_gradient', + full_name='paddle.SparseTableParameter.slot_gradient', + index=4, + number=5, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=890, + serialized_end=1012, ) _DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( - name='DownpourServerParameter', - full_name='paddle.DownpourServerParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='downpour_table_param', full_name='paddle.DownpourServerParameter.downpour_table_param', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='service_param', full_name='paddle.DownpourServerParameter.service_param', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1015, - serialized_end=1149, -) - + name='DownpourServerParameter', + full_name='paddle.DownpourServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', + full_name='paddle.DownpourServerParameter.downpour_table_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_param', + full_name='paddle.DownpourServerParameter.service_param', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + 
nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1015, + serialized_end=1149, ) _SERVERSERVICEPARAMETER = _descriptor.Descriptor( - name='ServerServiceParameter', - full_name='paddle.ServerServiceParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='server_class', full_name='paddle.ServerServiceParameter.server_class', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("DownpourBrpcPsServer").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='client_class', full_name='paddle.ServerServiceParameter.client_class', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("DownpourBrpcPsClient").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='service_class', full_name='paddle.ServerServiceParameter.service_class', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("DownpourPsService").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='start_server_port', full_name='paddle.ServerServiceParameter.start_server_port', index=3, - number=4, type=13, cpp_type=3, label=1, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='server_thread_num', full_name='paddle.ServerServiceParameter.server_thread_num', index=4, - number=5, type=13, cpp_type=3, label=1, - has_default_value=True, default_value=12, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1152, - serialized_end=1367, -) - + name='ServerServiceParameter', + full_name='paddle.ServerServiceParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='server_class', + full_name='paddle.ServerServiceParameter.server_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourBrpcPsServer").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_class', + full_name='paddle.ServerServiceParameter.client_class', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourBrpcPsClient").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_class', + full_name='paddle.ServerServiceParameter.service_class', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourPsService").decode('utf-8'), + message_type=None, + 
enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='start_server_port',
+            full_name='paddle.ServerServiceParameter.start_server_port',
+            index=3,
+            number=4,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=True,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='server_thread_num',
+            full_name='paddle.ServerServiceParameter.server_thread_num',
+            index=4,
+            number=5,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=True,
+            default_value=12,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1152,
+    serialized_end=1367, )
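[Editor's note: several ServerServiceParameter fields above declare explicit proto2 defaults (`has_default_value=True`): the class names default to the Downpour BRPC implementations and `server_thread_num` defaults to 12. A default-constructed message therefore reports those values without any field being set; a small sketch, with the `ps_pb2` module name again an assumption:

    import ps_pb2

    svc = ps_pb2.ServerServiceParameter()
    print(svc.server_class)              # 'DownpourBrpcPsServer' (declared default)
    print(svc.server_thread_num)         # 12 (declared default)
    print(svc.HasField('server_class'))  # False: a default is not an explicitly set field
]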
 _TABLEPARAMETER = _descriptor.Descriptor(
-  name='TableParameter',
-  full_name='paddle.TableParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='table_id', full_name='paddle.TableParameter.table_id', index=0,
-      number=1, type=4, cpp_type=4, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='table_class', full_name='paddle.TableParameter.table_class', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='shared_num', full_name='paddle.TableParameter.shared_num', index=2,
-      number=3, type=4, cpp_type=4, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='accessor', full_name='paddle.TableParameter.accessor', index=3,
-      number=4, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='type', full_name='paddle.TableParameter.type', index=4,
-      number=5, type=14, cpp_type=8, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='compress_in_save', full_name='paddle.TableParameter.compress_in_save', index=5,
-      number=6, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=False,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1370,
-  serialized_end=1561,
-)
-
+    name='TableParameter',
+    full_name='paddle.TableParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='table_id',
+            full_name='paddle.TableParameter.table_id',
+            index=0,
+            number=1,
+            type=4,
+            cpp_type=4,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='table_class',
+            full_name='paddle.TableParameter.table_class',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='shared_num',
+            full_name='paddle.TableParameter.shared_num',
+            index=2,
+            number=3,
+            type=4,
+            cpp_type=4,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='accessor',
+            full_name='paddle.TableParameter.accessor',
+            index=3,
+            number=4,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='type',
+            full_name='paddle.TableParameter.type',
+            index=4,
+            number=5,
+            type=14,
+            cpp_type=8,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='compress_in_save',
+            full_name='paddle.TableParameter.compress_in_save',
+            index=5,
+            number=6,
+            type=8,
+            cpp_type=7,
+            label=1,
+            has_default_value=True,
+            default_value=False,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1370,
+    serialized_end=1561, )
 _TABLEACCESSORPARAMETER = _descriptor.Descriptor(
-  name='TableAccessorParameter',
-  full_name='paddle.TableAccessorParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='accessor_class', full_name='paddle.TableAccessorParameter.accessor_class', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='sparse_sgd_param', full_name='paddle.TableAccessorParameter.sparse_sgd_param', index=1,
-      number=2, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='dense_sgd_param', full_name='paddle.TableAccessorParameter.dense_sgd_param', index=2,
-      number=3, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='fea_dim', full_name='paddle.TableAccessorParameter.fea_dim', index=3,
-      number=4, type=13, cpp_type=3, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None,
containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='embedx_dim', full_name='paddle.TableAccessorParameter.embedx_dim', index=4, - number=5, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='embedx_threshold', full_name='paddle.TableAccessorParameter.embedx_threshold', index=5, - number=6, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='downpour_accessor_param', full_name='paddle.TableAccessorParameter.downpour_accessor_param', index=6, - number=7, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='table_accessor_save_param', full_name='paddle.TableAccessorParameter.table_accessor_save_param', index=7, - number=8, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1564, - serialized_end=1933, -) - + name='TableAccessorParameter', + full_name='paddle.TableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='accessor_class', + full_name='paddle.TableAccessorParameter.accessor_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_sgd_param', + full_name='paddle.TableAccessorParameter.sparse_sgd_param', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_sgd_param', + full_name='paddle.TableAccessorParameter.dense_sgd_param', + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', + full_name='paddle.TableAccessorParameter.fea_dim', + index=3, + number=4, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_dim', + full_name='paddle.TableAccessorParameter.embedx_dim', + index=4, + number=5, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + 
_descriptor.FieldDescriptor( + name='embedx_threshold', + full_name='paddle.TableAccessorParameter.embedx_threshold', + index=5, + number=6, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='downpour_accessor_param', + full_name='paddle.TableAccessorParameter.downpour_accessor_param', + index=6, + number=7, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_accessor_save_param', + full_name='paddle.TableAccessorParameter.table_accessor_save_param', + index=7, + number=8, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1564, + serialized_end=1933, ) _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( - name='DownpourTableAccessorParameter', - full_name='paddle.DownpourTableAccessorParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='nonclk_coeff', full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff', index=0, - number=1, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='click_coeff', full_name='paddle.DownpourTableAccessorParameter.click_coeff', index=1, - number=2, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='base_threshold', full_name='paddle.DownpourTableAccessorParameter.base_threshold', index=2, - number=3, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='delta_threshold', full_name='paddle.DownpourTableAccessorParameter.delta_threshold', index=3, - number=4, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='delta_keep_days', full_name='paddle.DownpourTableAccessorParameter.delta_keep_days', index=4, - number=5, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='show_click_decay_rate', full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate', index=5, - number=6, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, 
extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='delete_threshold', full_name='paddle.DownpourTableAccessorParameter.delete_threshold', index=6, - number=7, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1936, - serialized_end=2142, -) - + name='DownpourTableAccessorParameter', + full_name='paddle.DownpourTableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='nonclk_coeff', + full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff', + index=0, + number=1, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='click_coeff', + full_name='paddle.DownpourTableAccessorParameter.click_coeff', + index=1, + number=2, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='base_threshold', + full_name='paddle.DownpourTableAccessorParameter.base_threshold', + index=2, + number=3, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_threshold', + full_name='paddle.DownpourTableAccessorParameter.delta_threshold', + index=3, + number=4, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_keep_days', + full_name='paddle.DownpourTableAccessorParameter.delta_keep_days', + index=4, + number=5, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='show_click_decay_rate', + full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate', + index=5, + number=6, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delete_threshold', + full_name='paddle.DownpourTableAccessorParameter.delete_threshold', + index=6, + number=7, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1936, + serialized_end=2142, ) _TABLEACCESSORSAVEPARAMETER = 
_descriptor.Descriptor( - name='TableAccessorSaveParameter', - full_name='paddle.TableAccessorSaveParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='param', full_name='paddle.TableAccessorSaveParameter.param', index=0, - number=1, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='converter', full_name='paddle.TableAccessorSaveParameter.converter', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='deconverter', full_name='paddle.TableAccessorSaveParameter.deconverter', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2144, - serialized_end=2227, -) - + name='TableAccessorSaveParameter', + full_name='paddle.TableAccessorSaveParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='param', + full_name='paddle.TableAccessorSaveParameter.param', + index=0, + number=1, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='converter', + full_name='paddle.TableAccessorSaveParameter.converter', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='deconverter', + full_name='paddle.TableAccessorSaveParameter.deconverter', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2144, + serialized_end=2227, ) _PSREQUESTMESSAGE = _descriptor.Descriptor( - name='PsRequestMessage', - full_name='paddle.PsRequestMessage', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='cmd_id', full_name='paddle.PsRequestMessage.cmd_id', index=0, - number=1, type=13, cpp_type=3, label=2, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='table_id', full_name='paddle.PsRequestMessage.table_id', index=1, - number=2, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, 
containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='params', full_name='paddle.PsRequestMessage.params', index=2,
-      number=3, type=12, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='client_id', full_name='paddle.PsRequestMessage.client_id', index=3,
-      number=4, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='data', full_name='paddle.PsRequestMessage.data', index=4,
-      number=5, type=12, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b(""),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2229,
-  serialized_end=2330,
-)
-
+    name='PsRequestMessage',
+    full_name='paddle.PsRequestMessage',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='cmd_id',
+            full_name='paddle.PsRequestMessage.cmd_id',
+            index=0,
+            number=1,
+            type=13,
+            cpp_type=3,
+            label=2,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='table_id',
+            full_name='paddle.PsRequestMessage.table_id',
+            index=1,
+            number=2,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='params',
+            full_name='paddle.PsRequestMessage.params',
+            index=2,
+            number=3,
+            type=12,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='client_id',
+            full_name='paddle.PsRequestMessage.client_id',
+            index=3,
+            number=4,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='data',
+            full_name='paddle.PsRequestMessage.data',
+            index=4,
+            number=5,
+            type=12,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b(""),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2229,
+    serialized_end=2330, )
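[Editor's note: the PsRequestMessage descriptor above encodes the RPC request schema: `cmd_id` is a required uint32 (`label=2`), `params` is a repeated bytes field (`type=12`, `label=3`), and `data` is an optional bytes payload. A minimal sketch of building such a request, once more assuming the module is importable as `ps_pb2`:

    import ps_pb2

    req = ps_pb2.PsRequestMessage()
    req.cmd_id = ps_pb2.PS_PULL_SPARSE_TABLE  # required field; must be set before serialization
    req.table_id = 0
    req.params.append(b'key-range-0')         # repeated bytes field
    payload = req.SerializeToString()
]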
 _SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
-  name='SparseSGDRuleParameter',
-  full_name='paddle.SparseSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='learning_rate', full_name='paddle.SparseSGDRuleParameter.learning_rate', index=0,
-      number=1, type=1, cpp_type=5, label=1,
-      has_default_value=False, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_g2sum', full_name='paddle.SparseSGDRuleParameter.initial_g2sum', index=1,
-      number=2, type=1, cpp_type=5, label=1,
-      has_default_value=False, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='initial_range', full_name='paddle.SparseSGDRuleParameter.initial_range', index=2,
-      number=3, type=1, cpp_type=5, label=1,
-      has_default_value=True, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='weight_bounds', full_name='paddle.SparseSGDRuleParameter.weight_bounds', index=3,
-      number=4, type=2, cpp_type=6, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2332,
-  serialized_end=2451,
-)
-
+    name='SparseSGDRuleParameter',
+    full_name='paddle.SparseSGDRuleParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='learning_rate',
+            full_name='paddle.SparseSGDRuleParameter.learning_rate',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='initial_g2sum',
+            full_name='paddle.SparseSGDRuleParameter.initial_g2sum',
+            index=1,
+            number=2,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='initial_range',
+            full_name='paddle.SparseSGDRuleParameter.initial_range',
+            index=2,
+            number=3,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=True,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='weight_bounds',
+            full_name='paddle.SparseSGDRuleParameter.weight_bounds',
+            index=3,
+            number=4,
+            type=2,
+            cpp_type=6,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2332,
+    serialized_end=2451, )
 _DENSESGDRULEPARAMETER = _descriptor.Descriptor(
-  name='DenseSGDRuleParameter',
-  full_name='paddle.DenseSGDRuleParameter',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='name', full_name='paddle.DenseSGDRuleParameter.name', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-
message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='adam', full_name='paddle.DenseSGDRuleParameter.adam', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='naive', full_name='paddle.DenseSGDRuleParameter.naive', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='summary', full_name='paddle.DenseSGDRuleParameter.summary', index=3, - number=4, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='moving_average', full_name='paddle.DenseSGDRuleParameter.moving_average', index=4, - number=5, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2454, - serialized_end=2679, -) - + name='DenseSGDRuleParameter', + full_name='paddle.DenseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', + full_name='paddle.DenseSGDRuleParameter.name', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='adam', + full_name='paddle.DenseSGDRuleParameter.adam', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='naive', + full_name='paddle.DenseSGDRuleParameter.naive', + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='summary', + full_name='paddle.DenseSGDRuleParameter.summary', + index=3, + number=4, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='moving_average', + full_name='paddle.DenseSGDRuleParameter.moving_average', + index=4, + number=5, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + 
syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2454, + serialized_end=2679, ) _ADAMSGDPARAMETER = _descriptor.Descriptor( - name='AdamSGDParameter', - full_name='paddle.AdamSGDParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='learning_rate', full_name='paddle.AdamSGDParameter.learning_rate', index=0, - number=1, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='avg_decay_rate', full_name='paddle.AdamSGDParameter.avg_decay_rate', index=1, - number=2, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='ada_decay_rate', full_name='paddle.AdamSGDParameter.ada_decay_rate', index=2, - number=3, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='ada_epsilon', full_name='paddle.AdamSGDParameter.ada_epsilon', index=3, - number=4, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='mom_decay_rate', full_name='paddle.AdamSGDParameter.mom_decay_rate', index=4, - number=5, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2682, - serialized_end=2816, -) - + name='AdamSGDParameter', + full_name='paddle.AdamSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.AdamSGDParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', + full_name='paddle.AdamSGDParameter.avg_decay_rate', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_decay_rate', + full_name='paddle.AdamSGDParameter.ada_decay_rate', + index=2, + number=3, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_epsilon', + full_name='paddle.AdamSGDParameter.ada_epsilon', + index=3, + number=4, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + 
enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mom_decay_rate', + full_name='paddle.AdamSGDParameter.mom_decay_rate', + index=4, + number=5, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2682, + serialized_end=2816, ) _NAIVESGDPARAMETER = _descriptor.Descriptor( - name='NaiveSGDParameter', - full_name='paddle.NaiveSGDParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='learning_rate', full_name='paddle.NaiveSGDParameter.learning_rate', index=0, - number=1, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='avg_decay_rate', full_name='paddle.NaiveSGDParameter.avg_decay_rate', index=1, - number=2, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2818, - serialized_end=2884, -) - + name='NaiveSGDParameter', + full_name='paddle.NaiveSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.NaiveSGDParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', + full_name='paddle.NaiveSGDParameter.avg_decay_rate', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2818, + serialized_end=2884, ) _SUMMARYSGDPARAMETER = _descriptor.Descriptor( - name='SummarySGDParameter', - full_name='paddle.SummarySGDParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='summary_decay_rate', full_name='paddle.SummarySGDParameter.summary_decay_rate', index=0, - number=1, type=1, cpp_type=5, label=1, - has_default_value=True, default_value=float(0.999999), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2886, - serialized_end=2945, -) - + name='SummarySGDParameter', + 
full_name='paddle.SummarySGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='summary_decay_rate', + full_name='paddle.SummarySGDParameter.summary_decay_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=True, + default_value=float(0.999999), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2886, + serialized_end=2945, ) _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( - name='MovingAverageRuleParameter', - full_name='paddle.MovingAverageRuleParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='momentum', full_name='paddle.MovingAverageRuleParameter.momentum', index=0, - number=1, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2947, - serialized_end=2993, -) - + name='MovingAverageRuleParameter', + full_name='paddle.MovingAverageRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='momentum', + full_name='paddle.MovingAverageRuleParameter.momentum', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2947, + serialized_end=2993, ) _PSRESPONSEMESSAGE = _descriptor.Descriptor( - name='PsResponseMessage', - full_name='paddle.PsResponseMessage', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='err_code', full_name='paddle.PsResponseMessage.err_code', index=0, - number=1, type=5, cpp_type=1, label=2, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='err_msg', full_name='paddle.PsResponseMessage.err_msg', index=1, - number=2, type=9, cpp_type=9, label=2, - has_default_value=True, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='data', full_name='paddle.PsResponseMessage.data', index=2, - number=3, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2995, - serialized_end=3068, -) - + name='PsResponseMessage', + full_name='paddle.PsResponseMessage', + 
filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='err_code', + full_name='paddle.PsResponseMessage.err_code', + index=0, + number=1, + type=5, + cpp_type=1, + label=2, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='err_msg', + full_name='paddle.PsResponseMessage.err_msg', + index=1, + number=2, + type=9, + cpp_type=9, + label=2, + has_default_value=True, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', + full_name='paddle.PsResponseMessage.data', + index=2, + number=3, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2995, + serialized_end=3068, ) _FSCLIENTPARAMETER = _descriptor.Descriptor( - name='FsClientParameter', - full_name='paddle.FsClientParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='fs_type', full_name='paddle.FsClientParameter.fs_type', index=0, - number=1, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='uri', full_name='paddle.FsClientParameter.uri', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='user', full_name='paddle.FsClientParameter.user', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='passwd', full_name='paddle.FsClientParameter.passwd', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='buffer_size', full_name='paddle.FsClientParameter.buffer_size', index=4, - number=5, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='hadoop_bin', full_name='paddle.FsClientParameter.hadoop_bin', index=5, - number=51, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='afs_conf', full_name='paddle.FsClientParameter.afs_conf', index=6, - number=101, type=9, cpp_type=9, label=1, - 
has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - _FSCLIENTPARAMETER_FSAPITYPE, - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=3071, - serialized_end=3284, -) + name='FsClientParameter', + full_name='paddle.FsClientParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='fs_type', + full_name='paddle.FsClientParameter.fs_type', + index=0, + number=1, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='uri', + full_name='paddle.FsClientParameter.uri', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='user', + full_name='paddle.FsClientParameter.user', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='passwd', + full_name='paddle.FsClientParameter.passwd', + index=3, + number=4, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='buffer_size', + full_name='paddle.FsClientParameter.buffer_size', + index=4, + number=5, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='hadoop_bin', + full_name='paddle.FsClientParameter.hadoop_bin', + index=5, + number=51, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='afs_conf', + full_name='paddle.FsClientParameter.afs_conf', + index=6, + number=101, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[_FSCLIENTPARAMETER_FSAPITYPE, ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=3071, + serialized_end=3284, ) _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER _PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER -_PSPARAMETER.fields_by_name['trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER +_PSPARAMETER.fields_by_name[ + 'trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER 
_PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER -_WORKERPARAMETER.fields_by_name['downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER -_SERVERPARAMETER.fields_by_name['downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER -_DOWNPOURWORKERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER -_DOWNPOURTRAINERPARAMETER.fields_by_name['dense_table'].message_type = _DENSETABLEPARAMETER -_DOWNPOURTRAINERPARAMETER.fields_by_name['sparse_table'].message_type = _SPARSETABLEPARAMETER -_DOWNPOURSERVERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER -_DOWNPOURSERVERPARAMETER.fields_by_name['service_param'].message_type = _SERVERSERVICEPARAMETER -_TABLEPARAMETER.fields_by_name['accessor'].message_type = _TABLEACCESSORPARAMETER +_WORKERPARAMETER.fields_by_name[ + 'downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER +_SERVERPARAMETER.fields_by_name[ + 'downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER +_DOWNPOURWORKERPARAMETER.fields_by_name[ + 'downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name[ + 'dense_table'].message_type = _DENSETABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name[ + 'sparse_table'].message_type = _SPARSETABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name[ + 'downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name[ + 'service_param'].message_type = _SERVERSERVICEPARAMETER +_TABLEPARAMETER.fields_by_name[ + 'accessor'].message_type = _TABLEACCESSORPARAMETER _TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE -_TABLEACCESSORPARAMETER.fields_by_name['sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER -_TABLEACCESSORPARAMETER.fields_by_name['dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER -_TABLEACCESSORPARAMETER.fields_by_name['downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER -_TABLEACCESSORPARAMETER.fields_by_name['table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER _DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER _DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER -_DENSESGDRULEPARAMETER.fields_by_name['summary'].message_type = _SUMMARYSGDPARAMETER -_DENSESGDRULEPARAMETER.fields_by_name['moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER -_FSCLIENTPARAMETER.fields_by_name['fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE +_DENSESGDRULEPARAMETER.fields_by_name[ + 'summary'].message_type = _SUMMARYSGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name[ + 'moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER +_FSCLIENTPARAMETER.fields_by_name[ + 'fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE _FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER DESCRIPTOR.message_types_by_name['PSParameter'] = _PSPARAMETER DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER 
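# Annotation (not part of the generated module): only formatting changes in
# this diff. The statements above rewire each field's message_type/enum_type
# cross-references; the message_types_by_name entries and the
# GeneratedProtocolMessageType calls that follow register each descriptor and
# build the concrete message classes. A hedged introspection sketch, assuming
# the generated module imports as ps_pb2:
#
#   import ps_pb2
#   desc = ps_pb2.DESCRIPTOR.message_types_by_name['DenseSGDRuleParameter']
#   print([f.name for f in desc.fields])
#   # -> ['name', 'adam', 'naive', 'summary', 'moving_average']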
-DESCRIPTOR.message_types_by_name['DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER -DESCRIPTOR.message_types_by_name['DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER -DESCRIPTOR.message_types_by_name['DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER -DESCRIPTOR.message_types_by_name['ServerServiceParameter'] = _SERVERSERVICEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'ServerServiceParameter'] = _SERVERSERVICEPARAMETER DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER -DESCRIPTOR.message_types_by_name['TableAccessorParameter'] = _TABLEACCESSORPARAMETER -DESCRIPTOR.message_types_by_name['DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER -DESCRIPTOR.message_types_by_name['TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'TableAccessorParameter'] = _TABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name[ + 'TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE -DESCRIPTOR.message_types_by_name['SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER -DESCRIPTOR.message_types_by_name['DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER -DESCRIPTOR.message_types_by_name['MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID -PSParameter = _reflection.GeneratedProtocolMessageType('PSParameter', (_message.Message,), dict( - DESCRIPTOR = _PSPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.PSParameter) - )) +PSParameter = _reflection.GeneratedProtocolMessageType( + 'PSParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_PSPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PSParameter) + )) _sym_db.RegisterMessage(PSParameter) -WorkerParameter = _reflection.GeneratedProtocolMessageType('WorkerParameter', (_message.Message,), dict( - DESCRIPTOR = _WORKERPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.WorkerParameter) - )) +WorkerParameter = _reflection.GeneratedProtocolMessageType( + 'WorkerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_WORKERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.WorkerParameter) + 
)) _sym_db.RegisterMessage(WorkerParameter) -ServerParameter = _reflection.GeneratedProtocolMessageType('ServerParameter', (_message.Message,), dict( - DESCRIPTOR = _SERVERPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.ServerParameter) - )) +ServerParameter = _reflection.GeneratedProtocolMessageType( + 'ServerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SERVERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerParameter) + )) _sym_db.RegisterMessage(ServerParameter) -DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType('DownpourWorkerParameter', (_message.Message,), dict( - DESCRIPTOR = _DOWNPOURWORKERPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter) - )) +DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourWorkerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURWORKERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter) + )) _sym_db.RegisterMessage(DownpourWorkerParameter) -DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType('DownpourTrainerParameter', (_message.Message,), dict( - DESCRIPTOR = _DOWNPOURTRAINERPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter) - )) +DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourTrainerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURTRAINERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter) + )) _sym_db.RegisterMessage(DownpourTrainerParameter) -DenseTableParameter = _reflection.GeneratedProtocolMessageType('DenseTableParameter', (_message.Message,), dict( - DESCRIPTOR = _DENSETABLEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter) - )) +DenseTableParameter = _reflection.GeneratedProtocolMessageType( + 'DenseTableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DENSETABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter) + )) _sym_db.RegisterMessage(DenseTableParameter) -SparseTableParameter = _reflection.GeneratedProtocolMessageType('SparseTableParameter', (_message.Message,), dict( - DESCRIPTOR = _SPARSETABLEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter) - )) +SparseTableParameter = _reflection.GeneratedProtocolMessageType( + 'SparseTableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SPARSETABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter) + )) _sym_db.RegisterMessage(SparseTableParameter) -DownpourServerParameter = _reflection.GeneratedProtocolMessageType('DownpourServerParameter', (_message.Message,), dict( - DESCRIPTOR = _DOWNPOURSERVERPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter) - )) +DownpourServerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourServerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURSERVERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter) + )) _sym_db.RegisterMessage(DownpourServerParameter) -ServerServiceParameter = _reflection.GeneratedProtocolMessageType('ServerServiceParameter', (_message.Message,), dict( - DESCRIPTOR = 
_SERVERSERVICEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter) - )) +ServerServiceParameter = _reflection.GeneratedProtocolMessageType( + 'ServerServiceParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SERVERSERVICEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter) + )) _sym_db.RegisterMessage(ServerServiceParameter) -TableParameter = _reflection.GeneratedProtocolMessageType('TableParameter', (_message.Message,), dict( - DESCRIPTOR = _TABLEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.TableParameter) - )) +TableParameter = _reflection.GeneratedProtocolMessageType( + 'TableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableParameter) + )) _sym_db.RegisterMessage(TableParameter) -TableAccessorParameter = _reflection.GeneratedProtocolMessageType('TableAccessorParameter', (_message.Message,), dict( - DESCRIPTOR = _TABLEACCESSORPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter) - )) +TableAccessorParameter = _reflection.GeneratedProtocolMessageType( + 'TableAccessorParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEACCESSORPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter) + )) _sym_db.RegisterMessage(TableAccessorParameter) -DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType('DownpourTableAccessorParameter', (_message.Message,), dict( - DESCRIPTOR = _DOWNPOURTABLEACCESSORPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter) - )) +DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourTableAccessorParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURTABLEACCESSORPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter) + )) _sym_db.RegisterMessage(DownpourTableAccessorParameter) -TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType('TableAccessorSaveParameter', (_message.Message,), dict( - DESCRIPTOR = _TABLEACCESSORSAVEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter) - )) +TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType( + 'TableAccessorSaveParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEACCESSORSAVEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter) + )) _sym_db.RegisterMessage(TableAccessorSaveParameter) -PsRequestMessage = _reflection.GeneratedProtocolMessageType('PsRequestMessage', (_message.Message,), dict( - DESCRIPTOR = _PSREQUESTMESSAGE, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage) - )) +PsRequestMessage = _reflection.GeneratedProtocolMessageType( + 'PsRequestMessage', + (_message.Message, ), + dict( + DESCRIPTOR=_PSREQUESTMESSAGE, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage) + )) _sym_db.RegisterMessage(PsRequestMessage) -SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseSGDRuleParameter', (_message.Message,), dict( - DESCRIPTOR = _SPARSESGDRULEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter) - )) 
+SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType( + 'SparseSGDRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SPARSESGDRULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter) + )) _sym_db.RegisterMessage(SparseSGDRuleParameter) -DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('DenseSGDRuleParameter', (_message.Message,), dict( - DESCRIPTOR = _DENSESGDRULEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter) - )) +DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType( + 'DenseSGDRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DENSESGDRULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter) + )) _sym_db.RegisterMessage(DenseSGDRuleParameter) -AdamSGDParameter = _reflection.GeneratedProtocolMessageType('AdamSGDParameter', (_message.Message,), dict( - DESCRIPTOR = _ADAMSGDPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter) - )) +AdamSGDParameter = _reflection.GeneratedProtocolMessageType( + 'AdamSGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_ADAMSGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter) + )) _sym_db.RegisterMessage(AdamSGDParameter) -NaiveSGDParameter = _reflection.GeneratedProtocolMessageType('NaiveSGDParameter', (_message.Message,), dict( - DESCRIPTOR = _NAIVESGDPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter) - )) +NaiveSGDParameter = _reflection.GeneratedProtocolMessageType( + 'NaiveSGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_NAIVESGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter) + )) _sym_db.RegisterMessage(NaiveSGDParameter) -SummarySGDParameter = _reflection.GeneratedProtocolMessageType('SummarySGDParameter', (_message.Message,), dict( - DESCRIPTOR = _SUMMARYSGDPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter) - )) +SummarySGDParameter = _reflection.GeneratedProtocolMessageType( + 'SummarySGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SUMMARYSGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter) + )) _sym_db.RegisterMessage(SummarySGDParameter) -MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType('MovingAverageRuleParameter', (_message.Message,), dict( - DESCRIPTOR = _MOVINGAVERAGERULEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter) - )) +MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType( + 'MovingAverageRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_MOVINGAVERAGERULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter) + )) _sym_db.RegisterMessage(MovingAverageRuleParameter) -PsResponseMessage = _reflection.GeneratedProtocolMessageType('PsResponseMessage', (_message.Message,), dict( - DESCRIPTOR = _PSRESPONSEMESSAGE, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage) - )) +PsResponseMessage = _reflection.GeneratedProtocolMessageType( + 'PsResponseMessage', + (_message.Message, ), + dict( + DESCRIPTOR=_PSRESPONSEMESSAGE, + __module__='ps_pb2' + # 
@@protoc_insertion_point(class_scope:paddle.PsResponseMessage) + )) _sym_db.RegisterMessage(PsResponseMessage) -FsClientParameter = _reflection.GeneratedProtocolMessageType('FsClientParameter', (_message.Message,), dict( - DESCRIPTOR = _FSCLIENTPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.FsClientParameter) - )) +FsClientParameter = _reflection.GeneratedProtocolMessageType( + 'FsClientParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_FSCLIENTPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.FsClientParameter) + )) _sym_db.RegisterMessage(FsClientParameter) - DESCRIPTOR.has_options = True -DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\200\001\001')) +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), + _b('\200\001\001')) # @@protoc_insertion_point(module_scope) -- GitLab From 4f304eaa6fcdc5af93a4878a09387f4d7fbd5aed Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 19:46:35 +0800 Subject: [PATCH 0205/2367] fix unittest test=develop --- paddle/fluid/framework/parallel_executor.cc | 14 +++++++++++--- .../unittests/test_parallel_executor_mnist.py | 2 ++ .../test_parallel_executor_transformer.py | 4 ---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 81d1024cb65..2604e41045b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -197,9 +197,17 @@ ParallelExecutor::ParallelExecutor( PADDLE_ENFORCE(places.size() > 1, "If you set build_strategy.reduce with 'Reduce'," "the number of places must be greater than 1."); - PADDLE_ENFORCE(exec_strategy.type_ != ExecutionStrategy::kParallelGraph, - "You should set build_strategy.reduce with 'AllReduce' for " - "the ParallelGraph executor type"); + } + + if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + PADDLE_ENFORCE( + member_->use_all_reduce_, + "build_strategy.reduce should be `AllReduce` if you want to use" + "ParallelGraph executor."); + PADDLE_ENFORCE( + member_->use_cuda_, + "execution_strategy.use_cuda should be True if you want to use" + "ParallelGraph executor."); } // Step 1. Bcast the params to devs. 
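The relocated checks above make the ParallelGraph executor's preconditions
explicit: build_strategy.reduce must stay AllReduce and CUDA must be enabled,
and violating either now fails in the ParallelExecutor constructor rather than
only when the Reduce strategy happened to be combined with it. A minimal
conforming call in the style of these unittests (a sketch only:
check_network_convergence, ExecutorType, and core come from the test base
modules, and `model` is a placeholder):

    # sketch; mirrors the GPU-only paths this patch keeps
    if core.is_compiled_with_cuda():
        self.check_network_convergence(
            model, use_cuda=True, exec_type=ExecutorType.ParallelGraph)

Accordingly, the mnist test below adds a skip for the CPU + ParallelGraph
combination, and the transformer ParallelGraph case is removed outright.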
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 3dddff0d99d..0ff079b4e2c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -166,6 +166,8 @@ class TestMNIST(TestParallelExecutorBase): def check_batchnorm_fc_convergence(self, use_cuda, exec_type): if use_cuda and not core.is_compiled_with_cuda(): return + if not use_cuda and exec_type == ExecutorType.ParallelGraph: + return img, label = self._init_data() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index b5ee72a24e6..8a1a3ab3cae 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -173,10 +173,6 @@ class TestTransformer(TestParallelExecutorBase): def test_main(self): if core.is_compiled_with_cuda(): self.check_network_convergence(transformer, use_cuda=True) - self.check_network_convergence( - transformer, - use_cuda=True, - exec_type=ExecutorType.ParallelGraph) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) -- GitLab From e52bb816e36b5df53c1608f3aada655b21d11ab5 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 20:16:47 +0800 Subject: [PATCH 0206/2367] add copyright to __init__.py in distributed folder --- paddle/fluid/pybind/async_executor_py.cc | 16 ++++++++-------- python/paddle/fluid/distributed/__init__.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index 71a0e256e43..222c128c66f 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -49,13 +49,13 @@ void BindAsyncExecutor(py::module* m) { new framework::AsyncExecutor(scope, place)); })) .def("run_from_files", &framework::AsyncExecutor::RunFromFile) - .def("init_server", &framework::AsyncExecutor::InitServer) - .def("init_worker", &framework::AsyncExecutor::InitWorker) - .def("start_server", &framework::AsyncExecutor::StartServer) - .def("stop_server", &framework::AsyncExecutor::StopServer) - .def("gather_servers", &framework::AsyncExecutor::GatherServers) - .def("init_model", &framework::AsyncExecutor::InitModel) - .def("save_model", &framework::AsyncExecutor::SaveModel); + .def("init_server", &framework::AsyncExecutor::InitServer) + .def("init_worker", &framework::AsyncExecutor::InitWorker) + .def("start_server", &framework::AsyncExecutor::StartServer) + .def("stop_server", &framework::AsyncExecutor::StopServer) + .def("gather_servers", &framework::AsyncExecutor::GatherServers) + .def("init_model", &framework::AsyncExecutor::InitModel) + .def("save_model", &framework::AsyncExecutor::SaveModel); } // end BindAsyncExecutor #else void BindAsyncExecutor(py::module* m) { @@ -64,7 +64,7 @@ void BindAsyncExecutor(py::module* m) { return std::unique_ptr( new framework::AsyncExecutor(scope, place)); })) - .def("run_from_files", &framework::AsyncExecutor::RunFromFile) + .def("run_from_files", &framework::AsyncExecutor::RunFromFile); } // end BindAsyncExecutor #endif } // end namespace pybind diff --git a/python/paddle/fluid/distributed/__init__.py 
b/python/paddle/fluid/distributed/__init__.py index e69de29bb2d..cd609c50407 100644 --- a/python/paddle/fluid/distributed/__init__.py +++ b/python/paddle/fluid/distributed/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and -- GitLab From bf9302f95015db6cadf3e814cfc4f21ef8434a3d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Dec 2018 10:18:22 +0000 Subject: [PATCH 0207/2367] add lstm, peephole refer and test --- paddle/fluid/operators/jit/gen_base.cc | 5 - paddle/fluid/operators/jit/gen_base.h | 4 - paddle/fluid/operators/jit/helper.cc | 20 +++ paddle/fluid/operators/jit/helper.h | 4 +- paddle/fluid/operators/jit/kernel_base.h | 54 ++++++- paddle/fluid/operators/jit/kernel_key.cc | 38 +++++ paddle/fluid/operators/jit/kernel_key.h | 4 + .../fluid/operators/jit/refer/CMakeLists.txt | 2 + paddle/fluid/operators/jit/refer/refer.cc | 3 + paddle/fluid/operators/jit/refer/refer.h | 89 ++++++++++++ paddle/fluid/operators/jit/test.cc | 137 ++++++++++++++++++ paddle/fluid/operators/math/jit_kernel_impl.h | 39 ----- .../fluid/operators/math/jit_kernel_refer.h | 85 ----------- 13 files changed, 346 insertions(+), 138 deletions(-) create mode 100644 paddle/fluid/operators/jit/kernel_key.cc diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index a8bf9029637..310da0c76f1 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -23,11 +23,6 @@ namespace paddle { namespace operators { namespace jit { -template <> -size_t JitCodeKey(int d) { - return d; -} - // refer do not need useme, it would be the last one. void GenBase::dumpCode(const unsigned char* code) const { if (code) { diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 586f4389c04..48855abd267 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -43,10 +43,6 @@ class GenBase : public Kernel { void dumpCode(const unsigned char* code) const; }; -// Every JitCode should have a method to get the key from attribution -template -size_t JitCodeKey(Attr attr); - // Creator is used to creat the jitcode and save in pool. // Every JitCode should have one creator. class GenCreator { diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index c010b64c9cb..d6fa4891e38 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -13,6 +13,7 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/helper.h" +#include // tolower #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -36,6 +37,8 @@ const char* to_string(KernelType kt) { ONE_CASE(vexp); ONE_CASE(vsigmoid); ONE_CASE(vtanh); + ONE_CASE(lstmctht); + ONE_CASE(lstmc1h1); default: PADDLE_THROW("Not support type: %d", kt); return "NOT JITKernel"; @@ -44,6 +47,23 @@ const char* to_string(KernelType kt) { } #undef ONE_CASE +KernelType to_kerneltype(const std::string& act) { + std::string lower = act; + std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); + if (lower == "relu" || lower == "vrelu") { + return vrelu; + } else if (lower == "identity" || lower == "videntity" || lower == "") { + return videntity; + } else if (lower == "exp" || lower == "vexp") { + return vexp; + } else if (lower == "sigmoid" || lower == "vsigmoid") { + return vsigmoid; + } else if (lower == "tanh" || lower == "vtanh") { + return vtanh; + } + return non_kernel; +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 053e5ed0798..302e70caa7e 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -14,9 +14,7 @@ #pragma once -#include // for unique_ptr #include -#include #include #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" @@ -124,6 +122,8 @@ typename KernelTuples::func_type Get(typename KernelTuples::attr_type attr) { const char* to_string(KernelType kt); +KernelType to_kerneltype(const std::string& act); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 29b881b7540..3ab0194ce2b 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -20,8 +20,9 @@ namespace operators { namespace jit { typedef enum { - vmul = 0, - vadd = 1, + non_kernel = 0, + vmul = 1, + vadd = 2, vaddrelu, vsub, vscal, @@ -30,7 +31,9 @@ typedef enum { videntity, vexp, vsigmoid, - vtanh + vtanh, + lstmctht, + lstmc1h1 } KernelType; template @@ -50,6 +53,51 @@ struct XYNTuples { typedef void (*func_type)(const T*, T*, int); }; +typedef struct { + void* gates; // gates: x_ch, x_ih, x_fh, x_oh + const void* ct_1; + void* ct; + void* ht; + /* weight_peephole and checked data are only used in peephole*/ + const void* wp{nullptr}; // W_ic, W_fc, W_oc + void* checked{nullptr}; // size: 2 * d +} lstm_t; + +typedef struct { + void* gates; // gates: {x_update, x_reset; x_state} + const void* ht_1; + void* ht; +} gru_t; + +struct rnn_attr_s { + int d; + KernelType act_gate, act_cand; + rnn_attr_s() = default; + rnn_attr_s(int _d, KernelType _act_gate, KernelType _act_cand) + : d(_d), act_gate(_act_gate), act_cand(_act_cand) {} +}; + +struct lstm_attr_s : public rnn_attr_s { + bool use_peephole; + KernelType act_cell; + lstm_attr_s() = default; + lstm_attr_s(int _d, KernelType _act_gate, KernelType _act_cand, + KernelType _act_cell, bool _use_peephole = false) + : rnn_attr_s(_d, _act_gate, _act_cand), + use_peephole(_use_peephole), + act_cell(_act_cell) {} +}; + +typedef struct rnn_attr_s gru_attr_t; +typedef struct lstm_attr_s lstm_attr_t; + +template +struct LSTMTuples { + typedef T data_type; + typedef lstm_attr_t attr_type; + typedef void (*func_type)(lstm_t*, const lstm_attr_t*); +}; + // Just for adding to kernel pool without template class Kernel { public: 
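The lstm_t/lstm_attr_t pair introduced above fully describes one LSTM step:
for width d, gates holds the four pre-activation blocks
[cand | input | forget | output] (4*d values), wp carries the peephole
weights W_ic, W_fc, W_oc (3*d), and checked is a 2*d scratch buffer; the
last two are touched only when use_peephole is set. The refer kernels
further down compute

    ct = act_cand(cand) .* act_gate(input) + ct_1 .* act_gate(forget)
    ht = act_cell(ct) .* act_gate(output)

with the peephole variant first adding W_ic .* ct_1 and W_fc .* ct_1 to the
input/forget blocks and W_oc .* ct to the output block. A usage sketch that
mirrors the test code below (the CPU place type is an assumption here):

    // step is a jit::lstm_t whose buffers are sized as described above
    jit::lstm_attr_t attr(d, jit::vsigmoid, jit::vtanh, jit::vtanh,
                          /*use_peephole=*/false);
    auto f = jit::Get<jit::lstmctht, jit::LSTMTuples<float>,
                      platform::CPUPlace>(attr);
    f(&step, &attr);  // fills step.ct and step.ht in place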
diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc new file mode 100644 index 00000000000..7a9ae81f89f --- /dev/null +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/kernel_key.h" + +namespace paddle { +namespace operators { +namespace jit { + +template <> +size_t JitCodeKey(const int& d) { + return d; +} + +template <> +size_t JitCodeKey(const lstm_attr_t& attr) { + constexpr int act_type_shift = 3; // suppot 2^3 act types + size_t key = attr.d; + int gate_key = static_cast(attr.act_gate) << 1; + int cand_key = static_cast(attr.act_cand) << (1 + act_type_shift); + int cell_key = static_cast(attr.act_cell) << (1 + act_type_shift * 2); + return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key + + attr.use_peephole; +} +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h index af9df77337d..611a0210d61 100644 --- a/paddle/fluid/operators/jit/kernel_key.h +++ b/paddle/fluid/operators/jit/kernel_key.h @@ -44,6 +44,10 @@ struct KernelKey { bool operator!=(const KernelKey& o) const { return !(*this == o); } }; +// Every JitCode should have a method to get the key from attribution +template +size_t JitCodeKey(const Attr& attr); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index dc07ddb914b..e30923c4fd7 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -18,3 +18,5 @@ USE_JITKERNEL_REFER(videntity) USE_JITKERNEL_REFER(vexp) USE_JITKERNEL_REFER(vsigmoid) USE_JITKERNEL_REFER(vtanh) +USE_JITKERNEL_REFER(lstmctht) +USE_JITKERNEL_REFER(lstmc1h1) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index f716ca89c58..59b3ce52486 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -35,4 +35,7 @@ REGISTER_REFER_KERNEL(vexp, VExp); REGISTER_REFER_KERNEL(vsigmoid, VSigmoid); REGISTER_REFER_KERNEL(vtanh, VTanh); +REGISTER_REFER_KERNEL(lstmctht, LSTMCtHt); +REGISTER_REFER_KERNEL(lstmc1h1, LSTMC1H1); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 7ef60a2d539..a93123df9d8 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -110,6 +110,91 @@ void VTanh(const T* x, T* y, int n) { } } +template +void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT + if (type == vsigmoid) { + return VSigmoid; + } else if (type == vrelu) { + return VRelu; + } else if (type == vtanh) { + return VTanh; + } else if (type == videntity) { + 
return VIdentity; + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + +// compute ct and ht +template +void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + const T* ct_1 = reinterpret_cast(step->ct_1); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + const T* wp = reinterpret_cast(step->wp); + T* checked = reinterpret_cast(step->checked); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + // gates: W_ch, W_ih, W_fh, W_oh + if (attr->use_peephole) { + VMul(wp, ct_1, checked, d); + VMul(wp + d, ct_1, checked + d, d); + VAdd(checked, gates + d, gates + d, d2); + act_gate(gates + d, gates + d, d2); + } else { + act_gate(gates + d, gates + d, d3); + } + + // C_t = C_t-1 * fgated + cand_gated * igated + act_cand(gates, gates, d); + VMul(gates, gates + d, gates + d, d); + VMul(ct_1, gates + d2, gates + d2, d); + VAdd(gates + d, gates + d2, ct, d); + + if (attr->use_peephole) { + // get ogated + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + act_gate(gates + d3, gates + d3, d); + } + // H_t = act_cell(C_t) * ogated + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + +// compute c1 and h1 without c0 or h0 +template +void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + /* C_t = igated * cgated*/ + act_gate(gates + d, gates + d, d); + act_cand(gates, gates, d); + VMul(gates, gates + d, ct, d); + if (attr->use_peephole) { + // get outgated, put W_oc * C_t on igated + const T* wp = reinterpret_cast(step->wp); + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + } + /* H_t = act_cell(C_t) * ogated */ + act_gate(gates + d3, gates + d3, d); + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -134,6 +219,10 @@ DECLARE_REFER_KERNEL(VExp, XYNTuples); DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); DECLARE_REFER_KERNEL(VTanh, XYNTuples); +// lstm_t* , const lstm_attr_t* +DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples); +DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 4c9b853b6e6..03e56416b2f 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -350,6 +350,143 @@ TEST(JITKernel, vtanh) { TestXYNKernel(); } +template +void TestLSTMFunc(const typename KernelTuples::func_type tgt, + const std::vector& xsrc, const std::vector& wp, + const std::vector& ct_1, const std::vector& ct_ref, + const std::vector& ht_ref, + const paddle::operators::jit::lstm_attr_t& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ct_ref.size(), ht_ref.size()); + EXPECT_EQ(ct_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); + EXPECT_EQ(wp.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), 
ct(ct_ref.size()), ht(ht_ref.size()); + std::vector checked(2 * d); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + const T* ct_ref_data = ct_ref.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + T* checked_data = checked.data(); + + paddle::operators::jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (attr.use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + tgt(&step, &attr); + ExpectEQ(ct_data, ct_ref_data, d); + ExpectEQ(ht_data, ht_ref_data, d); +} + +template +void TestLSTMKernel() { + namespace jit = paddle::operators::jit; + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; + for (int d : TestSizes()) { + for (bool use_peephole : {true, false}) { + for (auto& act_gate : all_acts) { + for (auto& act_cand : all_acts) { + for (auto& act_cell : all_acts) { + std::string info = act_gate + act_cand + act_cell + + (use_peephole ? "peephole_" : "") + "size_" + + std::to_string(d); + const jit::lstm_attr_t attr( + d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), + jit::to_kerneltype(act_cell), use_peephole); + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector xsrc(4 * d), wp(3 * d), ct_1(d); + std::vector ct_ref(d), ht_ref(d), checked(2 * d); + RandomVec(4 * d, xsrc.data(), -2.f, 2.f); + RandomVec(3 * d, wp.data(), -2.f, 2.f); + RandomVec(d, ct_1.data(), -2.f, 2.f); + // x could be changed after compute, so copy to save src + std::vector x(xsrc.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + T* x_data = x.data(); + T* checked_data = checked.data(); + T* ct_ref_data = ct_ref.data(); + T* ht_ref_data = ht_ref.data(); + jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_ref_data; + step.ht = ht_ref_data; + if (use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + ref(&step, &attr); + + // test jitcode + auto jitcode = + jit::GetJitCode, PlaceType>(attr); + if (jitcode) { + VLOG(10) << "Test Jitcode Kernel " << info; + TestLSTMFunc>(jitcode, xsrc, wp, ct_1, + ct_ref, ht_ref, attr); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = + dynamic_cast>*>( + impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel " << info; + TestLSTMFunc>(more, xsrc, wp, ct_1, + ct_ref, ht_ref, attr); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(attr); + TestLSTMFunc>(tgt, xsrc, wp, ct_1, ct_ref, + ht_ref, attr); + } + } + } + } + } +} + +TEST(JITKernel, lstmctht) { + namespace jit = paddle::operators::jit; + TestLSTMKernel(); + TestLSTMKernel(); +} + +TEST(JITKernel, lstmc1h1) { + namespace jit = paddle::operators::jit; + TestLSTMKernel(); + TestLSTMKernel(); +} + +// TODO(TJ): refine the tests template + TEST(JITKernel, pool) { // TODO(TJ): add some test } diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h index ba5f20e5338..025343dfad4 100644 --- a/paddle/fluid/operators/math/jit_kernel_impl.h +++ 
b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -28,45 +28,6 @@ namespace jitkernel { #define YMM_FLOAT_BLOCK 8 #define ZMM_FLOAT_BLOCK 16 -typedef struct { - void* gates; // gates: W_ch, W_ih, W_fh, W_oh - const void* ct_1; - void* ct; - void* ht; - /* weight_peephole and checked data are only used in peephole*/ - const void* wp{nullptr}; - void* checked{nullptr}; -} lstm_t; - -typedef struct { - void* gates; // gates: {W_update, W_reset; W_state} - const void* ht_1; - void* ht; -} gru_t; - -struct rnn_attr_s { - int d; - std::string act_gate, act_cand; - rnn_attr_s() = default; - rnn_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand) - : d(_d), act_gate(_act_gate), act_cand(_act_cand) {} -}; - -struct lstm_attr_s : public rnn_attr_s { - bool use_peephole; - std::string act_cell; - lstm_attr_s() = default; - lstm_attr_s(int _d, const std::string& _act_gate, - const std::string& _act_cand, const std::string& _act_cell, - bool _use_peephole = false) - : rnn_attr_s(_d, _act_gate, _act_cand), - use_peephole(_use_peephole), - act_cell(_act_cell) {} -}; - -typedef struct rnn_attr_s gru_attr_t; -typedef struct lstm_attr_s lstm_attr_t; - } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index a03e851de56..122cbcb0d6f 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -24,91 +24,6 @@ namespace math { namespace jitkernel { namespace refer { -template -void (*getActFunc(const std::string& type))(const T*, T*, int) { // NOLINT - if (type == "sigmoid") { - return VSigmoid; - } else if (type == "relu") { - return VRelu; - } else if (type == "tanh") { - return VTanh; - } else if (type == "identity" || type == "") { - return VIdentity; - } - PADDLE_THROW("Not support type: %s", type); - return nullptr; -} - -// compute ct and ht -template -void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - const T* ct_1 = reinterpret_cast(step->ct_1); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - const T* wp = reinterpret_cast(step->wp); - T* checked = reinterpret_cast(step->checked); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - auto act_cell = getActFunc(attr->act_cell); - int d = attr->d; - int d2 = d * 2; - int d3 = d * 3; - // gates: W_ch, W_ih, W_fh, W_oh - if (attr->use_peephole) { - VMul(wp, ct_1, checked, d); - VMul(wp + d, ct_1, checked + d, d); - VAdd(checked, gates + d, gates + d, d2); - act_gate(gates + d, gates + d, d2); - } else { - act_gate(gates + d, gates + d, d3); - } - - // C_t = C_t-1 * fgated + cand_gated * igated - act_cand(gates, gates, d); - VMul(gates, gates + d, gates + d, d); - VMul(ct_1, gates + d2, gates + d2, d); - VAdd(gates + d, gates + d2, ct, d); - - if (attr->use_peephole) { - // get ogated - VMul(wp + d2, ct, gates + d, d); - VAdd(gates + d, gates + d3, gates + d3, d); - act_gate(gates + d3, gates + d3, d); - } - // H_t = act_cell(C_t) * ogated - act_cell(ct, gates + d2, d); - VMul(gates + d2, gates + d3, ht, d); -} - -// compute c1 and h1 without c0 or h0 -template -void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - auto 
act_cell = getActFunc(attr->act_cell); - int d = attr->d; - int d2 = d * 2; - int d3 = d * 3; - /* C_t = igated * cgated*/ - act_gate(gates + d, gates + d, d); - act_cand(gates, gates, d); - VMul(gates, gates + d, ct, d); - if (attr->use_peephole) { - // get outgated, put W_oc * C_t on igated - const T* wp = reinterpret_cast(step->wp); - VMul(wp + d2, ct, gates + d, d); - VAdd(gates + d, gates + d3, gates + d3, d); - } - /* H_t = act_cell(C_t) * ogated */ - act_gate(gates + d3, gates + d3, d); - act_cell(ct, gates + d2, d); - VMul(gates + d2, gates + d3, ht, d); -} - // compute h1 without h0 template void GRUH1(gru_t* step, const gru_attr_t* attr) { -- GitLab From e2130502234f042a6381939f80c640bbebe2e1c6 Mon Sep 17 00:00:00 2001 From: Wang Guibao Date: Thu, 13 Dec 2018 20:49:50 +0800 Subject: [PATCH 0208/2367] Fix multi-threading bug with WItH_MKL=ON (#14882) fixes #14884 --- paddle/fluid/framework/executor_thread_worker.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 3d535116155..5fc5aeb662a 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" namespace paddle { @@ -174,6 +175,8 @@ void print_fetch_var(Scope* scope, std::string var_name) { } void ExecutorThreadWorker::TrainFiles() { + platform::SetNumThreads(1); + // todo: configurable SetDevice(); -- GitLab From 6eec46172560e9b217eab8c33d9b45aaa3ac5f1b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Dec 2018 13:04:29 +0000 Subject: [PATCH 0209/2367] add lstm peephole benchmark --- paddle/fluid/operators/jit/benchmark.cc | 97 +++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index a7e5eb6cf4a..01467e324cc 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -272,6 +272,98 @@ void BenchXYNKernel() { } } +// return this function avg time +template +double BenchLSTMFunc(const typename KernelTuples::func_type tgt, + const paddle::operators::jit::lstm_attr_t* attr, + paddle::operators::jit::lstm_t* step) { + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(step, attr); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(step, attr); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; +} + +template +void BenchLSTMKernel() { + namespace jit = paddle::operators::jit; + for (bool use_peephole : {true, false}) { + for (int d : TestSizes()) { + const jit::lstm_attr_t attr(d, jit::vsigmoid, jit::vtanh, jit::vtanh, + use_peephole); + std::vector> infos; + std::vector x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); + RandomVec(4 * d, x.data(), -2.f, 2.f); + RandomVec(3 * d, wp.data(), -2.f, 2.f); + RandomVec(d, ct_1.data(), -2.f, 2.f); + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + T* x_data = x.data(); + T* checked_data = checked.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (use_peephole) { + 
step.wp = wp_data; + step.checked = checked_data; + } + + // test refer + auto refer = jit::GetRefer>(); + if (refer) { + auto res = BenchLSTMFunc>(refer, &attr, &step); + infos.push_back(std::make_pair("Refer", res)); + } + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(attr); + if (jitcode) { + auto res = BenchLSTMFunc>(jitcode, &attr, &step); + infos.push_back(std::make_pair("JitCode", res)); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast>*>( + impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + auto res = BenchLSTMFunc>(more, &attr, &step); + infos.push_back(std::make_pair("More", res)); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(attr); + if (!tgt) { + LOG(ERROR) << "Target can not be empty!"; + } + auto res = BenchLSTMFunc>(tgt, &attr, &step); + infos.push_back(std::make_pair("Target", res)); + // print + std::ostringstream loginfos; + loginfos << "Kernel Type: " << jit::to_string(KT) + << ", Sigmoid,Tanh,Tanh, " << (use_peephole ? "Peephole_" : "") + << " size " << d << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); + } + } +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: @@ -294,9 +386,14 @@ int main(int argc, char* argv[]) { BenchAXYNKernel(); BenchAXYNKernel(); + // act BenchXYNKernel(); BenchXYNKernel(); BenchXYNKernel(); BenchXYNKernel(); BenchXYNKernel(); + + // lstm and peephole + BenchLSTMKernel(); + BenchLSTMKernel(); } -- GitLab From 3dc29b390537cca68f43f21f44c2c2fde84fa297 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 13 Dec 2018 22:02:55 +0800 Subject: [PATCH 0210/2367] change sparse_update to adam_update --- paddle/fluid/operators/optimizers/adam_op.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index ca5454ef040..25e23c5f9d4 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -202,7 +202,7 @@ struct SparseAdamFunctor { row_count_(row_count), sparse_mode_(sparse_mode) {} - inline HOSTDEVICE void sparse_update(size_t i, T g) const { + inline HOSTDEVICE void adam_update(size_t i, T g) const { // The following code is the same as dense T mom1 = moment1_[i]; T mom2 = moment2_[i]; @@ -228,7 +228,7 @@ struct SparseAdamFunctor { auto row_idx = math::BinarySearch(rows_, row_count_, i / row_numel_); T g = row_idx >= 0 ? 
grad_[row_idx * row_numel_ + i % row_numel_] : 0; - sparse_update(i, g); + adam_update(i, g); } }; @@ -364,7 +364,7 @@ class AdamOpKernel : public framework::OpKernel { for (size_t offset = 0; offset < row_numel; ++offset) { size_t i = rows[row_index] * row_numel + offset; T g = grad_data[row_index * row_numel + offset]; - functor.sparse_update(i, g); + functor.adam_update(i, g); } } } else { -- GitLab From 00d3afbcc9959cae839b134e0c0743a335b9f2a0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Dec 2018 14:12:20 +0000 Subject: [PATCH 0211/2367] add gru refer functions, test and benchmark --- paddle/fluid/operators/jit/README.md | 4 +- paddle/fluid/operators/jit/benchmark.cc | 84 +++++++++++++++ paddle/fluid/operators/jit/helper.cc | 3 + paddle/fluid/operators/jit/kernel_base.h | 12 ++- paddle/fluid/operators/jit/kernel_key.cc | 11 +- .../fluid/operators/jit/refer/CMakeLists.txt | 3 + paddle/fluid/operators/jit/refer/refer.cc | 4 + paddle/fluid/operators/jit/refer/refer.h | 54 +++++++++- paddle/fluid/operators/jit/test.cc | 102 ++++++++++++++++++ .../fluid/operators/math/jit_kernel_refer.h | 49 +-------- 10 files changed, 274 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index 2d72aa4d569..6b2f2b2848e 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -45,4 +45,6 @@ PaddlePaddle/Paddle/paddle/fluid/ - Add `your_key` to `KernelType`. - Implement the Reference logic; a Reference implementation is required for every jitkernel and must not depend on any third-party library. Also add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt`. -- Add a new `KernelTuples` when necessary; `XYZNTuples` can serve as a reference. +- Add a new `KernelTuples` when necessary; `XYZNTuples` can serve as a reference. A newly added Attr type needs a specialization of the `JitCodeKey` method. +- Add unit tests; both float and double need to be tested. +- Add a benchmark to make sure the implementation obtained from Get is the fastest. diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 01467e324cc..ca636b020c2 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -364,6 +364,85 @@ void BenchLSTMKernel() { } } + +// return this function avg time +template +double BenchGRUFunc(const typename KernelTuples::func_type tgt, + const paddle::operators::jit::gru_attr_t* attr, + paddle::operators::jit::gru_t* step) { + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(step, attr); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(step, attr); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; +} + +template +void BenchGRUKernel() { + namespace jit = paddle::operators::jit; + for (int d : TestSizes()) { + const jit::gru_attr_t attr(d, jit::vsigmoid, jit::vtanh); + std::vector> infos; + std::vector x(3 * d), ht_1(d), ht(d); + RandomVec(3 * d, x.data(), -2.f, 2.f); + RandomVec(d, ht_1.data(), -2.f, 2.f); + const T* ht_1_data = ht_1.data(); + T* x_data = x.data(); + T* ht_data = ht.data(); + jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_data; + + // test refer + auto refer = jit::GetRefer>(); + if (refer) { + auto res = BenchGRUFunc>(refer, &attr, &step); + infos.push_back(std::make_pair("Refer", res)); + } + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(attr); + if (jitcode) { + auto res = BenchGRUFunc>(jitcode, &attr, &step); + infos.push_back(std::make_pair("JitCode", res)); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for
(auto& impl : impls) { + auto i = + dynamic_cast>*>(impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + auto res = BenchGRUFunc>(more, &attr, &step); + infos.push_back(std::make_pair("More", res)); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(attr); + if (!tgt) { + LOG(ERROR) << "Target can not be empty!"; + } + auto res = BenchGRUFunc>(tgt, &attr, &step); + infos.push_back(std::make_pair("Target", res)); + // print + std::ostringstream loginfos; + loginfos << "Kernel Type: " << jit::to_string(KT) << ", Sigmoid,Tanh, size " + << d << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); + } +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: @@ -396,4 +475,9 @@ int main(int argc, char* argv[]) { // lstm and peephole BenchLSTMKernel(); BenchLSTMKernel(); + + // gru functions + BenchGRUKernel(); + BenchGRUKernel(); + BenchGRUKernel(); } diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index d6fa4891e38..0543b0743c0 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -39,6 +39,9 @@ const char* to_string(KernelType kt) { ONE_CASE(vtanh); ONE_CASE(lstmctht); ONE_CASE(lstmc1h1); + ONE_CASE(gruh1); + ONE_CASE(gruhtpart1); + ONE_CASE(gruhtpart2); default: PADDLE_THROW("Not support type: %d", kt); return "NOT JITKernel"; diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 3ab0194ce2b..00d583c60bf 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -33,7 +33,10 @@ typedef enum { vsigmoid, vtanh, lstmctht, - lstmc1h1 + lstmc1h1, + gruh1, + gruhtpart1, + gruhtpart2 } KernelType; template @@ -98,6 +101,13 @@ struct LSTMTuples { typedef void (*func_type)(lstm_t*, const lstm_attr_t*); }; +template +struct GRUTuples { + typedef T data_type; + typedef gru_attr_t attr_type; + typedef void (*func_type)(gru_t*, const gru_attr_t*); +}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 7a9ae81f89f..4e6a19f04fd 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -23,9 +23,10 @@ size_t JitCodeKey(const int& d) { return d; } +constexpr int act_type_shift = 3; // suppot 2^3 act types + template <> size_t JitCodeKey(const lstm_attr_t& attr) { - constexpr int act_type_shift = 3; // suppot 2^3 act types size_t key = attr.d; int gate_key = static_cast(attr.act_gate) << 1; int cand_key = static_cast(attr.act_cand) << (1 + act_type_shift); @@ -33,6 +34,14 @@ size_t JitCodeKey(const lstm_attr_t& attr) { return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key + attr.use_peephole; } + +template <> +size_t JitCodeKey(const gru_attr_t& attr) { + size_t key = attr.d; + return (key << (act_type_shift * 2)) + static_cast(attr.act_gate) + + (static_cast(attr.act_cand) << act_type_shift); +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index e30923c4fd7..78d1cb8f9a7 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -20,3 +20,6 @@ 
USE_JITKERNEL_REFER(vsigmoid) USE_JITKERNEL_REFER(vtanh) USE_JITKERNEL_REFER(lstmctht) USE_JITKERNEL_REFER(lstmc1h1) +USE_JITKERNEL_REFER(gruh1) +USE_JITKERNEL_REFER(gruhtpart1) +USE_JITKERNEL_REFER(gruhtpart2) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 59b3ce52486..c99174a66f3 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -38,4 +38,8 @@ REGISTER_REFER_KERNEL(vtanh, VTanh); REGISTER_REFER_KERNEL(lstmctht, LSTMCtHt); REGISTER_REFER_KERNEL(lstmc1h1, LSTMC1H1); +REGISTER_REFER_KERNEL(gruh1, GRUH1); +REGISTER_REFER_KERNEL(gruhtpart1, GRUHtPart1); +REGISTER_REFER_KERNEL(gruhtpart2, GRUHtPart2); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index a93123df9d8..a9a6ffbccd8 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -125,6 +125,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT return nullptr; } +// TODO(TJ): add refer gemm and make LSTM kernels combine as same GRU kernels + // compute ct and ht template void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { @@ -195,6 +197,51 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { VMul(gates + d2, gates + d3, ht, d); } +// compute h1 without h0 +template +void GRUH1(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + int d2 = d * 2; + act_gate(gates, gates, d); + act_cand(gates + d2, gates + d2, d); + VMul(gates, gates + d2, ht, d); +} + +// compute the first part of GRU: ht = act_gate(r) * ht_1 +template +void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { + // W: {W_update, W_reset; W_state} + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + act_gate(gates + attr->d, gates + attr->d, attr->d); + VMul(ht_1, gates + attr->d, ht, attr->d); +} + +// compute the second part of GRU: +// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 +template +void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + T* y = gates + d * 2; + act_gate(gates, gates, d); + act_cand(y, y, d); + // out = zt*ht~ + (1-zt)*ht_1 + for (int i = 0; i < d; ++i) { + ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -219,10 +266,15 @@ DECLARE_REFER_KERNEL(VExp, XYNTuples); DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); DECLARE_REFER_KERNEL(VTanh, XYNTuples); -// lstm_t* , const lstm_attr_t* +// lstm_t*, const lstm_attr_t* DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples); DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples); +// gru_t*, const gru_attr_t* +DECLARE_REFER_KERNEL(GRUH1, GRUTuples); +DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples); +DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 
03e56416b2f..d994a11f97d 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -485,6 +485,108 @@ TEST(JITKernel, lstmc1h1) { TestLSTMKernel(); } +template +void TestGRUFunc(const typename KernelTuples::func_type tgt, + const std::vector& xsrc, const std::vector& ht_1, + const std::vector& ht_ref, + const paddle::operators::jit::gru_attr_t& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ht_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ht(ht_ref.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ht_1_data = ht_1.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ht_data = ht.data(); + paddle::operators::jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_data; + tgt(&step, &attr); + ExpectEQ(ht_data, ht_ref_data, d); +} + +template +void TestGRUKernel() { + namespace jit = paddle::operators::jit; + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; + for (int d : TestSizes()) { + for (auto& act_gate : all_acts) { + for (auto& act_cand : all_acts) { + std::string info = act_gate + act_cand + "size_" + std::to_string(d); + const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate), + jit::to_kerneltype(act_cand)); + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector xsrc(3 * d), ht_1(d), ht_ref(d); + RandomVec(3 * d, xsrc.data(), -2.f, 2.f); + RandomVec(d, ht_1.data(), -2.f, 2.f); + // x could be changed after compute, so copy to save src + std::vector x(xsrc.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ht_1_data = ht_1.data(); + T* x_data = x.data(); + T* ht_ref_data = ht_ref.data(); + jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_ref_data; + ref(&step, &attr); + + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(attr); + if (jitcode) { + VLOG(10) << "Test Jitcode Kernel " << info; + TestGRUFunc>(jitcode, xsrc, ht_1, ht_ref, attr); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast>*>( + impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel " << info; + TestGRUFunc>(more, xsrc, ht_1, ht_ref, attr); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(attr); + TestGRUFunc>(tgt, xsrc, ht_1, ht_ref, attr); + } + } + } +} + +TEST(JITKernel, gruh1) { + namespace jit = paddle::operators::jit; + TestGRUKernel(); + TestGRUKernel(); +} + +TEST(JITKernel, gruhtpart1) { + namespace jit = paddle::operators::jit; + TestGRUKernel(); + TestGRUKernel(); +} + +TEST(JITKernel, gruhtpart2) { + namespace jit = paddle::operators::jit; + TestGRUKernel(); + TestGRUKernel(); +} + // TODO(TJ): refine the tests template TEST(JITKernel, pool) { diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index 122cbcb0d6f..d49fc935dc5 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -22,54 +22,7 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace refer { 
- -// compute h1 without h0 -template -void GRUH1(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - int d = attr->d; - int d2 = d * 2; - act_gate(gates, gates, d); - act_cand(gates + d2, gates + d2, d); - VMul(gates, gates + d2, ht, d); -} - -// compute the first part of GRU: ht = act_gate(r) * ht_1 -template -void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { - // W: {W_update, W_reset; W_state} - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - auto act_gate = getActFunc(attr->act_gate); - act_gate(gates + attr->d, gates + attr->d, attr->d); - VMul(ht_1, gates + attr->d, ht, attr->d); -} - -// compute the second part of GRU: -// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 -template -void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - int d = attr->d; - T* y = gates + d * 2; - act_gate(gates, gates, d); - act_cand(y, y, d); - // out = zt*ht~ + (1-zt)*ht_1 - for (int i = 0; i < d; ++i) { - ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; - } -} - -} // namespace refer +namespace refer {} // namespace refer } // namespace jitkernel } // namespace math } // namespace operators -- GitLab From fac8702269b2e91891ffccdd684be9d5f91ff31c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 13 Dec 2018 22:39:40 +0800 Subject: [PATCH 0212/2367] adam support multithread --- paddle/fluid/framework/operator.cc | 2 ++ paddle/fluid/framework/operator.h | 3 +++ paddle/fluid/operators/optimizers/adam_op.h | 30 ++++++++++++++++++--- python/paddle/fluid/__init__.py | 3 ++- 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 66055e6f1d8..c4ff97948a9 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -30,6 +30,8 @@ DECLARE_bool(benchmark); DEFINE_bool(check_nan_inf, false, "Checking whether operator produce NAN/INF or not. It will be " "extremely slow so please use this flag wisely."); +DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op"); +DEFINE_int32(min_param_size_to_use_multithread, 0, ""); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bce..175f7975a36 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -34,6 +34,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/variant.h" +DECLARE_int32(inner_op_parallelism); +DECLARE_int32(min_param_size_to_use_multithread); + namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 3455d1ee54e..aabb71c556a 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/math/algorithm.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -352,10 +353,31 @@ class AdamOpKernel : public framework::OpKernel { lr.template data(), grad_data, param.template data(), param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, grad_merge.rows().size()); - platform::ForRange for_range( - static_cast(ctx.device_context()), - param.numel()); - for_range(functor); + int inner_op_parallelism = FLAGS_inner_op_parallelism; + if (inner_op_parallelism > 1 && + FLAGS_min_param_size_to_use_multithread > 0 && + param.numel() > FLAGS_min_param_size_to_use_multithread) { + std::vector> fs; + int64_t block_size = param.numel() / inner_op_parallelism; + for (int i = 0; i < inner_op_parallelism; ++i) { + int64_t start = i * block_size; + int64_t end = (i + 1) * block_size; + if (end > param.numel()) { + end = param.numel(); + } + fs.push_back(framework::Async([&functor, start, end]() { + for (int64_t i = start; i < end; ++i) { + functor(i); + } + })); + } + for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); + } else { + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } } else { PADDLE_THROW("Variable type not supported by adam_op"); } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index e0bb0d1152b..1b24e01c22c 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -128,7 +128,8 @@ def __bootstrap__(): 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname' + 'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism', + 'min_param_size_to_use_multithread' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') -- GitLab From 854ee964e81b2907ca15f201c60e941703f7a909 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 22:41:08 +0800 Subject: [PATCH 0213/2367] add doc string for async_executor.py --- python/paddle/fluid/async_executor.py | 150 +++++++++++++++++++------- 1 file changed, 111 insertions(+), 39 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 099805ac1bd..fe2e9b8f12d 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -89,8 +89,14 @@ class AsyncExecutor(object): self.executor = core.AsyncExecutor(scope, p) self.instance = None - - def run(self, program, data_feed, filelist, thread_num, fetch, mode="", debug=False): + def run(self, + program, + data_feed, + filelist, + thread_num, + fetch, + mode="", + debug=False): """ Run program by this AsyncExecutor. Training dataset will be in filelist. Users can also inspect certain variables by naming them in parameter @@ -110,6 +116,7 @@ class AsyncExecutor(object): thread_num(int): number of concurrent training threads. 
See :code:`Note` for how to set this properly fetch(str|list): the var name or a list of var names to inspect + mode(str): run mode of this interface debug(bool): When set to True, fetch vars will be printed to standard output after each minibatch @@ -154,83 +161,148 @@ class AsyncExecutor(object): data_feed.desc(), filelist, thread_num, fetch_var_names, mode, debug) - def download_data(self, afs_path, local_path, fs_default_name, ugi, file_cnt, hadoop_home="$HADOOP_HOME", process_num=12): + def download_data(self, + afs_path, + local_path, + fs_default_name, + ugi, + file_cnt, + hadoop_home="$HADOOP_HOME", + process_num=12): + """ + download_data is a default download method for distributed training + a user download data without this method + + Example: + >>> exe = fluid.AsyncExecutor() + >>> exe.download_data("/xxx/xxx/xx/", + >>> "./data", "afs:// + >>> xxx.xxx.xxx.xxx:9901", "xxx,yyy") + Args: + afs_path(str): afs_path defined by users + local_path(str): download data path + fs_default_name(str): file system server address + ugi(str): hadoop ugi + file_cn(int): a user can specify file number for debugging + hadoop_home(str): hadoop home path + process_num(int): download process num + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') - - configs = { - "fs.default.name": fs_default_name, - "hadoop.job.ugi": ugi - } + raise ValueError('instance is None, please run' + 'config_distributed_nodes init instance') + + configs = {"fs.default.name": fs_default_name, "hadoop.job.ugi": ugi} client = hdfs.HDFSClient(hadoop_home, configs) downloads = hdfs.multi_download( client, - afs_path, - local_path, + afs_path, + local_path, self.instance.get_worker_index(), self.instance.get_node_cnt() / 2, file_cnt, multi_processes=process_num) - #self.instance.barrier_all() #wait for download_data #TODO only barriere worker - self.instance.barrier_worker() #wait for download_data #TODO only barriere worker - - def config_distributed_nodes(self): - self.instance = ps_instance.PaddlePSInstance(1, 2) - return self.instance - - # get total rank - # get rank index - # get iplists - # get hadoop info - pass + self.instance.barrier_worker() #wait for download_data def get_instance(self): + """ + get current node's instance so that user can do operations + in distributed setting + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) + return self.instance + + def config_distributed_nodes(self): + """ + if a user needs to run distributed async executor + he or she needs to do a global configuration so that + information of current process can be obtained + """ + self.instance = ps_instance.PaddlePSInstance(1, 2) return self.instance def stop(self): + """ + at the end of process, users should call stop to servers + and barrier all workers + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') - self.instance.barrier_worker() #worker do all things + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) + self.instance.barrier_worker() #worker do all things if self.instance.is_first_worker(): self.executor.stop_server() - self.instance.barrier_worker() #sync + self.instance.barrier_worker() #sync def init_server(self, dist_desc): + """ + initialize server of current node if current 
process is a server + Args: + dist_desc(str): a protobuf string that describes + how to init a worker and a server + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) self.executor.init_server(dist_desc, self.instance._rankid) ip = self.executor.start_server() self.instance.set_ip(ip) - self.instance.barrier_all() #wait all server start + self.instance.barrier_all() #wait all server start ips = self.instance.gather_ips() self.executor.gather_servers(ips, self.instance.get_node_cnt()) - self.instance.barrier_all() #wait all worker start + self.instance.barrier_all() #wait all worker start def init_worker(self, dist_desc, startup_program): + """ + initialize worker of current node if current process is a worker + Args: + dist_desc(str): a protobuf string that describes + how to init a worker and a server + startup_program(fluid.Program): startup program of current process + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) place = core.CPUPlace() executor = Executor(place) executor.run(startup_program) - self.instance.barrier_all() #wait all server start + self.instance.barrier_all() #wait all server start ips = self.instance.gather_ips() - self.executor.init_worker(dist_desc, ips, self.instance.get_node_cnt(), self.instance._rankid) - self.instance.barrier_all() #wait all worker start + self.executor.init_worker(dist_desc, ips, + self.instance.get_node_cnt(), + self.instance._rankid) + self.instance.barrier_all() #wait all worker start if self.instance.is_first_worker(): self.executor.init_model() - self.instance.barrier_worker() #wait init model - + self.instance.barrier_worker() #wait init model + def init_model(self): + """ + init_model command that can be invoked from one of the worker + model parameters are initialized in servers + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) self.executor.init_model() def save_model(self, save_path): + """ + save_model command that can be invoked from one of the worker + model parameters are saved in servers and upload to save_path of file system + Args: + save_path(str): path to file system + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) self.executor.save_model(save_path) - -- GitLab From 3759600019f206794d5852bbbc74fd959337cf3d Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 23:01:53 +0800 Subject: [PATCH 0214/2367] add doc string for downpour.py and distribute_lookup_table.py --- .../paddle/fluid/distribute_lookup_table.py | 32 ++++++++++++--- python/paddle/fluid/distributed/downpour.py | 41 ++++++++++++++----- 2 files changed, 57 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py index 243d806c41a..74824f68324 100644 --- a/python/paddle/fluid/distribute_lookup_table.py +++ b/python/paddle/fluid/distribute_lookup_table.py @@ -16,31 +16,51 @@ LOOKUP_TABLE_TYPE = "lookup_table" def 
find_distributed_lookup_table_inputs(program, table_name): + """ + Find the input variables of the distributed lookup table in the program. + We only support one distributed table now. + Args: + program(Program): given program, locate distributed lookup table + table_name(str): given table name that is found beforehand + Returns: + inputs + """ local_vars = program.current_block().vars inputs = [] for op in program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: if table_name == op.input("W")[0]: - inputs.extend( - [local_vars[name] for name in op.input("Ids")]) + inputs.extend([local_vars[name] for name in op.input("Ids")]) return inputs + def find_distributed_lookup_table_outputs(program, table_name): + """ + Find the output variables of the distributed lookup table in the program. + We only support one distributed table now. + Args: + program(Program): given program, locate distributed lookup table + table_name(str): given table name that is found beforehand + Returns: + outputs + """ local_vars = program.current_block().vars outputs = [] for op in program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: if table_name == op.input("W")[0]: - outputs.extend( - [local_vars[name] for name in op.output("Out")]) + outputs.extend([local_vars[name] for name in op.output("Out")]) return outputs + def find_distributed_lookup_table(program): """ Find distribute lookup table in program. We only support one distribute table now. - :param program: - :return: table_name or None + Args: + program(Program): given program, locate distributed lookup table + Returns: + table_name or None """ table_name = None diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 9ef9e14ccc5..87dfab92c53 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -20,6 +20,7 @@ from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_i from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs from google.protobuf import text_format + class DownpourSGD(object): """ Distributed optimizer of downpour stochastic gradient descent Args: learning_rate (float): the learning rate used to update parameters. \ Can be a float value Examples: downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2) downpour_sgd.minimize(cost) """ + def __init__(self, learning_rate=0.001, window=1): # todo(guru4elephant): add more optimizers here as argument # todo(guru4elephant): make learning_rate as a variable self.learning_rate_ = learning_rate self.window_ = window self.type = "downpour" - - def minimize(self, loss, startup_program=None, - parameter_list=None, no_grad_set=None): - params_grads = sorted(append_backward( - loss, parameter_list, no_grad_set), key=lambda x:x[0].name) + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + """ + DownpourSGD is a distributed optimizer, so + users can call minimize to generate the backward + operators and optimization operators within the minimize function + Args: + loss(Variable): loss variable defined by user + startup_program(Program): startup program defined by the user + parameter_list(str list): parameter names defined by users + no_grad_set(set): a set of variables defined by the user + so that these variables do not need gradient computation + Returns: + [ps_param, worker_skipped_ops] + ps_param: parameter server protobuf desc + worker_skipped_ops: operator names that need + to be skipped during execution + """ + params_grads = sorted( + append_backward(loss,
parameter_list, no_grad_set), + key=lambda x: x[0].name) table_name = find_distributed_lookup_table(loss.block.program) prefetch_slots = find_distributed_lookup_table_inputs( loss.block.program, table_name) @@ -67,12 +89,12 @@ class DownpourSGD(object): grads.append(i[1]) server.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - server.add_dense_table(dense_table_index, self.learning_rate_, - params, grads) + server.add_dense_table(dense_table_index, self.learning_rate_, params, + grads) worker.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - worker.add_dense_table(dense_table_index, self.learning_rate_, - params, grads) + worker.add_dense_table(dense_table_index, self.learning_rate_, params, + grads) ps_param = pslib.PSParameter() ps_param.server_param.CopyFrom(server.get_desc()) ps_param.trainer_param.CopyFrom(worker.get_desc()) @@ -80,5 +102,4 @@ class DownpourSGD(object): # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] ps_param.trainer_param.skip_op.extend(worker_skipped_ops) - ps_param_str = text_format.MessageToString(ps_param) return [ps_param, worker_skipped_ops] -- GitLab From dc2ff42e20d72a449b200da0522b55a53b28091d Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 13 Dec 2018 23:14:10 +0800 Subject: [PATCH 0215/2367] add math in python examples. test=develop --- python/paddle/fluid/layers/nn.py | 40 +++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 07fc4ccc6bc..4a557ce2471 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2536,16 +2536,28 @@ def adaptive_pool2d(input, ValueError: 'pool_size' should be a list or tuple with length as 2. Examples: - .. code-block:: python + # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], + # output shape is [N, C, m, n], adaptive pool divide H and W dimentions + # of input data into m * n grids averagely and performs poolings in each + # grid to get output. + # adaptive average pool performs calculations as follow: + # + # for i in range(m): + # for j in range(n): + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) + # data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') pool_out = fluid.layers.adaptive_pool2d( input=data, pool_size=[3, 3], - pool_type='max', - require_index=False) + pool_type='avg') """ if pool_type not in ["max", "avg"]: raise ValueError( @@ -2632,16 +2644,32 @@ def adaptive_pool3d(input, ValueError: 'pool_size' should be a list or tuple with length as 2. Examples: - .. code-block:: python + # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions + # of input data into l * m * n grids averagely and performs poolings in each + # grid to get output. 
+ # adaptive average pool performs calculations as follow: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(j * H / m) + # hend = ceil((j + 1) * H / m) + # wstart = floor(k * W / n) + # wend = ceil((k + 1) * W / n) + # output[:, :, i, j, k] = + # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + # data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') pool_out, mask = fluid.layers.adaptive_pool3d( input=data, pool_size=[3, 3], - pool_type='max', - require_index=True) + pool_type='avg') """ if pool_type not in ["max", "avg"]: raise ValueError( -- GitLab From c624417c6f5f1d61ab539aa9c88e95b929a19054 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 14 Dec 2018 11:27:14 +0800 Subject: [PATCH 0216/2367] change sparse mode to lazy mode --- paddle/fluid/operators/optimizers/adam_op.cc | 2 +- paddle/fluid/operators/optimizers/adam_op.h | 12 ++++++------ python/paddle/fluid/optimizer.py | 6 +++--- .../paddle/fluid/tests/unittests/test_adam_op.py | 16 ++++++++-------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index b2c2e5c3254..79932243278 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -111,7 +111,7 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { "Constant for numerical stability") .SetDefault(1.0e-8f); AddAttr( - "sparse_mode", + "lazy_mode", "(bool, default false) " "only update the parameter that has gradient in sparse update") .SetDefault(false); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 25e23c5f9d4..5870557bb7b 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -177,13 +177,13 @@ struct SparseAdamFunctor { const int64_t* rows_; int64_t row_numel_; int64_t row_count_; - bool sparse_mode_; + bool lazy_mode_; SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, T* mom2_out, const T* lr, const T* grad, const T* param, T* param_out, const int64_t* rows, - int64_t row_numel, int64_t row_count, bool sparse_mode) + int64_t row_numel, int64_t row_count, bool lazy_mode) : beta1_(beta1), beta2_(beta2), epsilon_(epsilon), @@ -200,7 +200,7 @@ struct SparseAdamFunctor { rows_(rows), row_numel_(row_numel), row_count_(row_count), - sparse_mode_(sparse_mode) {} + lazy_mode_(lazy_mode) {} inline HOSTDEVICE void adam_update(size_t i, T g) const { // The following code is the same as dense @@ -245,7 +245,7 @@ class AdamOpKernel : public framework::OpKernel { using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; - bool sparse_mode = ctx.Attr("sparse_mode"); + bool lazy_mode = ctx.Attr("lazy_mode"); T beta1 = static_cast(ctx.Attr("beta1")); T beta2 = static_cast(ctx.Attr("beta2")); T epsilon = static_cast(ctx.Attr("epsilon")); @@ -357,8 +357,8 @@ class AdamOpKernel : public framework::OpKernel { mom2_out.template mutable_data(ctx.GetPlace()), lr.template data(), grad_data, param.template data(), param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, - grad_merge.rows().size(), sparse_mode); - if (sparse_mode) { + grad_merge.rows().size(), lazy_mode); + if (lazy_mode) { size_t row_count = grad_merge.rows().size(); for (size_t row_index = 0; row_index < 
row_count; ++row_index) { for (size_t offset = 0; offset < row_numel; ++offset) { diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9c7482bc40d..c53bf4913ad 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -664,7 +664,7 @@ class AdamOptimizer(Optimizer): epsilon=1e-8, regularization=None, name=None, - sparse_mode=False): + lazy_mode=False): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -677,7 +677,7 @@ class AdamOptimizer(Optimizer): self._beta1 = beta1 self._beta2 = beta2 self._epsilon = epsilon - self._sparse_mode = sparse_mode + self._lazy_mode = lazy_mode def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -732,7 +732,7 @@ class AdamOptimizer(Optimizer): "beta1": self._beta1, "beta2": self._beta2, "epsilon": self._epsilon, - "sparse_mode": self._sparse_mode + "lazy_mode": self._lazy_mode }) return adam_op diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index da91875a145..461196689c9 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -195,7 +195,7 @@ def adam_step(inputs, attributes): def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, - sparse_mode): + lazy_mode): ''' Simulate one step of the adam optimizer :param inputs: dict of inputs @@ -231,7 +231,7 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, class TestSparseAdamOp(unittest.TestCase): - def setup(self, scope, place, sparse_mode): + def setup(self, scope, place, lazy_mode): beta1 = 0.78 beta2 = 0.836 epsilon = 1e-4 @@ -265,19 +265,19 @@ class TestSparseAdamOp(unittest.TestCase): param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs, height, rows, row_numel, - np_array, sparse_mode) + np_array, lazy_mode) self.outputs = { "ParamOut": param_out, "Moment1Out": mom1, "Moment2Out": mom2 } - def check_with_place(self, place, sparse_mode): + def check_with_place(self, place, lazy_mode): scope = core.Scope() - self.setup(scope, place, sparse_mode) + self.setup(scope, place, lazy_mode) op_args = dict() - op_args['sparse_mode'] = sparse_mode + op_args['lazy_mode'] = lazy_mode for key, np_array in self.dense_inputs.items(): var = scope.var(key).get_tensor() var.set(np_array, place) @@ -313,8 +313,8 @@ class TestSparseAdamOp(unittest.TestCase): if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: - for sparse_mode in (True, False): - self.check_with_place(place, sparse_mode) + for lazy_mode in (True, False): + self.check_with_place(place, lazy_mode) if __name__ == "__main__": -- GitLab From eb5d427d3940cd53500fc4003c66ad37ef1738db Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 14 Dec 2018 11:37:39 +0800 Subject: [PATCH 0217/2367] add comment for lazy_mode adam optimizer --- python/paddle/fluid/optimizer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index c53bf4913ad..59c22d4e498 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer): beta1 (float): The exponential decay rate for the 1st moment estimates. beta2 (float): The exponential decay rate for the 2nd moment estimates. epsilon (float): a small float value for numerical stability. 
- regularization: A Regularizer, such as - fluid.regularizer.L2DecayRegularizer. + regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. + lazy_mode(bool: false): The official Adam algorithm has two moving-average accumulators, and + the accumulators are updated at every step. Every element of the two moving averages is updated + in both dense mode and sparse mode. If the size of a parameter is very large, then the update + may be very slow. The lazy mode only updates the elements that have gradients in the current + mini-batch, so it is much faster. But this mode has different semantics from the + original Adam algorithm and may lead to different results. Examples: .. code-block:: python -- GitLab From caa6b596775380b568ff934c24d4c641652e8fcc Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Fri, 14 Dec 2018 11:43:23 +0800 Subject: [PATCH 0218/2367] add hdfs_utils & helper & node doc --- .../paddle/fluid/contrib/utils/hdfs_utils.py | 163 +++++++++++++----- python/paddle/fluid/distributed/helper.py | 34 +++- python/paddle/fluid/distributed/node.py | 113 +++++++++--- 3 files changed, 238 insertions(+), 72 deletions(-) diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index 42b4d7feab6..baea57ccce0 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -32,6 +32,28 @@ _logger.setLevel(logging.INFO) class HDFSClient(object): + """ + A tool for HDFS + + Args: + hadoop_home (string): the hadoop home directory + configs (dict): hadoop config, it is a dict, it must contain \ + the keys "fs.default.name" and "hadoop.job.ugi" + Examples: + hadoop_home = "/home/client/hadoop-client/hadoop/" + + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + + client.ls("/user/com/train-25") + files = client.lsr("/user/com/train-25/models") + """ + def __init__(self, hadoop_home, configs): self.pre_commands = [] hadoop_bin = '%s/bin/hadoop' % hadoop_home @@ -55,7 +77,10 @@ class HDFSClient(object): whole_commands = " ".join(whole_commands) for x in range(retry_times + 1): proc = subprocess.Popen( - whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + whole_commands, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) (output, errors) = proc.communicate() ret_code, ret_out, ret_err = proc.returncode, output, errors if ret_code: @@ -69,10 +94,12 @@ class HDFSClient(object): def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5): """ upload the local file to hdfs - args: - local_file_path: the local file path - remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) - return: + Args: + hdfs_path: hdfs path, target path + local_path: local file path, source path + overwrite: will overwrite the original file + retry_times: max retry times for the upload + Returns: True or False """ assert hdfs_path is not None
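# A minimal usage sketch of the transfer methods in this patch, placed here
# between the upload and download hunks for reference. The hadoop_home,
# cluster address, ugi and paths are hypothetical; the signatures follow the
# docstrings added above.
hadoop_home = "/home/client/hadoop-client/hadoop/"
configs = {
    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
    "hadoop.job.ugi": "hello,hello123",
}
client = HDFSClient(hadoop_home, configs)
# push a local file to HDFS, retrying up to 5 times on transient failures
ok = client.upload("/user/com/train-25/data/part-0", "./part-0",
                   overwrite=True, retry_times=5)
if ok:
    # pull it back; overwrite=True removes a stale local copy first
    client.download("/user/com/train-25/data/part-0", "./part-0.bak",
                    overwrite=True)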
@@ -115,10 +142,12 @@ class HDFSClient(object): def download(self, hdfs_path, local_path, overwrite=False, unzip=False): """ download from hdfs - args: - local_file_path: the local file path - remote_file_path: remote dir on hdfs - return: + Args: + hdfs_path: hdfs path, source path + local_path: local file path, target path + overwrite: will remove the original file and overwrite it. + unzip: this param is ignored + Returns: True or False """ _logger.info('Downloading %r to %r.', hdfs_path, local_path) @@ -160,11 +189,11 @@ class HDFSClient(object): def is_exist(self, hdfs_path=None): """ whether the remote hdfs path exists? - args: - remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) + Args: + hdfs_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) fs_name: The default values are the same as in the job configuration fs_ugi: The default values are the same as in the job configuration - return: + Returns: True or False """ exist_cmd = ['-test', '-e', hdfs_path] @@ -183,11 +212,11 @@ class HDFSClient(object): def is_dir(self, hdfs_path=None): """ whether the remote hdfs path exists? - args: + Args: remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) fs_name: The default values are the same as in the job configuration fs_ugi: The default values are the same as in the job configuration - return: + Returns: True or False """ @@ -207,15 +236,17 @@ class HDFSClient(object): return True def delete(self, hdfs_path): - """Remove a file or directory from HDFS. - - :param hdfs_path: HDFS path. - :param recursive: Recursively delete files and directories. By default, - this method will raise an :class:`HdfsError` if trying to delete a - non-empty directory. + """ + Remove a file or directory from HDFS. - This function returns `True` if the deletion was successful and `False` if - no file or directory previously existed at `hdfs_path`. + Args: + :param hdfs_path: HDFS path. + :param recursive: Recursively delete files and directories. By default, + this method will raise an :class:`HdfsError` if trying to delete a + non-empty directory. + Returns: + This function returns `True` if the deletion was successful and `False` if + no file or directory previously existed at `hdfs_path`. """ _logger.info('Deleting %r.', hdfs_path) @@ -241,14 +272,17 @@ class HDFSClient(object): return True def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False): - """Move a file or folder. - - :param hdfs_src_path: Source path. - :param hdfs_dst_path: Destination path. If the path already exists and is - a directory, the source will be moved into it. If the path exists and is - a file, or if a parent destination directory is missing, this method will - raise an :class:`HdfsError`. - + """ + Rename a file or folder. + Args: + :param hdfs_src_path: Source path. + :param hdfs_dst_path: Destination path. If the path already exists and is + a directory, the source will be moved into it. If the path exists and is + a file, or if a parent destination directory is missing, this method will + raise an :class:`HdfsError`. + Returns: + This function returns `True` if the rename was successful and `False` if + the rename failed. """ assert hdfs_src_path is not None assert hdfs_dst_path is not None @@ -274,6 +308,11 @@ class HDFSClient(object): @staticmethod def make_local_dirs(local_path): + """ + create a local directory, same as mkdir + Args: + local_path: the local path where the directory will be created. + """ try: os.makedirs(local_path) except OSError as e:
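# A hedged sketch of how the path predicates and mutators above compose,
# reusing the client object from the earlier example; the paths are made up.
if client.is_exist("/user/com/train-25/output/tmp"):
    client.delete("/user/com/train-25/output/tmp")       # remove stale dir
client.rename("/user/com/train-25/output/run-old",
              "/user/com/train-25/output/run-1")         # archive last run
HDFSClient.make_local_dirs("./local_output")             # local mkdir -p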
@@ -282,9 +321,11 @@ class HDFSClient(object): def makedirs(self, hdfs_path): """Create a remote directory, recursively if necessary. - - :param hdfs_path: Remote path. Intermediate directories will be created - appropriately. + Args: + :param hdfs_path: Remote path. Intermediate directories will be created + appropriately. + Returns: + True if the directories were made successfully, False otherwise. """ _logger.info('Creating directories to %r.', hdfs_path) assert hdfs_path is not None @@ -304,6 +345,13 @@ class HDFSClient(object): return True def ls(self, hdfs_path): + """ + ls the given hdfs_path. + Args: + :param hdfs_path: the hdfs path to list. + Returns: + This function returns a `list` that contains all files in the hdfs_path. + """ assert hdfs_path is not None if not self.is_exist(hdfs_path): @@ -329,6 +377,14 @@ class HDFSClient(object): return ret_lines def lsr(self, hdfs_path, only_file=True, sort=True): + """ + ls the given hdfs_path, sorted by time. + Args: + :param hdfs_path: the hdfs path to list. + Returns: + This function returns a `list` that contains all files sorted by time in the hdfs_path. + """ + def sort_by_time(v1, v2): v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M') v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M') @@ -372,12 +428,15 @@ def multi_upload(client, multi_processes=5, overwrite=False): """ - :param overwrite: will overwrite hdfs file or not - :param multi_processes: the upload data process at the same time, default=5 - :param client: instance of HDFSClient - :param hdfs_path: path on hdfs - :param local_path: path on local - :return: + Upload files to hdfs. + Args: + :param overwrite: will overwrite hdfs file or not + :param multi_processes: the upload data process at the same time, default=5 + :param client: instance of HDFSClient + :param hdfs_path: path on hdfs + :param local_path: path on local + Returns: + """ def __subprocess_upload(datas): @@ -387,6 +446,13 @@ def multi_upload(client, client.upload(hdfs_re_path, data, overwrite, retry_times=5) def get_local_files(path): + """ + Get all local files + Args: + path: local file path + Returns: + A list that contains all files in the path. + """ rlist = [] if not os.path.isdir(path): @@ -431,14 +497,17 @@ def multi_download(client, multi_processes=5): """ multi_download - :param client: instance of HDFSClient - :param hdfs_path: path on hdfs - :param local_path: path on local - :param trainer_id: current trainer id - :param trainers: all trainers number - :param file_cnt: all file number - :param multi_processes: the download data process at the same time, default=5 - :return: None + Args: + :param client: instance of HDFSClient + :param hdfs_path: path on hdfs + :param local_path: path on local + :param trainer_id: current trainer id + :param trainers: all trainers number + :param file_cnt: all file number + :param multi_processes: the download data process at the same time, default=5 + Returns: + A list of the files that were downloaded. """ def __subprocess_download(datas):
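# A sketch of how a distributed job is expected to call multi_download: each
# trainer passes its own rank so the file list is sharded across workers.
# The values below are hypothetical; the signature follows the docstring
# added in this patch, and client is the object from the earlier sketch.
downloads = multi_download(
    client,                      # instance of HDFSClient
    "/user/com/train-25/data",   # hdfs_path (hypothetical)
    "./local_data",              # local_path
    trainer_id=0,                # this trainer's id
    trainers=4,                  # total trainer count
    file_cnt=100,                # total file number to split
    multi_processes=5)           # parallel download processes
# downloads holds the list of files this trainer pulled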
diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py
index 986525e5d85..ca6dd5dabfa 100644
--- a/python/paddle/fluid/distributed/helper.py
+++ b/python/paddle/fluid/distributed/helper.py
@@ -15,13 +15,26 @@
 from mpi4py import MPI
 import ps_pb2 as pslib

+
 class FileSystem(object):
-    def __init__(self, fs_type="afs",
+    """
+    A file system that supports the async_executor hadoop client desc.
+
+    Args:
+        fs_type (string): fs_type, for example is "afs"
+        user (string): hadoop param
+        passwd (string): hadoop param
+        hadoop_bin (string): hadoop param
+    Examples:
+        fs = FileSystem()
+    """
+
+    def __init__(self,
+                 fs_type="afs",
                  uri="afs://tianqi.afs.baidu.com:9902",
                  user=None,
                  passwd=None,
-                 hadoop_bin="",
-                 afs_conf=None):
+                 hadoop_bin=""):
         assert user != None
         assert passwd != None
         assert hadoop_bin != None
@@ -38,9 +51,22 @@ class FileSystem(object):
         #self.fs_client.afs_conf = afs_conf if not afs_conf else ""

     def get_desc(self):
+        """
+        get hadoop desc.
+        """
         return self.fs_client

+
 class MPIHelper(object):
+    """
+    MPIHelper is a wrapper of mpi4py, supporting get_rank, get_size, etc.
+    Args:
+        No params
+    Examples:
+        mh = MPIHelper()
+        mh.get_ip()
+    """
+
     def __init__(self):
         self.comm = MPI.COMM_WORLD

@@ -61,5 +87,3 @@ class MPIHelper(object):

     def finalize(self):
         MPI.Finalize()
-
-
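A minimal sketch of the two helpers documented above; the uri, user and
password values are placeholders, and get_ip() is taken from the docstring
example:

    from paddle.fluid.distributed.helper import FileSystem, MPIHelper

    # build the hadoop client descriptor for the async executor
    fs = FileSystem(fs_type="afs", uri="afs://example:9902",
                    user="demo_user", passwd="demo_passwd",
                    hadoop_bin="/usr/local/hadoop/bin/hadoop")
    desc = fs.get_desc()

    # query the MPI topology of the current process
    mh = MPIHelper()
    print(mh.get_rank(), mh.get_size(), mh.get_ip())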
diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py
index 87553230060..117da9cff82 100644
--- a/python/paddle/fluid/distributed/node.py
+++ b/python/paddle/fluid/distributed/node.py
@@ -13,17 +13,34 @@
 import ps_pb2 as pslib

+
 class Server(object):
+    """
+    A Server basic class.
+    """
+
     def __init__(self):
         pass


 class Worker(object):
+    """
+    A Worker basic class.
+    """
+
     def __init__(self):
         pass


 class DownpourServer(Server):
+    """
+    DownpourServer class is used to generate the server program_desc.
+    Args:
+        server: it is pslib.ServerParameter()
+    Examples:
+        server = DownpourServer()
+    """
+
     def __init__(self):
         self.server_ = pslib.ServerParameter()
         self.server_.downpour_server_param.service_param.start_server_port = 0
@@ -33,8 +50,18 @@ class DownpourServer(Server):
         self.server_.downpour_server_param.service_param.start_server_port = 0
         self.server_.downpour_server_param.service_param.server_thread_num = 12

-    def add_sparse_table(self, table_id, learning_rate,
-                         slot_key_vars, slot_value_var):
+    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
+                         slot_value_var):
+        """
+        Args:
+            table_id(int): id of sparse params table
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            slot_key_vars(string): slot key id
+            slot_value_var(string): slot key value after embedding
+        Returns:
+            return None
+        """
         table = self.server_.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
         table.table_class = "DownpourSparseTable"
@@ -44,10 +71,10 @@ class DownpourServer(Server):
         table.accessor.sparse_sgd_param.initial_g2sum = 3
         table.accessor.sparse_sgd_param.initial_range = 1e-4
         table.accessor.sparse_sgd_param.weight_bounds.extend([-10, 10])
-
+
         table.accessor.embedx_dim = 8
         table.accessor.embedx_threshold = 5
-        table.accessor.fea_dim = 11
+        table.accessor.fea_dim = 11
         #table.accessor.fea_dim = abs(reduce(lambda x, y: x * y,
         #                             slot_value_var[0].shape, 1))
         table.accessor.downpour_accessor_param.nonclk_coeff = 0.1
@@ -58,53 +85,99 @@ class DownpourServer(Server):
         table.accessor.downpour_accessor_param.show_click_decay_rate = 0.999
         table.accessor.downpour_accessor_param.delete_threshold = 0.8

-    def add_dense_table(self, table_id, learning_rate,
-                        param_var, grad_var):
+    def add_dense_table(self, table_id, learning_rate, param_var, grad_var):
+        """
+        Args:
+            table_id(int): id of dense params table
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            param_var(list): all dense params; it is a list.
+            grad_var(list): all dense grads; it is a list.
+        Returns:
+            return None
+        """
         table = self.server_.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
         table.table_class = "DownpourDenseTable"
         table.type = pslib.PS_DENSE_TABLE
         table.accessor.accessor_class = "DownpourDenseValueAccessor"
-        table.accessor.dense_sgd_param.name = "adam"
+        table.accessor.dense_sgd_param.name = "adam"
         table.accessor.dense_sgd_param.adam.learning_rate = learning_rate
-        table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993
-        table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999
+        table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993
+        table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999
         table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8
         table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99
         table.accessor.dense_sgd_param.naive.learning_rate = 0.0002
         fea_dim = 0
-        for param in filter(lambda x: x.name.find("embedding") == -1, param_var):
+        for param in filter(lambda x: x.name.find("embedding") == -1,
+                            param_var):
             fea_dim += reduce(lambda x, y: x * y, param.shape, 1)
         table.accessor.fea_dim = fea_dim

     def get_desc(self):
+        """
+        Return downpour server program_desc
+        """
         return self.server_


 class DownpourWorker(Worker):
+    """
+    DownpourWorker class is used to generate the worker program_desc.
+    Args:
+        window (int): push params frequency
+        worker: it is pslib.DownpourTrainerParameter
+    Examples:
+        worker = DownpourWorker(1)
+    """
+
     def __init__(self, window):
         self.window = window
         self.worker_ = pslib.DownpourTrainerParameter()
         #self.worker_.pull_dense_per_batch = window
         #self.worker_.push_dense_per_batch = window

-    def add_sparse_table(self, table_id, learning_rate,
-                         slot_key_vars, slot_value_vars):
+    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
+                         slot_value_vars):
+        """
+        Args:
+            table_id(int): id of sparse params table
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            slot_key_vars(string): slot key id
+            slot_value_vars(string): slot key value after embedding
+        Returns:
+            return None
+        """
         table = self.worker_.sparse_table.add()
         table.table_id = table_id
-        table.slot_key.extend(
-            [var.name for var in slot_key_vars])
-        table.slot_value.extend(
-            [var.name for var in slot_value_vars])
+        table.slot_key.extend([var.name for var in slot_key_vars])
+        table.slot_value.extend([var.name for var in slot_value_vars])
         table.slot_gradient.extend(
             [var.name + "@GRAD" for var in slot_value_vars])

-    def add_dense_table(self, table_id, learning_rate,
-                        param_vars, grad_vars):
+    def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars):
+        """
+        Args:
+            table_id(int): id of dense params table
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            param_vars(list): all dense params; it is a list.
+            grad_vars(list): all dense grads; it is a list.
+        Returns:
+            return None
+        """
         table = self.worker_.dense_table.add()
         table.table_id = table_id
-        table.dense_variable_name.extend(filter(lambda x: x.find("embedding") == -1, [p.name for p in param_vars]))
-        table.dense_gradient_variable_name.extend(filter(lambda x: x.find("embedding") == -1, [g.name for g in grad_vars]))
+        table.dense_variable_name.extend(
+            filter(lambda x: x.find("embedding") == -1,
+                   [p.name for p in param_vars]))
+        table.dense_gradient_variable_name.extend(
+            filter(lambda x: x.find("embedding") == -1,
+                   [g.name for g in grad_vars]))

     def get_desc(self):
+        """
+        Return downpour worker program_desc
+        """
         return self.worker_
--
GitLab
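A hedged sketch of wiring the two descriptors together, following the
docstrings above; the table ids, learning rates and variable lists are
illustrative assumptions that would come from a real fluid program:

    from paddle.fluid.distributed.node import DownpourServer, DownpourWorker

    def build_descs(slot_key_vars, slot_value_vars, params, grads):
        # server side: one sparse and one dense table
        server = DownpourServer()
        server.add_sparse_table(0, 0.1, slot_key_vars, slot_value_vars)
        server.add_dense_table(1, 0.001, params, grads)

        # worker side mirrors the same table layout
        worker = DownpourWorker(1)
        worker.add_sparse_table(0, 0.1, slot_key_vars, slot_value_vars)
        worker.add_dense_table(1, 0.001, params, grads)
        return server.get_desc(), worker.get_desc()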
From b17444c84c45ae7f04863964099412a8ad9bf8d0 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Fri, 14 Dec 2018 12:33:05 +0800
Subject: [PATCH 0219/2367] Fix merge bug

test=develop
---
 paddle/fluid/operators/psroi_pool_op.cc | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc
index 6978d9c5dc5..78989582b7a 100644
--- a/paddle/fluid/operators/psroi_pool_op.cc
+++ b/paddle/fluid/operators/psroi_pool_op.cc
@@ -129,9 +129,8 @@ class PSROIPoolOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };

@@ -150,9 +149,8 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };

--
GitLab

From 35ce6ac2e6f1d71da55da74e49212fdbb2a61e79 Mon Sep 17 00:00:00 2001
From: heqiaozhi
Date: Fri, 14 Dec 2018 13:18:28 +0800
Subject: [PATCH 0220/2367] add ps_instance doc

---
 .../paddle/fluid/distributed/ps_instance.py  | 58 ++++++++++++++++---
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py
index b93da053a36..6b44d0cd16f 100644
--- a/python/paddle/fluid/distributed/ps_instance.py
+++ b/python/paddle/fluid/distributed/ps_instance.py
@@ -14,27 +14,36 @@
 import helper as dist_helper
 import sys

+
 class PaddlePSInstance(object):
+    """
+    PaddlePSInstance class is used to generate an instance of server or worker.
+    Args:
+        server_worker_mode: is a value 0 or 1, default is 1
+        proc_per_node: process per node, default is 2
+    Examples:
+        instance = PaddlePSInstance(1, 2)
+    """
+
     def __init__(self, server_worker_mode, proc_per_node):
         self.dh = dist_helper.MPIHelper()
         self._rankid = self.dh.get_rank()
         self._server_worker_mode = server_worker_mode
         self._proc_per_node = proc_per_node
         self._nodes = self.dh.get_size()
-
+
         self._ip = 0
         self._worker_num = self._nodes * self._proc_per_node / 2
         self._server_num = self._nodes * self._proc_per_node / 2
         self._total_server_worker = self._worker_num + self._server_num
-        self._node_type = None  #IDLE=-1, WORKER=1, SERVER=0
+        self._node_type = None  #IDLE=-1, WORKER=1, SERVER=0
         self._set_nodetype()
         self._comm = None
         self._split_comm()

-
     def _set_nodetype(self):
         if self._server_worker_mode == 0:
-            if self._rankid < self._server_num:
+            if self._rankid < self._server_num:
                 self._node_type = 1
             elif self._rankid < self._total_server_worker:
                 self._node_type = 0
@@ -46,13 +55,13 @@ class PaddlePSInstance(object):
                     self._node_type = 0
                 else:
                     self._node_type = 1
-            else:
-                self._node_type = -1;
+            else:
+                self._node_type = -1
         else:
             self._node_type = -1
-
+
         #if self._rankid == 0:
-            #print "node type: ", self._node_type
+        #print "node type: ", self._node_type

     def _split_comm(self):
         if self.is_server():
@@ -62,45 +71,78 @@ class PaddlePSInstance(object):
             pass

     def get_worker_index(self):
+        """
+        Return worker index
+        """
         if self._server_worker_mode == 0:
             return self._rankid == self.server_num
         else:
             return self._rankid / self._proc_per_node

     def get_server_index(self):
+        """
+        Return server index
+        """
         if self._server_worker_mode == 0:
             return self.rank_id
         else:
             return self.rank_id / self._proc_per_node

     def is_worker(self):
+        """
+        Return whether this instance is a worker
+        """
         return self._node_type == 1

     def is_server(self):
+        """
+        Return whether this instance is a server
+        """
         return self._node_type == 0

     def is_first_worker(self):
+        """
+        Return whether this instance is the first worker
+        """
         return self.is_worker() and 0 == self.get_worker_index()

     def set_ip(self, ip):
+        """
+        set server ip
+        """
         self._ip = ip

     def gather_ips(self):
+        """
+        Return all servers' and workers' ips through mpi allgather
+        """
         self._ips = self.dh.comm.allgather(self._ip)
         return self._ips

     def get_node_cnt(self):
+        """
+        Return node count
+        """
         return self._nodes

     def barrier_all(self):
+        """
+        barrier workers and servers
+        """
         self.dh.comm.barrier()

     def barrier_worker(self):
+        """
+        barrier workers
+        """
         if self.is_worker():
             self._comm.barrier()
         pass

     def finalize(self):
+        """
+        MPI finalize
+        """
         self.dh.finalize()
         pass
--
GitLab

From f16aa394f61cd759562ecfb5d3553700932d71b1 Mon Sep 17 00:00:00 2001
From: dengkaipeng
Date: Fri, 14 Dec 2018 12:14:37 +0800
Subject: [PATCH 0221/2367] remove use_cudnn in python API.
test=develop --- paddle/fluid/API.spec | 4 ++-- python/paddle/fluid/layers/nn.py | 22 ---------------------- 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 845abe7d5b8..d67363003a7 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -77,8 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) -paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4a557ce2471..28a119906bb 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2506,7 +2506,6 @@ def adaptive_pool2d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=False, name=None): """ ${comment} @@ -2521,7 +2520,6 @@ def adaptive_pool2d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2530,8 +2528,6 @@ def adaptive_pool2d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. - ValueError: 'use_cudnn' is not a bool value. 
-        ValueError: adaptive pool currently not supported in cudnn.
         ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
         ValueError: 'pool_size' should be a list or tuple with length as 2.
@@ -2575,12 +2571,6 @@ def adaptive_pool2d(input,
         raise ValueError(
             "'pool_size' should be a list or tuple with length as 2.")

-    if not isinstance(use_cudnn, bool):
-        raise ValueError("use_cudnn should be True or False.")
-
-    if use_cudnn:
-        raise ValueError("adaptive pool currently not supported in cudnn.")
-
     if pool_type == "max":
         l_type = 'max_pool2d_with_index'
     else:
@@ -2602,7 +2592,6 @@ def adaptive_pool2d(input,
         attrs={
             "pooling_type": pool_type,
             "ksize": pool_size,
-            "use_cudnn": use_cudnn,
             "adaptive": True,
         })

@@ -2614,7 +2603,6 @@ def adaptive_pool3d(input,
                     pool_size,
                     pool_type="max",
                     require_index=False,
-                    use_cudnn=False,
                     name=None):
     """
     ${comment}
@@ -2629,7 +2617,6 @@ def adaptive_pool3d(input,
         pool_type: ${pooling_type_comment}
         require_index (bool): If true, the index of max pooling point along with outputs.
             It cannot be set in average pooling type.
-        use_cudnn (bool, default False): adaptive pool currently not supported in cudnn.
         name (str|None): A name for this layer(optional). If set None, the
             layer will be named automatically.

@@ -2638,8 +2625,6 @@ def adaptive_pool3d(input,

     Raises:
         ValueError: 'pool_type' is not 'max' nor 'avg'.
-        ValueError: 'use_cudnn' is not a bool value.
-        ValueError: adaptive pool currently not supported in cudnn.
         ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
         ValueError: 'pool_size' should be a list or tuple with length as 3.
@@ -2687,12 +2672,6 @@ def adaptive_pool3d(input,
         raise ValueError(
             "'pool_size' should be a list or tuple with length as 3.")

-    if not isinstance(use_cudnn, bool):
-        raise ValueError("use_cudnn should be True or False.")
-
-    if use_cudnn:
-        raise ValueError("adaptive pool currently not supported in cudnn.")
-
     if pool_type == "max":
         l_type = 'max_pool3d_with_index'
     else:
@@ -2714,7 +2693,6 @@ def adaptive_pool3d(input,
         attrs={
             "pooling_type": pool_type,
             "ksize": pool_size,
-            "use_cudnn": use_cudnn,
             "adaptive": True,
         })
--
GitLab
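A minimal usage sketch of the API after this change (the use_cudnn argument
is gone); the input shape and pool size are illustrative:

    import paddle.fluid as fluid

    # declare an NCHW input and pool it down to a fixed 3x3 spatial size
    x = fluid.layers.data(name='x', shape=[3, 32, 32], dtype='float32')
    y = fluid.layers.adaptive_pool2d(input=x, pool_size=[3, 3],
                                     pool_type='avg')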
From da796dfe05eb48c3f5541a43d8fb193153333ca0 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Fri, 14 Dec 2018 13:29:02 +0800
Subject: [PATCH 0222/2367] Remove BinarySearch from Adam Op

test=develop
---
 .../operators/math/selected_rows_functor.cc  |   4 +
 .../operators/math/selected_rows_functor.h   |   7 ++
 paddle/fluid/operators/optimizers/adam_op.h  | 119 +++++++++++++---
 3 files changed, 114 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 3eba268cfa9..0c2e6d40241 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <algorithm>
 #include <set>
 #include <vector>

@@ -301,6 +302,9 @@ struct MergeAdd {
     }
     std::vector<int64_t> merge_rows(merged_row_set.begin(),
                                     merged_row_set.end());
+    if (sorted_result_) {
+      std::sort(merge_rows.begin(), merge_rows.end());
+    }
     std::unordered_map<int64_t, size_t> rows_to_id;
     for (size_t i = 0; i < merge_rows.size(); ++i) {
       rows_to_id[merge_rows[i]] = i;
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
index 6d146d39d6d..b7b19f130e5 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -78,6 +78,10 @@ namespace scatter {
 // functors for manuplating SelectedRows data
 template <typename DeviceContext, typename T>
 struct MergeAdd {
+  MergeAdd() : sorted_result_(false) {}
+
+  explicit MergeAdd(bool sorted_result) : sorted_result_(sorted_result) {}
+
   // unary functor, merge by adding duplicated rows in
   // the input SelectedRows object.
   framework::SelectedRows operator()(const DeviceContext& context,
@@ -88,6 +92,9 @@ struct MergeAdd {
   void operator()(const DeviceContext& context,
                   const std::vector<const framework::SelectedRows*>& inputs,
                   framework::SelectedRows* output);
+
+ private:
+  bool sorted_result_;
 };

 enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 3455d1ee54e..c2bf7040d77 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -158,7 +158,7 @@ struct AdamFunctor {
 };

 template <typename T>
-struct SparseAdamFunctor {
+struct SparseAdamFunctor<T, GPUAdam> {
   T beta1_;
   T beta2_;
   T epsilon_;
@@ -227,6 +227,78 @@ struct SparseAdamFunctor {
   }
 };

+template <typename T>
+struct SparseAdamFunctor<T, CPUAdam> {
+  T beta1_;
+  T beta2_;
+  T epsilon_;
+
+  const T* beta1_pow_;
+  const T* beta2_pow_;
+  const T* moment1_;
+  T* moment1_out_;
+  const T* moment2_;
+  T* moment2_out_;
+  const T* lr_;
+  const T* grad_;
+  const T* param_;
+  T* param_out_;
+
+  const int64_t* rows_;
+  int64_t row_numel_;
+  int64_t row_count_;
+
+  SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
+                    const T* beta2_pow, const T* mom1, T* mom1_out,
+                    const T* mom2, T* mom2_out, const T* lr, const T* grad,
+                    const T* param, T* param_out, const int64_t* rows,
+                    int64_t row_numel, int64_t row_count)
+      : beta1_(beta1),
+        beta2_(beta2),
+        epsilon_(epsilon),
+        beta1_pow_(beta1_pow),
+        beta2_pow_(beta2_pow),
+        moment1_(mom1),
+        moment1_out_(mom1_out),
+        moment2_(mom2),
+        moment2_out_(mom2_out),
+        lr_(lr),
+        grad_(grad),
+        param_(param),
+        param_out_(param_out),
+        rows_(rows),
+        row_numel_(row_numel),
+        row_count_(row_count) {}
+
+  inline void operator()(size_t numel) const {
+    // lr could be reuse
+    T lr = *lr_;
+    T beta1_pow = *beta1_pow_;
+    T beta2_pow = *beta2_pow_;
+    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+    for (size_t i = 0U, j = 0U; i != numel; ++i) {
+      T mom1 = moment1_[i];
+      T mom2 = moment2_[i];
+      T p = param_[i];
+
+      // Calculation: rows_ is sorted, so walk it in step with the
+      // parameter rows; element i belongs to row i / row_numel_
+      if (j < static_cast<size_t>(row_count_) &&
+          i / static_cast<size_t>(row_numel_) ==
+              static_cast<size_t>(rows_[j])) {
+        T g = grad_[j * row_numel_ + i % row_numel_];
+        mom1 = beta1_ * mom1 + (1 - beta1_) * g;
+        mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
+        if (i % static_cast<size_t>(row_numel_) ==
+            static_cast<size_t>(row_numel_) - 1) {
+          ++j;
+        }
+      } else {
+        mom1 = beta1_ * mom1;
+        mom2 = beta2_ * mom2;
+      }
+      p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+      // Write back to global memory
+      moment1_out_[i] = mom1;
+      moment2_out_[i] = mom2;
+      param_out_[i] = p;
+    }
+  }
+};
+
 template <typename DeviceContext, typename T>
 class AdamOpKernel : public framework::OpKernel<T> {
  public:
@@ -316,7 +388,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
     } else {
       // merge duplicated rows if any.
       // The rows of grad_merge have been sorted inside MergeAdd functor
-      scatter::MergeAdd<DeviceContext, T> merge_func;
+      scatter::MergeAdd<DeviceContext, T> merge_func(true);
       auto* grad_merge_var = const_cast<framework::Scope&>(ctx.scope())
                                  .Var()
                                  ->GetMutable<framework::SelectedRows>();
@@ -337,25 +409,40 @@ class AdamOpKernel : public framework::OpKernel<T> {
       } else {
 #endif
         rows = grad_merge.rows().data();
-
 #if defined(PADDLE_WITH_CUDA)
       }
 #endif
       auto row_numel = grad_tensor.numel() / grad_merge.rows().size();

-      SparseAdamFunctor<T> functor(
-          beta1, beta2, epsilon, beta1_pow.template data<T>(),
-          beta2_pow.template data<T>(), mom1.template data<T>(),
-          mom1_out.template mutable_data<T>(ctx.GetPlace()),
-          mom2.template data<T>(),
-          mom2_out.template mutable_data<T>(ctx.GetPlace()),
-          lr.template data<T>(), grad_data, param.template data<T>(),
-          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
-          grad_merge.rows().size());
-      platform::ForRange<DeviceContext> for_range(
-          static_cast<const DeviceContext&>(ctx.device_context()),
-          param.numel());
-      for_range(functor);
+      if (platform::is_cpu_place(ctx.GetPlace())) {
+        SparseAdamFunctor<T, CPUAdam> functor(
+            beta1, beta2, epsilon, beta1_pow.template data<T>(),
+            beta2_pow.template data<T>(), mom1.template data<T>(),
+            mom1_out.template mutable_data<T>(ctx.GetPlace()),
+            mom2.template data<T>(),
+            mom2_out.template mutable_data<T>(ctx.GetPlace()),
+            lr.template data<T>(), grad_data, param.template data<T>(),
+            param_out.template mutable_data<T>(ctx.GetPlace()), rows,
+            row_numel, grad_merge.rows().size());
+
+        functor(param.numel());
+      } else if (platform::is_gpu_place(ctx.GetPlace())) {
+        SparseAdamFunctor<T, GPUAdam> functor(
+            beta1, beta2, epsilon, beta1_pow.template data<T>(),
+            beta2_pow.template data<T>(), mom1.template data<T>(),
+            mom1_out.template mutable_data<T>(ctx.GetPlace()),
+            mom2.template data<T>(),
+            mom2_out.template mutable_data<T>(ctx.GetPlace()),
+            lr.template data<T>(), grad_data, param.template data<T>(),
+            param_out.template mutable_data<T>(ctx.GetPlace()), rows,
+            row_numel, grad_merge.rows().size());
+
+        // FIXME(minqiyang): remove BinarySearch in GPU later
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(ctx.device_context()),
+            param.numel());
+        for_range(functor);
+      }
     } else {
       PADDLE_THROW("Variable type not supported by adam_op");
     }
--
GitLab
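For reference, a NumPy sketch of the row-wise sparse Adam step the new CPU
functor performs in one pass over the sorted gradient rows. Shapes and
hyper-parameters are illustrative, and this mirrors only the update math,
not the kernel plumbing:

    import numpy as np

    def sparse_adam_step(param, mom1, mom2, rows, grad, lr,
                         beta1=0.9, beta2=0.999, eps=1e-8,
                         beta1_pow=0.9, beta2_pow=0.999):
        lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
        g = np.zeros_like(param)
        g[rows] = grad                        # scatter merged gradient rows
        mom1 = beta1 * mom1 + (1 - beta1) * g   # rows without grad only decay
        mom2 = beta2 * mom2 + (1 - beta2) * g * g
        param = param - lr_t * mom1 / (np.sqrt(mom2) + eps)
        return param, mom1, mom2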
From bafd823666bde1098cf07eb23d406bc9780c7b28 Mon Sep 17 00:00:00 2001
From: heqiaozhi
Date: Fri, 14 Dec 2018 13:18:28 +0800
Subject: [PATCH 0223/2367] test

---
 .../paddle/fluid/distributed/ps_instance.py  | 58 ++++++++++++++++---
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py
index b93da053a36..6b44d0cd16f 100644
--- a/python/paddle/fluid/distributed/ps_instance.py
+++ b/python/paddle/fluid/distributed/ps_instance.py
@@ -14,27 +14,36 @@
 import helper as dist_helper
 import sys

+
 class PaddlePSInstance(object):
+    """
+    PaddlePSInstance class is used to generate an instance of server or worker.
+    Args:
+        server_worker_mode: is a value 0 or 1, default is 1
+        proc_per_node: process per node, default is 2
+    Examples:
+        instance = PaddlePSInstance(1, 2)
+    """
+
     def __init__(self, server_worker_mode, proc_per_node):
         self.dh = dist_helper.MPIHelper()
         self._rankid = self.dh.get_rank()
         self._server_worker_mode = server_worker_mode
         self._proc_per_node = proc_per_node
         self._nodes = self.dh.get_size()
-
+
         self._ip = 0
         self._worker_num = self._nodes * self._proc_per_node / 2
         self._server_num = self._nodes * self._proc_per_node / 2
         self._total_server_worker = self._worker_num + self._server_num
-        self._node_type = None  #IDLE=-1, WORKER=1, SERVER=0
+        self._node_type = None  #IDLE=-1, WORKER=1, SERVER=0
         self._set_nodetype()
         self._comm = None
         self._split_comm()

-
     def _set_nodetype(self):
         if self._server_worker_mode == 0:
-            if self._rankid < self._server_num:
+            if self._rankid < self._server_num:
                 self._node_type = 1
             elif self._rankid < self._total_server_worker:
                 self._node_type = 0
@@ -46,13 +55,13 @@ class PaddlePSInstance(object):
                     self._node_type = 0
                 else:
                     self._node_type = 1
-            else:
-                self._node_type = -1;
+            else:
+                self._node_type = -1
         else:
             self._node_type = -1
-
+
         #if self._rankid == 0:
-            #print "node type: ", self._node_type
+        #print "node type: ", self._node_type

     def _split_comm(self):
         if self.is_server():
@@ -62,45 +71,78 @@ class PaddlePSInstance(object):
             pass

     def get_worker_index(self):
+        """
+        Return worker index
+        """
         if self._server_worker_mode == 0:
             return self._rankid == self.server_num
         else:
             return self._rankid / self._proc_per_node

     def get_server_index(self):
+        """
+        Return server index
+        """
         if self._server_worker_mode == 0:
             return self.rank_id
         else:
             return self.rank_id / self._proc_per_node

     def is_worker(self):
+        """
+        Return whether this instance is a worker
+        """
         return self._node_type == 1

     def is_server(self):
+        """
+        Return whether this instance is a server
+        """
         return self._node_type == 0

     def is_first_worker(self):
+        """
+        Return whether this instance is the first worker
+        """
         return self.is_worker() and 0 == self.get_worker_index()

     def set_ip(self, ip):
+        """
+        set server ip
+        """
         self._ip = ip

     def gather_ips(self):
+        """
+        Return all servers' and workers' ips through mpi allgather
+        """
         self._ips = self.dh.comm.allgather(self._ip)
         return self._ips

     def get_node_cnt(self):
+        """
+        Return node count
+        """
         return self._nodes

     def barrier_all(self):
+        """
+        barrier workers and servers
+        """
         self.dh.comm.barrier()

     def barrier_worker(self):
+        """
+        barrier workers
+        """
         if self.is_worker():
             self._comm.barrier()
         pass

     def finalize(self):
+        """
+        MPI finalize
+        """
         self.dh.finalize()
         pass
--
GitLab
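A role-dispatch sketch following the docstrings above; the run_server and
run_trainer callables are assumed user-side helpers, not part of this patch:

    from paddle.fluid.distributed.ps_instance import PaddlePSInstance

    instance = PaddlePSInstance(1, 2)   # server_worker_mode=1, 2 procs/node
    if instance.is_server():
        run_server(instance)            # assumed helper
    elif instance.is_worker():
        instance.barrier_all()          # wait until servers are up
        run_trainer(instance)           # assumed helper
    instance.finalize()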
From bd1c1724aabc2b1d2d30ae6ac159df297b6c7f54 Mon Sep 17 00:00:00 2001
From: heqiaozhi
Date: Fri, 14 Dec 2018 13:18:28 +0800
Subject: [PATCH 0224/2367] add ps_instance doc

---
 paddle/fluid/framework/CMakeLists.txt         |  15 +-
 paddle/fluid/framework/async_executor.cc      | 114 ++++++------
 .../fluid/framework/executor_thread_worker.cc | 165 +++++++++---------
 .../fluid/framework/executor_thread_worker.h  | 124 ++++++-------
 4 files changed, 199 insertions(+), 219 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index f3d66cd8830..ab237f768a9 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,6 +1,6 @@

-# windows treat symbolic file as a real file, which is different with unix
-# We create a hidden file and compile it instead of origin source file.
+#windows treat symbolic file as a real file, which is different with unix
+#We create a hidden file and compile it instead of origin source file.
function(windows_symbolic TARGET) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) @@ -11,7 +11,7 @@ function(windows_symbolic TARGET) message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") endif() - # only copy the xx.cu to .xx.cu when the content are modified +#only copy the xx.cu to.xx.cu when the content are modified set(copy_flag 1) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) @@ -32,7 +32,7 @@ endfunction() add_subdirectory(ir) add_subdirectory(details) -# ddim lib +#ddim lib proto_library(framework_proto SRCS framework.proto) proto_library(async_executor_proto SRCS data_feed.proto) @@ -89,8 +89,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu if(WITH_GPU) if (WIN32) - # windows treat symbolic file as a real file, which is different with unix - # We create a hidden file and compile it instead of origin source file. +#windows treat symbolic file as a real file, which is different with unix +#We create a hidden file and compile it instead of origin source file. windows_symbolic(hidden_file SRCS data_type_transform.cu) nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) add_dependencies(data_type_transform hidden_file) @@ -137,7 +137,8 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) -# Generate an empty __init__.py to make framework_py_proto as a valid python module. +#Generate an empty \ + __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) if (NOT WIN32) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 0fe7f3bd5c0..e2756cafa20 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -30,7 +30,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" #ifdef PADDLE_WITH_PSLIB -#include "pslib.h" +#include #endif namespace paddle { @@ -70,50 +70,52 @@ void PrepareReaders(std::vector>& readers, // NOLINT #ifdef PADDLE_WITH_PSLIB void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { - _pslib_ptr = - std::shared_ptr( - new paddle::distributed::PSlib()); - _pslib_ptr->init_server(dist_desc, index); - InitParamConfig(); + _pslib_ptr = std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_server(dist_desc, index); + InitParamConfig(); } void AsyncExecutor::InitWorker(const std::string& dist_desc, const std::vector& host_sign_list, int node_num, int index) { - _pslib_ptr = std::shared_ptr( - new paddle::distributed::PSlib()); - _pslib_ptr->init_worker( - dist_desc, (uint64_t*)(host_sign_list.data()), node_num, index); + _pslib_ptr = std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_worker(dist_desc, + static_cast(host_sign_list.data()), + node_num, index); - InitParamConfig(); + InitParamConfig(); } -uint64_t AsyncExecutor::StartServer() { - return _pslib_ptr->run_server(); -} +uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); } -void AsyncExecutor::StopServer() { - _pslib_ptr->stop_server(); -} +void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); } -void AsyncExecutor::GatherServers( - const std::vector& host_sign_list, int node_num) { - _pslib_ptr->gather_servers((uint64_t*)(host_sign_list.data()), node_num); +void AsyncExecutor::GatherServers(const std::vector& host_sign_list, + int node_num) { + _pslib_ptr->gather_servers(static_cast(host_sign_list.data()), + node_num); } void AsyncExecutor::InitParamConfig() { - for (int i = 0; i < - _pslib_ptr->get_param()->server_param(). \ - downpour_server_param(). \ - downpour_table_param_size(); + for (int i = 0; i < _pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param_size(); ++i) { - if (_pslib_ptr->get_param()->server_param(). \ - downpour_server_param().downpour_table_param(i). \ - table_class().find("SparseTable") != -1) { - _param_config.fea_dim = _pslib_ptr->get_param()->server_param(). \ - downpour_server_param(). \ - downpour_table_param(i). 
\ - accessor().fea_dim(); + if (_pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param(i) + .table_class() + .find("SparseTable") != -1) { + _param_config.fea_dim = _pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param(i) + .accessor() + .fea_dim(); break; } } @@ -122,28 +124,24 @@ void AsyncExecutor::InitParamConfig() { _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); _param_config.tmp_push_sparse_wait_times = static_cast( _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); - - for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); + + for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); ++t) { _param_config.skip_op.push_back( _pslib_ptr->get_param()->trainer_param().skip_op(t)); } - + for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); - ++t) { + t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) { auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); std::vector tmp_sparse_variable_name; for (int i = 0u; i < table.slot_value_size(); ++i) { tmp_sparse_variable_name.push_back(table.slot_value(i)); - _param_config.slot_alias_to_table[table.slot_key(i)] = - table.table_id(); + _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id(); } std::vector tmp_sparse_gradient_variable_name; for (auto i = 0u; i < table.slot_gradient_size(); ++i) { - tmp_sparse_gradient_variable_name.push_back( - table.slot_gradient(i)); + tmp_sparse_gradient_variable_name.push_back(table.slot_gradient(i)); } _param_config.slot_input_vec[table.table_id()] = std::move(tmp_sparse_variable_name); @@ -151,10 +149,9 @@ void AsyncExecutor::InitParamConfig() { std::move(tmp_sparse_gradient_variable_name); _param_config.sparse_table_id.push_back(table.table_id()); } - + for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); - ++t) { + t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) { auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); std::vector tmp_dense_variable_name; for (int i = 0u; i < table.dense_variable_name_size(); ++i) { @@ -181,26 +178,25 @@ void AsyncExecutor::InitModel() { Variable* var = root_scope_->FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; LoDTensor* tensor = var->GetMutable(); - + float* g = tensor->data(); CHECK(g != nullptr) << "var[" << t << "] value not initialized"; float init_range = 0.2; int rown = tensor->dims()[0]; init_range /= sqrt(rown); - + std::normal_distribution ndistr(0.0, 1.0); for (auto i = 0u; i < tensor->numel(); ++i) { g[i] = ndistr(local_random_engine()) * init_range; } - + paddle::ps::Region reg(g, tensor->numel()); regions.emplace_back(std::move(reg)); } - - auto push_status = - _pslib_ptr->_worker_ptr->push_dense_param( - regions.data(), regions.size(), table_id); + + auto push_status = _pslib_ptr->_worker_ptr->push_dense_param( + regions.data(), regions.size(), table_id); push_status.wait(); auto status = push_status.get(); if (status != 0) { @@ -225,14 +221,14 @@ void AsyncExecutor::SaveModel(const std::string& path) { void AsyncExecutor::PrepareDenseThread(const std::string& mode) { if (mode == "mpi") { DensePullThreadParam param; - param.ps_client = _pslib_ptr->_worker_ptr;; + param.ps_client = _pslib_ptr->_worker_ptr; param.threshold = 1; param.training_thread_num = actual_thread_num; param.root_scope = root_scope_; param.dense_params = 
&_param_config.dense_variable_name; - - _pull_dense_thread = std::shared_ptr( - new DensePullThread(param)); + + _pull_dense_thread = + std::shared_ptr(new DensePullThread(param)); _pull_dense_thread->start(); } } @@ -243,8 +239,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::vector& filelist, const int thread_num, const std::vector& fetch_var_names, - const std::string& mode, - const bool debug) { + const std::string& mode, const bool debug) { std::vector threads; auto& block = main_program.Block(0); @@ -293,9 +288,9 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& worker : workers) { #ifdef PADDLE_WITH_PSLIB if (mode == "mpi") { - worker.reset(new AsyncExecutorThreadWorker); + worker.reset(new AsyncExecutorThreadWorker); } else { - worker.reset(new ExecutorThreadWorker); + worker.reset(new ExecutorThreadWorker); } #else worker.reset(new ExecutorThreadWorker); @@ -308,7 +303,6 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, fetch_var_names, root_scope_, thidx, debug); } - // start executing ops in multiple threads for (int thidx = 0; thidx < actual_thread_num; ++thidx) { threads.push_back( diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 59679842bc1..a9455629260 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/executor_thread_worker.h" +#include #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" @@ -51,7 +52,7 @@ void DensePullThread::run() { if (_pull_dense_status.size() != 0) { wait_all(); } - + usleep(_sleep_time_ms * 1000); } } @@ -77,12 +78,12 @@ std::future DensePullThread::pull_dense(uint64_t table_id) { regions.clear(); auto& variables = _dense_variable_name[table_id]; regions.resize(variables.size()); - + for (auto i = 0u; i < variables.size(); ++i) { auto& t = variables[i]; Variable* var = _root_scope->FindVar(t); LoDTensor* tensor = var->GetMutable(); - + float* w = tensor->data(); paddle::ps::Region reg(w, tensor->numel()); regions[i] = std::move(reg); @@ -95,21 +96,20 @@ void DensePullThread::wait_all() { t.wait(); auto status = t.get(); if (status != 0) { - LOG(WARNING) << "pull dense failed times:" << - ++_pull_dense_fail_times; + LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times; } } - + if (_pull_dense_fail_times > 20) { LOG(FATAL) << "pull dense failed times more than 20 times"; exit(-1); } - + _pull_dense_status.resize(0); } -void DensePullThread::increase_thread_version( - int thread_id, uint64_t table_id) { +void DensePullThread::increase_thread_version(int thread_id, + uint64_t table_id) { std::lock_guard lock(_mutex_for_version); _training_versions[table_id][thread_id]++; } @@ -174,7 +174,6 @@ void ExecutorThreadWorker::SetFetchVarNames( fetch_var_names.end()); } - void ExecutorThreadWorker::SetDevice() { #if defined _WIN32 || defined __APPLE__ return; @@ -344,15 +343,14 @@ void AsyncExecutorThreadWorker::SetPullDenseThread( } void AsyncExecutorThreadWorker::TrainOneNetwork() { PrepareParams(); - + for (auto& op : ops_) { if (op->Type().find("sgd") != std::string::npos) { continue; } bool need_skip = false; for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { - if 
(op->Type().find(_param_config->skip_op[t]) != - std::string::npos) { + if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) { need_skip = true; break; } @@ -436,14 +434,13 @@ void AsyncExecutorThreadWorker::PushDense(int table_id) { paddle::ps::Region reg(g, count); regions.emplace_back(std::move(reg)); } - - auto status = _pslib_ptr->_worker_ptr->push_dense( - regions.data(), regions.size(), table_id); + + auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(), + regions.size(), table_id); _push_dense_status.push_back(std::move(status)); } void AsyncExecutorThreadWorker::PullSparse(int table_id) { - auto& features = _features[table_id]; auto& feature_value = _feature_value[table_id]; auto fea_dim = _param_config->fea_dim; @@ -451,8 +448,7 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { features.clear(); features.resize(0); features.reserve(MAX_FEASIGN_NUM); - const std::vector& feed_vec = - thread_reader_->GetUseSlotAlias(); + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); @@ -468,20 +464,20 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { features.push_back(static_cast(ids[i])); } } - check_pull_push_memory(features, feature_value, fea_dim); - + check_pull_push_memory(features, &feature_value, fea_dim); + std::vector pull_feature_value; for (auto i = 0u; i < features.size(); ++i) { pull_feature_value.push_back(feature_value[i].data()); } - + auto status = _pslib_ptr->_worker_ptr->pull_sparse( pull_feature_value.data(), table_id, features.data(), features.size()); _pull_sparse_status.push_back(std::move(status)); - + auto& push_g = _feature_push_value[table_id]; - check_pull_push_memory(features, push_g, fea_dim); - + check_pull_push_memory(features, &push_g, fea_dim); + collect_feasign_info(table_id); } @@ -490,15 +486,14 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { auto fea_dim = _param_config->fea_dim; auto& features = _features[table_id]; auto& fea_value = _feature_value[table_id]; - + CHECK(features.size() > 0) << "feature size check failed"; - + auto fea_idx = 0u; - + std::vector init_value(fea_dim); - - const std::vector& feed_vec = - thread_reader_->GetUseSlotAlias(); + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); @@ -508,22 +503,22 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { Variable* var_emb = thread_scope_->FindVar( _param_config->slot_input_vec[table_id][slot_idx - 1]); LoDTensor* tensor_emb = var_emb->GetMutable(); - float* ptr = tensor_emb->mutable_data( - {len, slot_dim}, platform::CPUPlace()); + float* ptr = + tensor_emb->mutable_data({len, slot_dim}, platform::CPUPlace()); memset(ptr, 0, sizeof(float) * len * slot_dim); auto& tensor_lod = tensor->lod()[0]; - + LoD data_lod{tensor_lod}; tensor_emb->set_lod(data_lod); - + for (auto index = 0u; index < len; ++index) { if (ids[index] == 0u) { - memcpy(ptr + slot_dim * index, - init_value.data() + 2, sizeof(float) * slot_dim); + memcpy(ptr + slot_dim * index, init_value.data() + 2, + sizeof(float) * slot_dim); continue; } - memcpy(ptr + slot_dim * index, - fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); + memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, + sizeof(float) 
* slot_dim); fea_idx++; } } @@ -534,35 +529,38 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { auto fea_dim = _param_config->fea_dim; auto& features = _features[table_id]; auto& push_g = _feature_push_value[table_id]; - check_pull_push_memory(features, push_g, fea_dim); - CHECK(push_g.size() == features.size() + 1) << - "push_g size:" << push_g.size() << " features size:" << features.size(); + check_pull_push_memory(features, &push_g, fea_dim); + CHECK(push_g.size() == features.size() + 1) + << "push_g size:" << push_g.size() + << " features size:" << features.size(); uint64_t fea_idx = 0u; auto& fea_info = _fea_info[table_id]; int offset = 2; const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label + // slot_idx = 0 is label for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - if (_param_config->slot_alias_to_table.find( - feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) { - LOG(ERROR) << "ERROR slot_idx:" << slot_idx << - " name:" << feed_vec[slot_idx]; - } else if ( - _param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == + _param_config->slot_alias_to_table.end()) { + LOG(ERROR) << "ERROR slot_idx:" << slot_idx + << " name:" << feed_vec[slot_idx]; + } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != + table_id) { continue; } Variable* g_var = thread_scope_->FindVar( _param_config->gradient_var[table_id][slot_idx - 1]); - CHECK(g_var != nullptr) << "var[" << - _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; + CHECK(g_var != nullptr) + << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] + << "] not found"; LoDTensor* g_tensor = g_var->GetMutable(); if (g_tensor == NULL) { - LOG(ERROR) << "var[" << - _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; + LOG(ERROR) << "var[" + << _param_config->gradient_var[table_id][slot_idx - 1] + << "] not found"; exit(-1); } float* g = g_tensor->data(); - + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; LoDTensor* tensor = var->GetMutable(); @@ -571,42 +569,40 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { exit(-1); } int len = tensor->numel(); - CHECK(slot_dim * len == g_tensor->numel()) << - "len:" << len << " g_numel:" << g_tensor->numel(); - CHECK(len == tensor->numel()) << "len:" << - len << "t_numel:" << tensor->numel(); + CHECK(slot_dim * len == g_tensor->numel()) + << "len:" << len << " g_numel:" << g_tensor->numel(); + CHECK(len == tensor->numel()) << "len:" << len + << "t_numel:" << tensor->numel(); int64_t* ids = tensor->data(); for (auto id_idx = 0u; id_idx < len; ++id_idx) { if (ids[id_idx] == 0) { g += slot_dim; continue; } - memcpy(push_g[fea_idx].data() + offset, - g, sizeof(float) * slot_dim); + memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); push_g[fea_idx][0] = 1.0f; - CHECK(fea_idx < fea_info.size()) << "fea_idx:" << - fea_idx << " size:" << fea_info.size(); + CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx + << " size:" << fea_info.size(); push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); g += slot_dim; fea_idx++; } } - CHECK(fea_idx == features.size()) << "fea_idx:" << - fea_idx << " features size:" << features.size(); + CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx + << " features size:" << features.size(); CHECK_GT(features.size(), 0); - + 
std::vector push_g_vec; for (auto i = 0u; i < features.size(); ++i) { push_g_vec.push_back(push_g[i].data()); } auto status = _pslib_ptr->_worker_ptr->push_sparse( - table_id, features.data(), - (const float**)push_g_vec.data(), features.size()); + table_id, features.data(), (const float**)push_g_vec.data(), + features.size()); _push_sparse_status.push_back(std::move(status)); } -void AsyncExecutorThreadWorker::collect_feasign_info( - int table_id) { +void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) { auto& fea_info = _fea_info[table_id]; auto& feature = _features[table_id]; fea_info.resize(feature.size()); @@ -614,13 +610,13 @@ void AsyncExecutorThreadWorker::collect_feasign_info( Variable* var = thread_scope_->FindVar(feed_vec[0]); LoDTensor* tensor = var->GetMutable(); int64_t* label = tensor->data(); - + int global_index = 0; for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); LoDTensor* tensor = var->GetMutable(); int64_t* ids = tensor->data(); - + int fea_idx = 0; for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) { for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) { @@ -628,36 +624,33 @@ void AsyncExecutorThreadWorker::collect_feasign_info( continue; } FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]}; - + fea_info[global_index++] = std::move(info); } } } - CHECK(global_index == feature.size()) << - "expect fea info size:" << feature.size() - << " real:" << global_index; + CHECK(global_index == feature.size()) + << "expect fea info size:" << feature.size() << " real:" << global_index; } void AsyncExecutorThreadWorker::check_pull_push_memory( - const std::vector& features, - std::vector>& push_g, - int dim) { - push_g.resize(features.size() + 1); - for (auto& t : push_g) { + const std::vector& features, + std::vector>* push_g, int dim) { + push_g->resize(features.size() + 1); + for (auto& t : *push_g) { t.resize(dim); } } void AsyncExecutorThreadWorker::check_pull_push_memory( - const std::vector& features, - std::vector& push_g, + const std::vector& features, std::vector* push_g, int dim) { - if (features.size() > push_g.size()) { - push_g.reserve(features.size() + 1); - auto size = features.size() - push_g.size() + 1; + if (features.size() > push_g->size()) { + push_g->reserve(features.size() + 1); + auto size = features.size() - push_g->size() + 1; for (auto i = 0u; i < size; ++i) { float* ptr = new float[dim]; - push_g.push_back(ptr); + push_g->push_back(ptr); } } } diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 20410b4c069..30b81ad8803 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #ifdef PADDLE_WITH_PSLIB -#include "pslib.h" +#include #endif namespace paddle { @@ -34,75 +34,74 @@ namespace framework { void CreateTensor(Variable* var, proto::VarType::Type var_type); #ifdef PADDLE_WITH_PSLIB -const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; +static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; struct AsyncWorkerParamConfig { int slot_dim; int fea_dim; int32_t tmp_push_dense_wait_times; int32_t tmp_push_sparse_wait_times; - + std::vector skip_op; - + std::map> dense_variable_name; std::map> dense_gradient_variable_name; - std::vector dense_table_id; + std::vector dense_table_id; // fea_dim for each dense table - std::vector dense_table_size; - std::vector sparse_table_id; + std::vector dense_table_size; + std::vector sparse_table_id; std::map> slot_input_vec; std::map> gradient_var; std::map slot_alias_to_table; }; struct DensePullThreadParam { - std::shared_ptr ps_client; - int threshold; - int training_thread_num; - Scope* root_scope; - std::map>* dense_params; - int sleep_time_ms = 2; + std::shared_ptr ps_client; + int threshold; + int training_thread_num; + Scope* root_scope; + std::map>* dense_params; + int sleep_time_ms = 2; }; class DensePullThread { public: - explicit DensePullThread(const DensePullThreadParam& param) : - _running(false) { + explicit DensePullThread(const DensePullThreadParam& param) + : _running(false) { _ps_client = param.ps_client; _threshold = param.threshold; _thread_num = param.training_thread_num; _root_scope = param.root_scope; _sleep_time_ms = param.sleep_time_ms; - + for (auto& t : *param.dense_params) { - _dense_variable_name[t.first].insert( - _dense_variable_name[t.first].end(), - t.second.begin(), t.second.end()); + _dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(), + t.second.begin(), t.second.end()); _training_versions[t.first].resize(_thread_num, 0); _last_versions[t.first] = 0; _current_version[t.first] = 0; } } - + int start(); - + void stop() { if (_running) { _running = false; _t.join(); } } - + void increase_thread_version(int thread_id, uint64_t table_id); void reset_thread_version(uint64_t table_id); std::future pull_dense(uint64_t table_id); void pull_dense2(uint64_t table_id); void wait_all(); - + private: void run(); bool check_update_param(uint64_t table_id); - + private: std::shared_ptr _ps_client; int _thread_num; @@ -113,33 +112,33 @@ class DensePullThread { std::map _last_versions; std::map _current_version; - std::mutex _mutex_for_version; + std::mutex _mutex_for_version; std::map> _training_versions; std::map> _dense_variable_name; - + std::thread _t; - + std::vector<::std::future> _pull_dense_status; - + std::map> _regions; - uint32_t _pull_dense_fail_times = 0; - - std::vector _base_norm_param; - std::vector _mean; - std::vector _scale; + uint32_t _pull_dense_fail_times = 0; + + std::vector _base_norm_param; + std::vector _mean; + std::vector _scale; float _squared_sum_epsilon = 1e-4; std::mutex _mutex_for_mean_scale; - + float _total_batch_num = 0; }; #endif class ExecutorThreadWorker { public: -ExecutorThreadWorker() - : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} + ExecutorThreadWorker() + : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} virtual ~ExecutorThreadWorker() {} - + void CreateThreadResource(const framework::ProgramDesc& program, const paddle::platform::Place& place); void SetThreadId(int tid); @@ -161,10 +160,8 
@@ ExecutorThreadWorker() #ifdef PADDLE_WITH_PSLIB virtual void SetPSlibPtr( std::shared_ptr pslib_ptr) {} - virtual void SetPullDenseThread( - std::shared_ptr dpt) {} - virtual void SetParamConfig( - AsyncWorkerParamConfig * param_config) {} + virtual void SetPullDenseThread(std::shared_ptr dpt) {} + virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {} #endif private: @@ -195,7 +192,7 @@ ExecutorThreadWorker() }; #ifdef PADDLE_WITH_PSLIB -class AsyncExecutorThreadWorker: public ExecutorThreadWorker { +class AsyncExecutorThreadWorker : public ExecutorThreadWorker { public: AsyncExecutorThreadWorker() {} virtual ~AsyncExecutorThreadWorker() {} @@ -210,40 +207,35 @@ class AsyncExecutorThreadWorker: public ExecutorThreadWorker { void FillSparse(int table_id); void PushSparse(int table_id); void PushDense(int table_id); - - void check_pull_push_memory( - const std::vector& features, - std::vector& push_g, - int dim); + void check_pull_push_memory(const std::vector& features, - std::vector>& push_g, - int dim); + std::vector* push_g, int dim); + void check_pull_push_memory(const std::vector& features, + std::vector>* push_g, int dim); void collect_feasign_info(int table_id); - + private: struct FeasignInfo { uint32_t slot; uint32_t ins; int64_t label; }; - - std::map> _features; - std::map> _fea_info; + + std::map> _features; + std::map> _fea_info; std::map>> _feature_value; std::map>> _feature_push_value; - - - std::shared_ptr _pslib_ptr; - - std::shared_ptr _pull_dense_thread; - - std::vector<::std::future> _pull_sparse_status; - std::vector<::std::future> _pull_dense_status; - std::vector<::std::future> _push_sparse_status; - std::vector<::std::future> _push_dense_status; - - AsyncWorkerParamConfig* _param_config; - + + std::shared_ptr _pslib_ptr; + + std::shared_ptr _pull_dense_thread; + + std::vector<::std::future> _pull_sparse_status; + std::vector<::std::future> _pull_dense_status; + std::vector<::std::future> _push_sparse_status; + std::vector<::std::future> _push_dense_status; + + AsyncWorkerParamConfig* _param_config; }; #endif -- GitLab From 23dec787723f6906077e77b2d15820a78bde1344 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 14 Dec 2018 13:54:50 +0800 Subject: [PATCH 0225/2367] fix script issue test=develop --- cmake/external/snappy.cmake | 16 ++- cmake/external/xxhash.cmake | 7 +- cmake/external/zlib.cmake | 16 ++- cmake/inference_lib.cmake | 92 ++++++++------- paddle/fluid/framework/CMakeLists.txt | 36 +++--- paddle/fluid/inference/CMakeLists.txt | 9 -- .../inference/api/demo_ci/CMakeLists.txt | 106 +++++++++--------- .../detection/density_prior_box_op.cu | 3 +- .../detection/roi_perspective_transform_op.cu | 4 +- paddle/fluid/platform/CMakeLists.txt | 7 ++ paddle/fluid/pybind/CMakeLists.txt | 4 - python/setup.py.in | 3 +- 12 files changed, 157 insertions(+), 146 deletions(-) diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index b30403d2d81..f9d4cd97400 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -24,12 +24,6 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." 
FORCE) -if (WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib") -else(WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") -endif (WIN32) - ExternalProject_Add( extern_snappy GIT_REPOSITORY "https://github.com/google/snappy" @@ -56,6 +50,16 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") + add_custom_command(TARGET extern_snappy POST_BUILD + COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib + ) + ENDIF() + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") +else(WIN32) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +endif (WIN32) add_library(snappy STATIC IMPORTED GLOBAL) set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 4c2d64f6274..c3e1212d8f8 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -56,7 +56,12 @@ else() endif() if (WIN32) - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") + IF(NOT EXISTS "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") + add_custom_command(TARGET extern_xxhash POST_BUILD + COMMAND cmake -E copy ${XXHASH_INSTALL_DIR}/lib/xxhash.lib ${XXHASH_INSTALL_DIR}/lib/libxxhash.lib + ) + ENDIF() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") else() set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") endif () diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c3d73235453..d3507375372 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -19,12 +19,6 @@ SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE) SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE) -IF(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) -ELSE(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) -ENDIF(WIN32) - INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers. INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h. @@ -49,6 +43,16 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${ZLIB_INSTALL_DIR}/lib/libz.lib") + add_custom_command(TARGET extern_zlib POST_BUILD + COMMAND cmake -E copy ${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib ${ZLIB_INSTALL_DIR}/lib/libz.lib + ) + ENDIF() + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.lib" CACHE FILEPATH "zlib library." FORCE) +ELSE(WIN32) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) +ENDIF(WIN32) ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c679d8507d8..5aa7a8a752f 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -32,24 +32,35 @@ function(copy TARGET) list(GET copy_lib_SRCS ${index} src) list(GET copy_lib_DSTS ${index} dst) if (WIN32) - # windows cmd shell will not expand wildcard automatically. - # below expand the files,libs and copy them by rules. 
- file(GLOB header_files ${src} "*.h") - file(GLOB static_lib_files ${src} "*.lib") - file(GLOB dll_lib_files ${src} "*.dll") - set(src_files ${header_files} ${static_lib_files} ${dll_lib_files}) - - if (NOT "${src_files}" STREQUAL "") - list(REMOVE_DUPLICATES src_files) - endif () - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" - ) - foreach (src_file ${src_files}) + if(IS_DIRECTORY ${src}) + get_filename_component(last_path ${src} NAME) + string(APPEND dst "/" ${last_path}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + if(EXISTS ${src}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND cmake -E copy_directory "${src}" "${dst}" + COMMENT "copying ${src} -> ${dst}") + else() + message(WARNING "${src} not exist!") + endif() + else() + # windows cmd shell will not expand wildcard automatically. + # below expand the files, and copy them by rules. + file(GLOB src_files ${src}) + if (NOT "${src_files}" STREQUAL "") + list(REMOVE_DUPLICATES src_files) + endif () add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" - COMMENT "copying ${src_file} -> ${dst}") - endforeach () + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + foreach (src_file ${src_files}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" + COMMENT "copying ${src_file} -> ${dst}") + endforeach () + endif() else (WIN32) # not windows add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}" @@ -95,7 +106,7 @@ copy(xxhash_lib DEPS xxhash ) -if (NOT PROTOBUF_FOUND) +if (NOT PROTOBUF_FOUND OR WIN32) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") copy(protobuf_lib SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} @@ -138,27 +149,25 @@ if (WITH_NGRAPH) ) endif () -if (NOT WIN32) - if (NOT MOBILE_INFERENCE AND NOT RPI) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") - copy(snappy_lib - SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappy) +if (NOT MOBILE_INFERENCE AND NOT RPI) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") + copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappy) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") - copy(snappystream_lib - SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappystream) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") + copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappystream) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") - copy(zlib_lib - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS zlib) - endif () -endif (NOT WIN32) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") + copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS zlib) +endif () # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") @@ -192,8 +201,13 @@ if (WITH_ANAKIN AND WITH_MKL) endif () set(module "inference") +if(WIN32) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*) +else(WIN32) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*) +endif(WIN32) 
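The Windows branch just above exists because multi-config generators such as Visual Studio place build outputs in a per-configuration sub-directory, hence the extra ${CMAKE_BUILD_TYPE} path segment. A hedged alternative sketch (an assumption, not what this patch does) is to let a generator expression resolve the path:

    # $<TARGET_FILE:...> expands to the full per-config library path at
    # generation time, so no manual ${CMAKE_BUILD_TYPE} splicing is needed;
    # it is only usable in contexts that accept generator expressions.
    set(paddle_fluid_lib "$<TARGET_FILE:paddle_fluid>")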
copy(inference_lib DEPS ${inference_deps} - SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* + SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib} ${src_dir}/${module}/api/paddle_*.h DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) @@ -233,7 +247,7 @@ copy(third_party DEPS fluid_lib_dist # only need libpaddle_fluid.so/a and paddle_*.h for inference-only library copy(inference_api_lib DEPS fluid_lib_dist - SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.* + SRCS ${paddle_fluid_lib} ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d7a69c8c9e..eb4f1a816fc 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -3,30 +3,22 @@ # We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs SRCS PATH) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) - get_filename_component(src ${src} NAME_WE) - if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) - message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") - endif() - - # only copy the xx.cu to .xx.cu when the content are modified - set(copy_flag 1) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) - if (SOURCE_STR STREQUAL TARGET_STR) - set(copy_flag 0) - endif() - endif() - if (copy_flag) - add_custom_command(OUTPUT .${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") - endif(copy_flag) - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + endif() + + file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + + add_custom_command(OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 058a5b5f460..b80e7ef752c 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -26,9 +26,6 @@ endif(WIN32) # paddle_fluid_origin exclude inference api interface if(WIN32) sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid_origin ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) 
endif(WIN32) @@ -44,9 +41,6 @@ set(SHARED_INFERENCE_SRCS if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) @@ -63,9 +57,6 @@ if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) target_link_libraries(paddle_fluid_shared shlwapi) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid_origin ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index ec93729cd2b..8d0d96d391e 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -15,12 +15,43 @@ macro(safe_set_static_flag) endforeach(flag_var) endmacro() +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + +include_directories("${PADDLE_LIB}/") +include_directories("${PADDLE_LIB}/fluid_inference_install_dir/") +include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") +include_directories("${PADDLE_LIB}/third_party/install/glog/include") +include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") +include_directories("${PADDLE_LIB}/third_party/install/snappy/include") +include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") +include_directories("${PADDLE_LIB}/third_party/install/zlib/include") +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") +link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") +link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") +link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") +link_directories("${PADDLE_LIB}/third_party/install/glog/lib") +link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") if (WITH_STATIC_LIB) safe_set_static_flag() add_definitions(-DSTATIC_LIB) - set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "/w") - set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} "/w") endif() set(CMAKE_STATIC_LIBRARY_PREFIX "lib") else() @@ -29,36 +60,15 @@ else() endif() message("flags" ${CMAKE_CXX_FLAGS}) -if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR 
"please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") -endif() -if(NOT DEFINED DEMO_NAME) - message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") -endif() - - if(WITH_GPU) if(NOT WIN32) set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") else() if(CUDA_LIB STREQUAL "") - set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") endif() endif(NOT WIN32) endif() -include_directories("${PADDLE_LIB}") -include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") -include_directories("${PADDLE_LIB}/third_party/install/glog/include") -include_directories("${PADDLE_LIB}/third_party/install/gflags/include") -include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") -if (NOT WIN32) -include_directories("${PADDLE_LIB}/third_party/install/snappy/include") -include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") -include_directories("${PADDLE_LIB}/third_party/install/zlib/include") -endif(NOT WIN32) - -include_directories("${PADDLE_LIB}/third_party/boost") -include_directories("${PADDLE_LIB}/third_party/eigen3") if (NOT WIN32) if (USE_TENSORRT AND WITH_GPU) @@ -67,18 +77,6 @@ if (NOT WIN32) endif() endif(NOT WIN32) -if (NOT WIN32) -link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") -link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") -link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") -endif(NOT WIN32) - -link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") -link_directories("${PADDLE_LIB}/third_party/install/glog/lib") -link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") -link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") -link_directories("${PADDLE_LIB}/paddle/lib") - if (NOT WIN32) set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph") if(EXISTS ${NGRAPH_PATH}) @@ -89,8 +87,6 @@ if (NOT WIN32) endif() endif() -add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) - if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} @@ -106,26 +102,25 @@ endif() # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a if(WITH_STATIC_LIB) - set(DEPS - ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(DEPS - ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() if (NOT WIN32) -set(EXTERNAL_LIB "-lrt -ldl -lpthread") -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} - glog gflags protobuf snappystream snappy z xxhash - ${EXTERNAL_LIB}) + set(EXTERNAL_LIB "-lrt -ldl -lpthread") + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} + glog gflags protobuf snappystream snappy z xxhash + ${EXTERNAL_LIB}) else() -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf - ${EXTERNAL_LIB}) -# NOTE(dzhwinter) shlwapi is deprecated. 
-set(DEPS ${DEPS} libcmt shlwapi) + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf + ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash + snappystream ${EXTERNAL_LIB}) + # NOTE(dzhwinter) shlwapi is deprecated. + set(DEPS ${DEPS} libcmt shlwapi) endif(NOT WIN32) if(WITH_GPU) @@ -137,9 +132,10 @@ if(WITH_GPU) set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) else() set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) endif() endif() +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 3b7c781795f..6a92762896b 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -146,7 +146,8 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { // At least use 32 threads, at most 512 threads. // blockx is multiple of 32. - int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L); + int blockx = std::min( + static_cast(((feature_width * num_priors + 31) >> 5) << 5), 512L); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 2d262f932ae..862d664d42e 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -35,12 +35,12 @@ namespace operators { template __device__ bool GT_E(T a, T b) { - return (a > b) || fabs(a - b) < 1e-4; + return (a > b) || Eigen::numext::abs(a - b) < 1e-4; } template __device__ bool LT_E(T a, T b) { - return (a < b) || fabs(a - b) < 1e-4; + return (a < b) || Eigen::numext::abs(a - b) < 1e-4; } template diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 23c7ebe8422..2f205e1d5ca 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -67,6 +67,13 @@ ENDIF() # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) +if(WIN32) + if(WITH_GPU AND NOT WITH_DSO) + get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) + target_link_libraries(device_context ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) +endif(WIN32) + nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) cc_test(init_test SRCS init_test.cc DEPS device_context) diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b8954cb1262..c79d5d9403d 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -19,10 +19,6 @@ if(WITH_PYTHON) endif(WITH_AMD_GPU) if(WIN32) - if(WITH_GPU AND NOT WITH_DSO) - 
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) - target_link_libraries(paddle_pybind ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) target_link_libraries(paddle_pybind shlwapi) endif(WIN32) diff --git a/python/setup.py.in b/python/setup.py.in index 0eb69cdb5c7..65620466412 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -241,5 +241,6 @@ setup(name='${PACKAGE_NAME}', ext_modules=ext_modules, package_data=package_data, package_dir=package_dir, - scripts=paddle_bins + scripts=paddle_bins, + distclass=BinaryDistribution ) -- GitLab From 58110921bd84af43ed08a955c515aadd1558bac3 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Fri, 14 Dec 2018 14:16:43 +0800 Subject: [PATCH 0226/2367] fix CMakeList bug --- paddle/fluid/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ab237f768a9..3575080c992 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -138,7 +138,7 @@ nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) #Generate an empty \ - __init__.py to make framework_py_proto as a valid python module. + #__init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) if (NOT WIN32) -- GitLab From c550e0ce0680b03a7625134eaa991083deaafc41 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 14:27:57 +0800 Subject: [PATCH 0227/2367] Add python interface for huber regression loss test=develop --- paddle/fluid/operators/huber_loss_op.cc | 7 +-- python/paddle/fluid/layers/nn.py | 69 +++++++++++++++++++++---- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 4ecd8634ff4..253b65a5f33 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -124,8 +124,9 @@ REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); REGISTER_OP_CPU_KERNEL( - huber_loss, - ops::HuberLossKernel); + huber_loss, ops::HuberLossKernel, + ops::HuberLossKernel); REGISTER_OP_CPU_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel, + ops::HuberLossGradKernel); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4df74edfceb..fb1ae7b753d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -169,6 +169,7 @@ __all__ = [ 'log_loss', 'add_position_encoding', 'bilinear_tensor_product', + 'huber_regression_loss', ] @@ -4595,7 +4596,7 @@ def hsigmoid(input, """ The hierarchical sigmoid operator is used to accelerate the training process of language model. This operator organizes the classes into a - complete binary tree, or you can use is_custom to pass your own tree to + complete binary tree, or you can use is_custom to pass your own tree to implement hierarchical. Each leaf node represents a class(a word) and each internal node acts as a binary classifier. For each word there's a unique path from root to it's leaf node, hsigmoid calculate the cost for each @@ -4611,7 +4612,7 @@ def hsigmoid(input, 2. 
   build a dict to store word_id -> word's leaf to root path, we call it path_table.
   3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code.
   Code means label of each binary classification, using 1 indicate true, 0 indicate false.
-  4. now, each word should has its path and code along the path, you can pass a batch of path and code
+  related to the same batch of inputs.
@@ -4621,8 +4622,8 @@
         and :math:`D` is the feature size.
         label (Variable): The tensor variable contains labels of training data.
             It's a tensor with shape is :math:`[N \\times 1]`.
-        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
-            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
+        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
+            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
            which indicates the num of classes using by binary classify.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
             of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
@@ -4635,15 +4636,15 @@
            is not set, the bias is initialized zero. Default: None.
        name (str|None): A name for this layer(optional). If set None, the layer
             will be named automatically. Default: None.
-        path_table: (Variable|None) this variable can store each batch of samples' path to root,
+        path_table: (Variable|None) this variable can store each batch of samples' path to root,
            it should be in leaf -> root order
-        path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
-            structure and each element in this array is indexes in parent nodes' Weight Matrix.
-        path_code: (Variable|None) this variable can store each batch of samples' code,
+            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
+            structure and each element in this array is indexes in parent nodes' Weight Matrix.
+        path_code: (Variable|None) this variable can store each batch of samples' code,
            each code consist with every code of parent nodes. it should be in leaf -> root order
-        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
+        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if custom is
            set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
-        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
+        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
            of W and input will be sparse.

    Returns:
@@ -8770,3 +8771,51 @@

     # add activation
     return helper.append_activation(out)
+
+
+def huber_regression_loss(input, label, delta):
+    """
+    Huber regression loss is a loss function used in robust regression.
+    Huber regression loss can evaluate the fitness of input to label.
+    Different from MSE loss, Huber regression loss is more robust to outliers.
+
+    When the difference between input and label is larger than delta
+    .. math::
+
+        huber\_regression\_loss = delta * |label - input| - 0.5 * delta * delta
+
+    When the difference between input and label is less than delta
+    .. math::
+
+        huber\_regression\_loss = 0.5 * (label - input) * (label - input)
+
+
+    Args:
+        input (Variable): This input is a probability computed by the previous operator.
+            The first dimension is batch size, and the last dimension is 1.
+        label (Variable): The ground truth whose first dimension is batch size
+            and last dimension is 1.
+        delta (float): The parameter of huber regression loss, which controls
+            the range of outliers.
+
+    Returns:
+        huber\_regression\_loss (Variable): The huber regression loss with shape [batch_size, 1].
+
+    Examples:
+        .. code-block:: python
+
+            predictions = fluid.layers.softmax(x)
+            loss = fluid.layers.huber_regression_loss(
+                input=predictions, label=label, delta=1.0)
+    """
+    helper = LayerHelper('huber_regression_loss', **locals())
+    residual = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
+    helper.append_op(
+        type='huber_loss',
+        inputs={'X': input,
+                'Y': label},
+        outputs={'Out': out,
+                 'Residual': residual},
+        attrs={'delta': delta})
+    return out
--
GitLab

From 5c7ad1ecb1971fad268dce0040d6e9d597dcef31 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Fri, 14 Dec 2018 14:33:49 +0800
Subject: [PATCH 0228/2367] Resolve conflicts

test=develop
---
 python/paddle/fluid/layers/nn.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bc8e3e8a3c5..28cb700f82c 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -9179,6 +9179,7 @@ def psroi_pool(input,
         })
     return out

+
 def huber_regression_loss(input, label, delta):
     """
--
GitLab

From 09d669ba40aa900920dea84eb07aa868c44831b0 Mon Sep 17 00:00:00 2001
From: heqiaozhi
Date: Fri, 14 Dec 2018 14:16:43 +0800
Subject: [PATCH 0229/2367] fix static_cast to const_cast

---
 paddle/fluid/framework/CMakeLists.txt    | 2 +-
 paddle/fluid/framework/async_executor.cc | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index ab237f768a9..3575080c992 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -138,7 +138,7 @@ nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
 #Generate an empty \
-    __init__.py to make framework_py_proto as a valid python module.
+    #__init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) if (NOT WIN32) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index e2756cafa20..ee3c5e01f87 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -81,9 +81,8 @@ void AsyncExecutor::InitWorker(const std::string& dist_desc, int node_num, int index) { _pslib_ptr = std::shared_ptr( new paddle::distributed::PSlib()); - _pslib_ptr->init_worker(dist_desc, - static_cast(host_sign_list.data()), - node_num, index); + _pslib_ptr->init_worker( + dist_desc, const_cast(host_sign_list.data()), node_num, index); InitParamConfig(); } @@ -94,7 +93,7 @@ void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); } void AsyncExecutor::GatherServers(const std::vector& host_sign_list, int node_num) { - _pslib_ptr->gather_servers(static_cast(host_sign_list.data()), + _pslib_ptr->gather_servers(const_cast(host_sign_list.data()), node_num); } -- GitLab From 04a570b4634f3cab2815cd1688df192f0f5b1d81 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Dec 2018 14:59:35 +0800 Subject: [PATCH 0230/2367] Fix ut test=develop --- paddle/fluid/framework/data_type_test.cc | 2 +- paddle/fluid/framework/op_kernel_type_test.cc | 3 ++- paddle/fluid/inference/api/api_impl_tester.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 92639dfc611..2a380201f29 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -35,6 +35,6 @@ TEST(DataType, float16) { EXPECT_EQ(f::SizeOfType(dtype), 2u); // test debug info - std::string type = "float16"; + std::string type = "::paddle::platform::float16"; EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); } diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index 3e17a512ce1..40db85400d2 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -34,7 +34,8 @@ TEST(OpKernelType, ToString) { OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), - "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_" + "data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" + "CUDAPlace(0)]:library_" "type[CUDNN]"); } diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 191225493c3..78396397397 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -39,7 +39,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { if (t->type() == framework::proto::VarType::INT64) { pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; - } else if (t->type() == framework::proto::VarType::INT32) { + } else if (t->type() == framework::proto::VarType::FP32) { pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else { -- GitLab From ab98101c2e3a71dd88aa9f8c9439f65293de2543 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 15:09:39 +0800 Subject: [PATCH 0231/2367] Update API.spec test=develop --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff 
--git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 8e6482ca981..c152a506d9e 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -199,6 +199,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
 paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.huber_regression_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
--
GitLab

From f702ab74b9edfe6310470ad1ad98ae054f3120fc Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Fri, 14 Dec 2018 07:36:45 +0000
Subject: [PATCH 0232/2367] add dist transpiler test

---
 .../tests/unittests/test_dist_transpiler.py | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 650a745cdc4..27575897b54 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -875,5 +875,53 @@ class TestRemoteNce(TestDistLookupTableBase):
         pass


+# test for remote prefetch
+class TestRemoteHsigmoid(TestDistLookupTableBase):
+    def network_with_table(self, is_sparse, is_distributed):
+
+        num_total_classes = 10
+
+        input = fluid.layers.data(name="input", shape=[10], dtype="float32")
+        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+        path_table = fluid.layers.data(
+            name='path_table', shape=[10], dtype='int64')
+        path_code = fluid.layers.data(
+            name='path_code', shape=[10], dtype='int64')
+        w_param = fluid.default_main_program().global_block().create_parameter(
+            shape=[num_total_classes, 10],
+            dtype='float32',
+            name='hs_w',
+            initializer=fluid.initializer.ConstantInitializer())
+        b_param = fluid.default_main_program().global_block().create_parameter(
+            shape=[num_total_classes, 1],
+            dtype='float32',
+            name='hs_b',
+            initializer=fluid.initializer.ConstantInitializer())
+
+        cost = fluid.layers.hsigmoid(
+            input=input,
+            label=label,
+            num_classes=num_total_classes,
+            path_table=path_table,
+            path_code=path_code,
+            is_custom=True,
+            is_sparse=is_sparse)
+        avg_cost = fluid.layers.mean(cost)
+        # optimizer
+        optimizer = fluid.optimizer.SGD(learning_rate=0.003)
+        optimizer.minimize(avg_cost)
+
+    def net_conf(self):
+        import os
+        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
+        self.network_with_table(is_sparse=True, is_distributed=False)
+
+    def transpiler_test_impl(self):
+        trainer, _ = self.get_trainer()
+        for op in
trainer.blocks[0].ops: + if op.type == "recv": + pass + + if __name__ == "__main__": unittest.main() -- GitLab From 8e785fec8d3d877db141f49cf95f557837b875d0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Dec 2018 16:01:51 +0000 Subject: [PATCH 0233/2367] clean code and refine tests template --- paddle/fluid/operators/jit/test.cc | 620 +++++++++++++---------------- 1 file changed, 276 insertions(+), 344 deletions(-) diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index d994a11f97d..62d4cdc19ae 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -57,31 +57,188 @@ std::vector TestSizes() { return s; } -template -void TestXYZNFunc(const typename KernelTuples::func_type tgt, +namespace jit = paddle::operators::jit; + +template +struct TestFuncWithRefer { + void operator()(const typename KernelTuples::func_type tgt, Args... args) {} +}; + +template +struct TestFuncWithRefer, std::vector, std::vector, + std::vector> { + void operator()(const typename jit::XYZNTuples::func_type tgt, const std::vector& x, const std::vector& y, const std::vector& zref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(zref.size(), x.size()); - EXPECT_EQ(zref.size(), y.size()); - const T* x_data = x.data(); - const T* y_data = y.data(); - const T* zref_data = zref.data(); - const int d = zref.size(); - - std::vector ztgt(d); - T* ztgt_data = ztgt.data(); - // test normal - tgt(x_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ztgt.begin()); - tgt(ztgt_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace y - std::copy(y.begin(), y.end(), ztgt.begin()); - tgt(x_data, ztgt_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(zref.size(), x.size()); + EXPECT_EQ(zref.size(), y.size()); + const T* x_data = x.data(); + const T* y_data = y.data(); + const T* zref_data = zref.data(); + const int d = zref.size(); + + std::vector ztgt(d); + T* ztgt_data = ztgt.data(); + // test normal + tgt(x_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ztgt.begin()); + tgt(ztgt_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace y + std::copy(y.begin(), y.end(), ztgt.begin()); + tgt(x_data, ztgt_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + } +}; + +template +struct TestFuncWithRefer, T, std::vector, + std::vector> { + void operator()(const typename jit::AXYNTuples::func_type tgt, const T a, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(&a, x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(&a, ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + } +}; + +template +struct TestFuncWithRefer, std::vector, std::vector> { + void operator()(const typename jit::XYNTuples::func_type tgt, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + 
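    // Each TestFuncWithRefer specialization in this file follows the same
    // two-step check: run the candidate kernel into a fresh output buffer
    // ("normal"), then run it again with the input aliased to the output
    // ("inplace"), comparing both results against the refer kernel.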
tgt(x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + } +}; + +template +struct TestFuncWithRefer, std::vector, std::vector, + std::vector, std::vector, std::vector> { + void operator()(const typename jit::LSTMTuples::func_type tgt, + const std::vector& xsrc, const std::vector& wp, + const std::vector& ct_1, const std::vector& ct_ref, + const std::vector& ht_ref, + const typename jit::LSTMTuples::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ct_ref.size(), ht_ref.size()); + EXPECT_EQ(ct_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); + EXPECT_EQ(wp.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size()); + std::vector checked(2 * d); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + const T* ct_ref_data = ct_ref.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + T* checked_data = checked.data(); + + paddle::operators::jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (attr.use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + tgt(&step, &attr); + ExpectEQ(ct_data, ct_ref_data, d); + ExpectEQ(ht_data, ht_ref_data, d); + } +}; + +template +struct TestFuncWithRefer, std::vector, std::vector, + std::vector> { + void operator()(const typename jit::GRUTuples::func_type tgt, + const std::vector& xsrc, const std::vector& ht_1, + const std::vector& ht_ref, + const typename jit::GRUTuples::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ht_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ht(ht_ref.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ht_1_data = ht_1.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ht_data = ht.data(); + paddle::operators::jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_data; + tgt(&step, &attr); + ExpectEQ(ht_data, ht_ref_data, d); + } +}; + +template +void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... 
args) { + TestFuncWithRefer test; + // test jitcode + auto jitcode = jit::GetJitCode(attr); + if (jitcode) { + VLOG(10) << "Test Jitcode Kernel "; + test(jitcode, args...); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast*>(impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel "; + test(more, args...); + } + } + } + // test result from Get function + VLOG(10) << "Test Get function "; + auto tgt = jit::Get(attr); + test(tgt, args...); } template @@ -113,79 +270,11 @@ void TestXYZNKernel() { ExpectEQ(xinp_data, zref_data, d); ExpectEQ(yinp_data, zref_data, d); - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel, size: " << d; - TestXYZNFunc>(jitcode, x, y, zref); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel, size: " << d; - TestXYZNFunc>(more, x, y, zref); - } - } - } - // Test result from Get function - VLOG(10) << "Test Get function, size: " << d; - auto tgt = jit::Get, PlaceType>(d); - TestXYZNFunc>(tgt, x, y, zref); + TestAllImpls, PlaceType, std::vector, + std::vector, std::vector>(d, x, y, zref); } } -TEST(JITKernel, vmul) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); -} - -TEST(JITKernel, vadd) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); -} - -TEST(JITKernel, vaddrelu) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); -} - -TEST(JITKernel, vsub) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); -} - -template -void TestAXYNFunc(const typename KernelTuples::func_type tgt, const T a, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(&a, x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(&a, ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); -} - template void TestAXYNKernel() { namespace jit = paddle::operators::jit; @@ -208,67 +297,11 @@ void TestAXYNKernel() { ref(&a, xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel, size: " << d; - TestAXYNFunc>(jitcode, a, x, yref); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel, size: " << d; - TestAXYNFunc>(more, a, x, yref); - } - } - } - // Test result from Get 
function - VLOG(10) << "Test Get function, size: " << d; - auto tgt = jit::Get, PlaceType>(d); - TestAXYNFunc>(tgt, a, x, yref); + TestAllImpls, PlaceType, T, std::vector, + std::vector>(d, a, x, yref); } } -TEST(JITKernel, vscal) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); -} - -TEST(JITKernel, vaddbias) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); -} - -template -void TestXYNFunc(const typename KernelTuples::func_type tgt, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); -} - template void TestXYNKernel() { namespace jit = paddle::operators::jit; @@ -290,108 +323,11 @@ void TestXYNKernel() { ref(xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel, size: " << d; - TestXYNFunc>(jitcode, x, yref); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = - dynamic_cast>*>(impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel, size: " << d; - TestXYNFunc>(more, x, yref); - } - } - } - // Test result from Get function - VLOG(10) << "Test Get function, size: " << d; - auto tgt = jit::Get, PlaceType>(d); - TestXYNFunc>(tgt, x, yref); + TestAllImpls, PlaceType, std::vector, + std::vector>(d, x, yref); } } -TEST(JITKernel, vrelu) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); -} - -TEST(JITKernel, videntity) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); -} - -TEST(JITKernel, vexp) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); -} - -TEST(JITKernel, vsigmoid) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); -} - -TEST(JITKernel, vtanh) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); -} - -template -void TestLSTMFunc(const typename KernelTuples::func_type tgt, - const std::vector& xsrc, const std::vector& wp, - const std::vector& ct_1, const std::vector& ct_ref, - const std::vector& ht_ref, - const paddle::operators::jit::lstm_attr_t& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ct_ref.size(), ht_ref.size()); - EXPECT_EQ(ct_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); - EXPECT_EQ(wp.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size()); - std::vector checked(2 * d); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - const T* ct_ref_data = ct_ref.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); - T* checked_data = checked.data(); - - 
paddle::operators::jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (attr.use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - - tgt(&step, &attr); - ExpectEQ(ct_data, ct_ref_data, d); - ExpectEQ(ht_data, ht_ref_data, d); -} - template void TestLSTMKernel() { namespace jit = paddle::operators::jit; @@ -435,37 +371,10 @@ void TestLSTMKernel() { } ref(&step, &attr); - // test jitcode - auto jitcode = - jit::GetJitCode, PlaceType>(attr); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel " << info; - TestLSTMFunc>(jitcode, xsrc, wp, ct_1, - ct_ref, ht_ref, attr); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = - dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel " << info; - TestLSTMFunc>(more, xsrc, wp, ct_1, - ct_ref, ht_ref, attr); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(attr); - TestLSTMFunc>(tgt, xsrc, wp, ct_1, ct_ref, - ht_ref, attr); + TestAllImpls, PlaceType, std::vector, + std::vector, std::vector, std::vector, + std::vector>(attr, xsrc, wp, ct_1, ct_ref, ht_ref, + attr); } } } @@ -473,43 +382,6 @@ void TestLSTMKernel() { } } -TEST(JITKernel, lstmctht) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); -} - -TEST(JITKernel, lstmc1h1) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); -} - -template -void TestGRUFunc(const typename KernelTuples::func_type tgt, - const std::vector& xsrc, const std::vector& ht_1, - const std::vector& ht_ref, - const paddle::operators::jit::gru_attr_t& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ht_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ht(ht_ref.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ht_1_data = ht_1.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); - paddle::operators::jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_data; - tgt(&step, &attr); - ExpectEQ(ht_data, ht_ref_data, d); -} - template void TestGRUKernel() { namespace jit = paddle::operators::jit; @@ -538,37 +410,97 @@ void TestGRUKernel() { step.ht = ht_ref_data; ref(&step, &attr); - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(attr); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel " << info; - TestGRUFunc>(jitcode, xsrc, ht_1, ht_ref, attr); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel " << info; - TestGRUFunc>(more, xsrc, ht_1, ht_ref, attr); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(attr); - TestGRUFunc>(tgt, xsrc, ht_1, ht_ref, attr); + TestAllImpls, PlaceType, std::vector, + std::vector, std::vector>(attr, xsrc, ht_1, ht_ref, + attr); } } } } +// XYZNTuple +TEST(JITKernel, 
vmul) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, vadd) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, vaddrelu) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, vsub) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +// AXYNTuples +TEST(JITKernel, vscal) { + namespace jit = paddle::operators::jit; + TestAXYNKernel(); + TestAXYNKernel(); +} + +TEST(JITKernel, vaddbias) { + namespace jit = paddle::operators::jit; + TestAXYNKernel(); + TestAXYNKernel(); +} + +// XYNTuples +TEST(JITKernel, vrelu) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, videntity) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vexp) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vsigmoid) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vtanh) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +// LSTM +TEST(JITKernel, lstmctht) { + namespace jit = paddle::operators::jit; + TestLSTMKernel(); + TestLSTMKernel(); +} + +TEST(JITKernel, lstmc1h1) { + namespace jit = paddle::operators::jit; + TestLSTMKernel(); + TestLSTMKernel(); +} + +// GRU TEST(JITKernel, gruh1) { namespace jit = paddle::operators::jit; TestGRUKernel(); -- GitLab From 4a4ccac1d060ccf5758b7ff0d32dfb90ab3c5b7f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 14 Dec 2018 15:53:13 +0800 Subject: [PATCH 0234/2367] update by comment test=develop --- .../framework/details/all_reduce_op_handle.cc | 14 ++++++-------- .../framework/details/multi_devices_graph_pass.cc | 4 ++-- paddle/fluid/framework/details/op_handle_base.cc | 1 + .../details/threaded_ssa_graph_executor.cc | 1 + paddle/fluid/framework/parallel_executor.cc | 14 ++++++++++---- paddle/fluid/framework/threadpool.h | 1 + .../reader/create_double_buffer_reader_op.cc | 1 + paddle/fluid/platform/nccl_helper.h | 5 +---- .../unittests/test_parallel_executor_mnist.py | 2 +- 9 files changed, 24 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 6b7bbf9003a..5a4f218077d 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -107,22 +107,20 @@ void AllReduceOpHandle::RunImpl() { PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, comm, stream)); - if (!nccl_ctxs_->need_group_call_) cudaStreamSynchronize(stream); + // TODO(Yancey1989): synchronize here can get better performance + // if don't use NCCL group call, but need more profileing. + if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream); }); } this->RunAndRecordEvent([&] { - // TODO(Yancey1989): need allreduce operator to avoid this flag - if (nccl_ctxs_->need_group_call_) { + if (all_reduce_calls.size() == 1UL) { + all_reduce_calls[0](); + } else { platform::NCCLGroupGuard guard; for (auto &call : all_reduce_calls) { call(); } - } else { - // only used in executor_type == ParallalGraph, one thread one GPU - // TODO(Yancey1989): use allreduce operator to avoid this tricky. 
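  // Note (based on nccl_helper.h, not part of this patch):
  // platform::NCCLGroupGuard brackets the calls with ncclGroupStart() and
  // ncclGroupEnd(), which NCCL needs when one thread issues collectives for
  // several devices; with exactly one call, as in ParallelGraph mode (one
  // thread per GPU), the guard can be skipped -- that is what the
  // all_reduce_calls.size() == 1UL fast path above exploits.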
-      PADDLE_ENFORCE(all_reduce_calls.size() == 1UL);
-      all_reduce_calls[0]();
     }
   });
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 6e8cf86fcc9..5b82805ad93 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -386,8 +386,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
       CreateComputationalOps(&result, node, places_.size());
     }

-// insert synchronous ops at the backpropagation; and
-// insert synchronous ops if the graph contains mutilple places.
+// insert collective ops at the backpropagation; and
+// insert collective ops if the graph contains multiple places.
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       if (!is_forwarding &&
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 4914e0a5ad3..4822627ac3b 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -52,6 +52,7 @@ void OpHandleBase::Run(bool use_cuda) {
 #else
   PADDLE_ENFORCE(!use_cuda);
 #endif
+
   RunImpl();
 }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index cebf63364da..677a2937945 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -216,6 +216,7 @@ void ThreadedSSAGraphExecutor::RunOp(
     if (LIKELY(!strategy_.dry_run_)) {
       op->Run(strategy_.use_cuda_);
     }
+    VLOG(10) << op << " " << op->Name() << " Done ";
     running_ops_--;
     ready_var_q->Extend(op->Outputs());
     VLOG(10) << op << " " << op->Name() << "Signal posted";
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 2604e41045b..63f3ef0eacc 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -231,7 +231,6 @@ ParallelExecutor::ParallelExecutor(
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
   ncclUniqueId *nccl_id = nullptr;
-  bool need_group_call = true;
   if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
     // parallel graph mode should initialize nccl by ncclCommInitRank since
     // it call nccl operator per device per thread.
@@ -243,17 +242,16 @@
     } else {
       nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
     }
-    need_group_call = false;
   } else if (nccl_id_var != nullptr) {  // the other executor type.
     // the distributed training with nccl mode would initialize the nccl id in
     // startup_program.
     nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
   } else {
-    // initlize NCCL by ncclCommInitAll, do not need nccl_id.
+    // initialize NCCL by ncclCommInitAll, do not need to initialize the nccl_id.
   }
   member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
-      member_->places_, nccl_id, num_trainers, trainer_id, need_group_call));
+      member_->places_, nccl_id, num_trainers, trainer_id));
 #else
   PADDLE_THROW("Not compiled with CUDA");
 #endif
@@ -288,6 +286,14 @@
   graphs.push_back(std::move(graph));
 #endif

+  auto max_memory_size = GetEagerDeletionThreshold();
+  // TODO(Yancey1989): fix gc failed on ParallelGraph executor.
+  if (max_memory_size >= 0 &&
+      exec_strategy.type_ != ExecutionStrategy::kParallelGraph) {
+    graphs[0] = member_->PrepareGCAndRefCnts(
+        std::move(graphs[0]), static_cast<size_t>(max_memory_size));
+  }
+
   // Step 3. Create vars in each scope. Passes may also create new vars.
   // skip control vars and empty vars
   std::vector<details::VariableInfo> var_infos;

diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
index 8fd834be9ac..7a51d18fbbf 100644
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -27,6 +27,7 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
+
 struct ExceptionHandler {
   mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
   explicit ExceptionHandler(

diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 440b16cf915..ed719f91d09 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -46,6 +46,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
       sin >> num;
       place = platform::CUDAPlace(static_cast<int>(num));
     }
+
     out->Reset(framework::MakeDecoratedReader<BufferedReader>(
         underlying_reader, place, 2));
   }

diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 23a0222239a..8d062dcdb47 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -82,15 +82,12 @@ struct NCCLContext {
 struct NCCLContextMap {
   std::unordered_map<int, NCCLContext> contexts_;
   std::vector<int> order_;
-  bool need_group_call_;

   explicit NCCLContextMap(const std::vector<platform::Place> &places,
                           ncclUniqueId *nccl_id = nullptr,
-                          size_t num_trainers = 1, size_t trainer_id = 0,
-                          bool need_group_call = true) {
+                          size_t num_trainers = 1, size_t trainer_id = 0) {
     PADDLE_ENFORCE(!places.empty());
     order_.reserve(places.size());
-    need_group_call_ = need_group_call;
     for (auto &p : places) {
       int dev_id = boost::get<platform::CUDAPlace>(p).device;
       order_.emplace_back(dev_id);

diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 0ff079b4e2c..fffe8bee580 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -123,7 +123,7 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_simple_fc_convergence(False)

     def test_simple_fc_with_new_strategy(self):
-        # use_cuda, use_reducea
+        # use_cuda, use_reduce
         self._compare_reduce_and_allreduce(simple_fc_net, True)
         self._compare_reduce_and_allreduce(simple_fc_net, False)

--
GitLab


From e90b2f104cbf4277e3cc55171e715e91f2512251 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 14 Dec 2018 16:07:00 +0800
Subject: [PATCH 0235/2367] In most cases, const_cast is bad: it breaks the
 interface contract, makes the code unreadable, and makes the program
 unstable. test=develop

---
 paddle/fluid/operators/cudnn_lstm_op.cu.cc |  2 ++
 paddle/scripts/paddle_build.sh             | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index dd64cc327fc..f2ba75485c5 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -300,9 +300,11 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     }
     CudnnRNNCache *cudnn_rnn_cache = nullptr;
     if (cache_var->IsInitialized()) {
+      // const_cast is usually bad.
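// Why the comment warns about const_cast: a tiny self-contained example,
// unrelated to the LSTM kernel below. Writing through a const_cast is only
// defined when the referenced object was not itself declared const.
#include <iostream>

void Bump(const int* p) {
  ++*const_cast<int*>(p);  // OK only if *p is genuinely mutable
}

int main() {
  int mutable_obj = 1;
  Bump(&mutable_obj);                // fine: the object is non-const
  std::cout << mutable_obj << "\n";  // prints 2

  const int immutable_obj = 1;
  // Bump(&immutable_obj);           // compiles, but modifying a const
                                     // object is undefined behavior
  (void)immutable_obj;
  return 0;
}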
      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
                            ->GetMutable<CudnnRNNCache>();
     } else {
+      // const_cast is usually bad.
       cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
                             ->GetMutable<CudnnRNNCache>();
       std::random_device rnd;

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 6299b166af8..a1c1886c7f8 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -517,6 +517,18 @@ function assert_api_spec_approvals() {
           fi
       fi
   done
+
+  HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast`
+  if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
+    APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+    python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
+    echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
+    if [ "${APPROVALS}" == "FALSE" ]; then
+      echo "You must have at least 2 approvals for the const_cast"
+      exit 1
+    fi
+  fi
+
 }

--
GitLab


From cf5264629f914724f91e0a364adca4728b8dcc96 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Fri, 14 Dec 2018 15:20:07 +0800
Subject: [PATCH 0236/2367] update API.spec test=develop

---
 paddle/fluid/API.spec | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 8e6482ca981..cfa28948e9e 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -365,7 +365,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))

--
GitLab


From 77907a35028053f68f95467650ac7878e58cc5aa Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 14 Dec 2018 07:43:39 +0000
Subject: [PATCH 0237/2367] refine benchmark template

---
 paddle/fluid/operators/jit/benchmark.cc | 389 +++++-------------------
 paddle/fluid/operators/jit/helper.h     |  14 +
 2 files changed, 85 insertions(+), 318 deletions(-)

diff --git
a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index ca636b020c2..4e5d530251e 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -51,251 +51,108 @@ std::vector TestSizes() { return s; } -// return this function avg time -template -double BenchXYZNFunc(const typename KernelTuples::func_type tgt, - const std::vector& x, const std::vector& y, - std::vector& z) { // NOLINT - const T* x_data = x.data(); - const T* y_data = y.data(); - const int d = z.size(); - T* z_data = z.data(); - - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(x_data, y_data, z_data, d); +template +struct BenchFunc { + // return this function avg time + double operator()(const typename KernelTuples::func_type tgt, Args... args) { + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(args...); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(args...); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; + } +}; + +namespace jit = paddle::operators::jit; + +template +void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { + BenchFunc benchmark; + std::vector> infos; + // test refer + auto refer = jit::GetRefer(); + if (!refer) { + LOG(FATAL) << "Refer can not be empty!"; + } + infos.push_back(std::make_pair("Refer", benchmark(refer, args...))); + + // test jitcode + auto jitcode = jit::GetJitCode(attr); + if (jitcode) { + infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...))); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast*>(impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + infos.push_back(std::make_pair("More", benchmark(more, args...))); + } + } } - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(x_data, y_data, z_data, d); + // Test result from Get function + auto tgt = jit::Get(attr); + if (!tgt) { + LOG(FATAL) << "Target can not be empty!"; } - auto end = GetCurrentUS(); - return (end - start) / FLAGS_repeat; + infos.push_back(std::make_pair("Target", benchmark(tgt, args...))); + + // print + std::ostringstream loginfos; + loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); } template void BenchXYZNKernel() { - namespace jit = paddle::operators::jit; for (int d : TestSizes()) { - std::vector> infos; std::vector x(d), y(d), z(d); RandomVec(d, x.data()); RandomVec(d, y.data()); - // refer - auto refer = jit::GetRefer>(); - if (refer) { - auto res = BenchXYZNFunc>(refer, x, y, z); - infos.push_back(std::make_pair("Refer", res)); - } - - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - auto res = BenchXYZNFunc>(jitcode, x, y, z); - infos.push_back(std::make_pair("JitCode", res)); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - auto res = BenchXYZNFunc>(more, x, y, z); - 
infos.push_back(std::make_pair("More", res)); - } - } - } - - // Test result from Get function - auto tgt = jit::Get, PlaceType>(d); - if (!tgt) { - LOG(ERROR) << "Target can not be empty!"; - } - auto res = BenchXYZNFunc>(tgt, x, y, z); - infos.push_back(std::make_pair("Target", res)); - - // print - std::ostringstream loginfos; - loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); + BenchAllImpls, PlaceType>(d, x.data(), y.data(), + z.data(), d); } } -// return this function avg time -template -double BenchAXYNFunc(const typename KernelTuples::func_type tgt, const T a, - const std::vector& x, - std::vector& y) { // NOLINT - const T* x_data = x.data(); - T* y_data = y.data(); - const int d = y.size(); - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(&a, x_data, y_data, d); - } - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(&a, x_data, y_data, d); - } - auto end = GetCurrentUS(); - return (end - start) / FLAGS_repeat; -} - template void BenchAXYNKernel() { - namespace jit = paddle::operators::jit; for (int d : TestSizes()) { - std::vector> infos; const T a = static_cast(3); std::vector x(d), y(d); RandomVec(d, x.data()); - // test refer - auto refer = jit::GetRefer>(); - if (refer) { - auto res = BenchAXYNFunc>(refer, a, x, y); - infos.push_back(std::make_pair("Refer", res)); - } - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - auto res = BenchAXYNFunc>(jitcode, a, x, y); - infos.push_back(std::make_pair("JitCode", res)); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - auto res = BenchAXYNFunc>(more, a, x, y); - infos.push_back(std::make_pair("More", res)); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(d); - if (!tgt) { - LOG(ERROR) << "Target can not be empty!"; - } - auto res = BenchAXYNFunc>(tgt, a, x, y); - infos.push_back(std::make_pair("Target", res)); - // print - std::ostringstream loginfos; - loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); - } -} - -// return this function avg time -template -double BenchXYNFunc(const typename KernelTuples::func_type tgt, - const std::vector& x, - std::vector& y) { // NOLINT - const T* x_data = x.data(); - T* y_data = y.data(); - const int d = y.size(); - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(x_data, y_data, d); + BenchAllImpls, PlaceType>(d, &a, x.data(), y.data(), + d); } - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(x_data, y_data, d); - } - auto end = GetCurrentUS(); - return (end - start) / FLAGS_repeat; } template void BenchXYNKernel() { - namespace jit = paddle::operators::jit; for (int d : TestSizes()) { - std::vector> infos; std::vector x(d), y(d); RandomVec(d, x.data()); - // test refer - auto refer = jit::GetRefer>(); - if (refer) { - auto res = BenchXYNFunc>(refer, x, y); - infos.push_back(std::make_pair("Refer", res)); - } - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - 
if (jitcode) { - auto res = BenchXYNFunc>(jitcode, x, y); - infos.push_back(std::make_pair("JitCode", res)); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = - dynamic_cast>*>(impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - auto res = BenchXYNFunc>(more, x, y); - infos.push_back(std::make_pair("More", res)); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(d); - if (!tgt) { - LOG(ERROR) << "Target can not be empty!"; - } - auto res = BenchXYNFunc>(tgt, x, y); - infos.push_back(std::make_pair("Target", res)); - // print - std::ostringstream loginfos; - loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); + BenchAllImpls, PlaceType>(d, x.data(), y.data(), d); } } -// return this function avg time -template -double BenchLSTMFunc(const typename KernelTuples::func_type tgt, - const paddle::operators::jit::lstm_attr_t* attr, - paddle::operators::jit::lstm_t* step) { - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(step, attr); - } - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(step, attr); - } - auto end = GetCurrentUS(); - return (end - start) / FLAGS_repeat; -} - template void BenchLSTMKernel() { - namespace jit = paddle::operators::jit; for (bool use_peephole : {true, false}) { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::vsigmoid, jit::vtanh, jit::vtanh, use_peephole); - std::vector> infos; std::vector x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); RandomVec(4 * d, x.data(), -2.f, 2.f); RandomVec(3 * d, wp.data(), -2.f, 2.f); @@ -315,77 +172,15 @@ void BenchLSTMKernel() { step.wp = wp_data; step.checked = checked_data; } - - // test refer - auto refer = jit::GetRefer>(); - if (refer) { - auto res = BenchLSTMFunc>(refer, &attr, &step); - infos.push_back(std::make_pair("Refer", res)); - } - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(attr); - if (jitcode) { - auto res = BenchLSTMFunc>(jitcode, &attr, &step); - infos.push_back(std::make_pair("JitCode", res)); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - auto res = BenchLSTMFunc>(more, &attr, &step); - infos.push_back(std::make_pair("More", res)); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(attr); - if (!tgt) { - LOG(ERROR) << "Target can not be empty!"; - } - auto res = BenchLSTMFunc>(tgt, &attr, &step); - infos.push_back(std::make_pair("Target", res)); - // print - std::ostringstream loginfos; - loginfos << "Kernel Type: " << jit::to_string(KT) - << ", Sigmoid,Tanh,Tanh, " << (use_peephole ? 
"Peephole_" : "") - << " size " << d << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); + BenchAllImpls, PlaceType>(attr, &step, &attr); } } } -// return this function avg time -template -double BenchGRUFunc(const typename KernelTuples::func_type tgt, - const paddle::operators::jit::gru_attr_t* attr, - paddle::operators::jit::gru_t* step) { - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(step, attr); - } - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(step, attr); - } - auto end = GetCurrentUS(); - return (end - start) / FLAGS_repeat; -} - template void BenchGRUKernel() { - namespace jit = paddle::operators::jit; for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::vsigmoid, jit::vtanh); - std::vector> infos; std::vector x(3 * d), ht_1(d), ht(d); RandomVec(3 * d, x.data(), -2.f, 2.f); RandomVec(d, ht_1.data(), -2.f, 2.f); @@ -396,50 +191,7 @@ void BenchGRUKernel() { step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_data; - - // test refer - auto refer = jit::GetRefer>(); - if (refer) { - auto res = BenchGRUFunc>(refer, &attr, &step); - infos.push_back(std::make_pair("Refer", res)); - } - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(attr); - if (jitcode) { - auto res = BenchGRUFunc>(jitcode, &attr, &step); - infos.push_back(std::make_pair("JitCode", res)); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = - dynamic_cast>*>(impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - auto res = BenchGRUFunc>(more, &attr, &step); - infos.push_back(std::make_pair("More", res)); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(attr); - if (!tgt) { - LOG(ERROR) << "Target can not be empty!"; - } - auto res = BenchGRUFunc>(tgt, &attr, &step); - infos.push_back(std::make_pair("Target", res)); - // print - std::ostringstream loginfos; - loginfos << "Kernel Type: " << jit::to_string(KT) << ", Sigmoid,Tanh, size " - << d << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); + BenchAllImpls, PlaceType>(attr, &step, &attr); } } @@ -456,16 +208,17 @@ int main(int argc, char* argv[]) { << " times."; using T = float; using PlaceType = paddle::platform::CPUPlace; - namespace jit = paddle::operators::jit; + // xyzn BenchXYZNKernel(); BenchXYZNKernel(); BenchXYZNKernel(); BenchXYZNKernel(); + // axyn BenchAXYNKernel(); BenchAXYNKernel(); - // act + // xyn BenchXYNKernel(); BenchXYNKernel(); BenchXYNKernel(); diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 302e70caa7e..3431c22111f 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include "paddle/fluid/operators/jit/gen_base.h" @@ -124,6 +125,19 @@ const char* to_string(KernelType kt); KernelType to_kerneltype(const std::string& act); +inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) { + os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) + << "],act_cand[" << to_string(attr.act_cand) << "],act_cell[" + << to_string(attr.act_cell) << "],use_peephole[" + << (attr.use_peephole ? 
"True" : "False") << "]"; + return os; +} +inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { + os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) + << "],act_cand[" << to_string(attr.act_cand) << "]"; + return os; +} + } // namespace jit } // namespace operators } // namespace paddle -- GitLab From 723f68727db273902674e6046ead5f0ebdb78bf4 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 14 Dec 2018 17:00:48 +0800 Subject: [PATCH 0238/2367] add ut about nce in transpiler --- .../fluid/tests/unittests/test_dist_transpiler.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 650a745cdc4..8abd7d9e0cf 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -870,9 +870,21 @@ class TestRemoteNce(TestDistLookupTableBase): def transpiler_test_impl(self): trainer, _ = self.get_trainer() + + out_vars = ["nce_w.block0", "nce_w.block1"] + in_vars = ["nce_b.block0", "nce_b.block1"] + + recv_var_names = [] + for op in trainer.blocks[0].ops: if op.type == "recv": - pass + for var in op.output("Out"): + recv_var_names.append(var) + + for out_var in out_vars: + self.assertFalse(out_var in recv_var_names) + for in_var in in_vars: + self.assertTrue(in_var in recv_var_names) if __name__ == "__main__": -- GitLab From ae17926987f1dcb2a8e75925047a542c49b589cf Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Dec 2018 09:22:28 +0000 Subject: [PATCH 0239/2367] enable jitkernel mkl vmul, vadd and vscal --- paddle/fluid/operators/jit/README.md | 2 + .../operators/jit/more/mkl/CMakeLists.txt | 2 + paddle/fluid/operators/jit/more/mkl/mkl.cc | 68 ++++++++++++++++++- paddle/fluid/operators/jit/more/mkl/mkl.h | 31 ++++++--- .../fluid/operators/math/jit_kernel_blas.cc | 50 -------------- 5 files changed, 90 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index 6b2f2b2848e..28d21f40af3 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -45,6 +45,8 @@ PaddlePaddle/Paddle/paddle/fluid/ - 在`KernelType` 中添加 `your_key` . - 实现Reference 的逻辑,每个jitkernel的Reference 实现是必须的。不要依赖任何第三方库。并在`refer/CmakeLists.txt`中`USE_JITKERNEL_REFER(your_key)`. +- (optional) 实现更多的算法在`more`目录下,可以依赖mkl,openblas,或者mkldnn等第三方库。 +- (optional) 实现基于Xbyak的生成code,在`gen`目下。 - 必要时可以添加新的`KernelTuples`,可以参考`XYZNTuples`,新加的Attr类型需要特例化`JitCodeKey`方法。 - 添加unit test,需要测试float和double - 添加benchmark确保get得到的速度是最快。 diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 0c15c7060d2..ffecb732975 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -4,3 +4,5 @@ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE # use mkl kernels by name and type USE_JITKERNEL_MORE(vmul, mkl) +USE_JITKERNEL_MORE(vadd, mkl) +USE_JITKERNEL_MORE(vscal, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 0ffe1d565f1..3d963cbf1dd 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -13,7 +13,9 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/more/mkl/mkl.h" +#include "paddle/fluid/operators/jit/refer/refer.h" #include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/dynload/mklml.h" namespace paddle { @@ -32,6 +34,61 @@ void VMul(const double* x, const double* y, double* z, int n) { platform::dynload::vdMul(n, x, y, z); } +template <> +void VAdd(const float* x, const float* y, float* z, int n) { + platform::dynload::vsAdd(n, x, y, z); +} + +template <> +void VAdd(const double* x, const double* y, double* z, int n) { + platform::dynload::vdAdd(n, x, y, z); +} + +template <> +void VScal(const float* a, const float* x, float* y, int n) { + if (x == y) { + platform::dynload::cblas_sscal(n, *a, y, 1); + } else { + refer::VScal(a, x, y, n); + } +} + +template <> +void VScal(const double* a, const double* x, double* y, int n) { + if (x == y) { + platform::dynload::cblas_dscal(n, *a, y, 1); + } else { + refer::VScal(a, x, y, n); + } +} + +// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 +template <> +bool VMulKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +template <> +bool VAddKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +template <> +bool VScalKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +#define AWALYS_USE_ME_WITH_DOUBLE(func) \ + template <> \ + bool func##Kernel::UseMe(int d) const { \ + return true; \ + } + +AWALYS_USE_ME_WITH_DOUBLE(VMul); +AWALYS_USE_ME_WITH_DOUBLE(VAdd); +AWALYS_USE_ME_WITH_DOUBLE(VScal); + +#undef AWALYS_USE_ME_WITH_DOUBLE } // namespace mkl } // namespace more } // namespace jit @@ -40,5 +97,12 @@ void VMul(const double* x, const double* y, double* z, int n) { namespace mkl = paddle::operators::jit::more::mkl; -REGISTER_JITKERNEL_MORE(vmul, mkl, mkl::VMulKernel, - mkl::VMulKernel); +#define REGISTER_MKL_KERNEL(key, func) \ + REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel, \ + mkl::func##Kernel) + +REGISTER_MKL_KERNEL(vmul, VMul); +REGISTER_MKL_KERNEL(vadd, VAdd); +REGISTER_MKL_KERNEL(vscal, VScal); + +#undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 4173d1f3de0..84a93f408f5 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -16,7 +16,6 @@ #include #include "paddle/fluid/operators/jit/kernel_base.h" -#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -28,17 +27,27 @@ template void VMul(const T* x, const T* y, T* z, int n); template -class VMulKernel : public KernelImpl> { - public: - VMulKernel() { this->func = VMul; } - bool UseMe(int d) const override { - if (std::is_same::value) { - return platform::MayIUse(platform::avx512f) && d > 512; - } else { - return true; - } +void VAdd(const T* x, const T* y, T* z, int n); + +template +void VScal(const T* a, const T* x, T* y, int n); + +#define DECLARE_MKL_KERNEL(name, tuples) \ + template \ + class name##Kernel : public KernelImpl> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool UseMe(typename tuples::attr_type) const override; \ } -}; + +// XYZN +DECLARE_MKL_KERNEL(VMul, XYZNTuples); +DECLARE_MKL_KERNEL(VAdd, XYZNTuples); + +// AXYN +DECLARE_MKL_KERNEL(VScal, AXYNTuples); + +#undef DECLARE_MKL_KERNEL } // namespace mkl } // namespace more diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc 
b/paddle/fluid/operators/math/jit_kernel_blas.cc index 8cf588efba5..682e51e89d6 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -31,56 +31,6 @@ namespace operators { namespace math { namespace jitkernel { -#ifdef PADDLE_WITH_MKLML -template -void VMulMKL(const T* x, const T* y, T* z, int n); - -template <> -void VMulMKL(const float* x, const float* y, float* z, int n) { - platform::dynload::vsMul(n, x, y, z); -} - -template <> -void VMulMKL(const double* x, const double* y, double* z, int n) { - platform::dynload::vdMul(n, x, y, z); -} - -template -void VAddMKL(const T* x, const T* y, T* z, int n); - -template <> -void VAddMKL(const float* x, const float* y, float* z, int n) { - platform::dynload::vsAdd(n, x, y, z); -} - -template <> -void VAddMKL(const double* x, const double* y, double* z, int n) { - platform::dynload::vdAdd(n, x, y, z); -} - -template -void VScalMKL(const T* a, const T* x, T* y, int n); - -template <> -void VScalMKL(const float* a, const float* x, float* y, int n) { - if (x == y) { - platform::dynload::cblas_sscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); - } -} - -template <> -void VScalMKL(const double* a, const double* x, double* y, int n) { - if (x == y) { - platform::dynload::cblas_dscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); - } -} - -#endif - /* VMUL JitKernel */ template class VMulKernelImpl : public VMulKernel { -- GitLab From 5fff20c21a1199f560a4d014b5a81decf3fce9e9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 17:29:00 +0800 Subject: [PATCH 0240/2367] Change name to huber loss test=develop --- paddle/fluid/API.spec | 2 +- python/paddle/fluid/layers/nn.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c152a506d9e..3ecc7af238e 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -199,7 +199,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.huber_regression_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 28cb700f82c..9922b3370d2 100644 --- a/python/paddle/fluid/layers/nn.py +++ 
b/python/paddle/fluid/layers/nn.py @@ -174,7 +174,7 @@ __all__ = [ 'get_tensor_from_selected_rows', 'lstm', 'psroi_pool', - 'huber_regression_loss', + 'huber_loss', ] kIgnoreIndex = -100 @@ -9180,7 +9180,7 @@ def psroi_pool(input, return out -def huber_regression_loss(input, label, delta): +def huber_loss(input, label, delta): """ Huber regression loss is a loss function used in robust regression. Huber regression loss can evaluate the fitness of input to label. @@ -9212,9 +9212,9 @@ def huber_regression_loss(input, label, delta): .. code-block:: python predictions = fluid.layers.softmax(x) - loss = fluid.layers.huber_regression_loss(input=predictions, label=label, 1.0) + loss = fluid.layers.huber_loss(input=predictions, label=label, 1.0) """ - helper = LayerHelper('huber_regression_loss', **locals()) + helper = LayerHelper('huber_loss', **locals()) residual = helper.create_variable_for_type_inference( dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) -- GitLab From ea2a34ee957ded9d596c88360b929b273a39ceec Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 17:31:53 +0800 Subject: [PATCH 0241/2367] Polish doc test=develop --- python/paddle/fluid/layers/nn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9922b3370d2..776fd0104ab 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9182,19 +9182,19 @@ def psroi_pool(input, def huber_loss(input, label, delta): """ - Huber regression loss is a loss function used in robust regression. - Huber regression loss can evaluate the fitness of input to label. - Different from MSE loss, Huber regression loss is more robust for outliers. + Huber loss is a loss function used in robust. + Huber loss can evaluate the fitness of input to label. + Different from MSE loss, Huber loss is more robust for outliers. When the difference between input and label is large than delta .. math:: - huber\_regression\_loss = delta * (label - input) - 0.5 * delta * delta + huber\_loss = delta * (label - input) - 0.5 * delta * delta When the difference between input and label is less than delta .. math:: - huber\_regression\_loss = 0.5 * (label - input) * (label - input) + huber\_loss = 0.5 * (label - input) * (label - input) Args: @@ -9202,11 +9202,11 @@ def huber_loss(input, label, delta): The first dimension is batch size, and the last dimension is 1. label (Variable): The groud truth whose first dimension is batch size and last dimension is 1. - delta (float): The parameter of huber regression loss, which controls + delta (float): The parameter of huber loss, which controls the range of outliers Returns: - huber\_regression\_loss (Variable): The huber regression loss with shape [batch_size, 1]. + huber\_loss (Variable): The huber loss with shape [batch_size, 1]. Examples: .. 
code-block:: python

             predictions = fluid.layers.softmax(x)
-            loss = fluid.layers.huber_regression_loss(input=predictions, label=label, 1.0)
+            loss = fluid.layers.huber_loss(input=predictions, label=label, delta=1.0)
     """
-    helper = LayerHelper('huber_regression_loss', **locals())
+    helper = LayerHelper('huber_loss', **locals())
     residual = helper.create_variable_for_type_inference(
         dtype=helper.input_dtype())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())

--
GitLab


From ea2a34ee957ded9d596c88360b929b273a39ceec Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Fri, 14 Dec 2018 17:31:53 +0800
Subject: [PATCH 0241/2367] Polish doc test=develop

---
 python/paddle/fluid/layers/nn.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 9922b3370d2..776fd0104ab 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -9182,19 +9182,19 @@ def psroi_pool(input,

 def huber_loss(input, label, delta):
     """
-    Huber regression loss is a loss function used in robust regression.
-    Huber regression loss can evaluate the fitness of input to label.
-    Different from MSE loss, Huber regression loss is more robust for outliers.
+    Huber loss is a loss function used in robust regression.
+    Huber loss can evaluate the fitness of input to label.
+    Different from MSE loss, Huber loss is more robust for outliers.

     When the difference between input and label is larger than delta

     .. math::

-        huber\_regression\_loss = delta * (label - input) - 0.5 * delta * delta
+        huber\_loss = delta * (label - input) - 0.5 * delta * delta

     When the difference between input and label is less than delta

     .. math::

-        huber\_regression\_loss = 0.5 * (label - input) * (label - input)
+        huber\_loss = 0.5 * (label - input) * (label - input)

     Args:
@@ -9202,11 +9202,11 @@ def huber_loss(input, label, delta):
                           The first dimension is batch size, and the last dimension is 1.
         label (Variable): The ground truth whose first dimension is batch size
                           and last dimension is 1.
-        delta (float): The parameter of huber regression loss, which controls
+        delta (float): The parameter of huber loss, which controls
                        the range of outliers

     Returns:
-        huber\_regression\_loss (Variable): The huber regression loss with shape [batch_size, 1].
+        huber\_loss (Variable): The huber loss with shape [batch_size, 1].

     Examples:
         .. code-block:: python

--
GitLab


From 5fea8cd47809f56ef232dd187f480c251898c762 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Fri, 14 Dec 2018 18:26:33 +0800
Subject: [PATCH 0242/2367] Add sorted_result parameter to SelectedRows
 Functor test=develop

---
 .../operators/math/selected_rows_functor.cc        | 17 ++++++++++-------
 .../operators/math/selected_rows_functor.cu        |  3 ++-
 .../operators/math/selected_rows_functor.h         | 16 ++++++----------
 paddle/fluid/operators/optimizers/adam_op.h        |  8 ++++++--
 4 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 0c2e6d40241..1a11b584e2b 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -253,23 +253,26 @@ elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
 template <typename T>
 struct MergeAdd<platform::CPUDeviceContext, T> {
   framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
-                                     const framework::SelectedRows& input) {
+                                     const framework::SelectedRows& input,
+                                     const bool sorted_result = false) {
     framework::SelectedRows out;
-    (*this)(context, input, &out);
+    (*this)(context, input, &out, sorted_result);
     return out;
   }

   void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& input,
-                  framework::SelectedRows* output) {
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
     std::vector<const framework::SelectedRows*> inputs;
     inputs.push_back(&input);
-    (*this)(context, inputs, output);
+    (*this)(context, inputs, output, sorted_result);
   }

   void operator()(const platform::CPUDeviceContext& context,
                   const std::vector<const framework::SelectedRows*>& inputs,
-                  framework::SelectedRows* output) {
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
     if (inputs.size() == 0) {
       VLOG(3) << "no input! return";
       return;
@@ -302,8 +305,8 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
     }
     std::vector<int64_t> merge_rows(merged_row_set.begin(),
                                     merged_row_set.end());
-    if (sorted_result_) {
-      std::sort(merge_rows);
+    if (sorted_result) {
+      std::sort(merge_rows.begin(), merge_rows.end());
     }
     std::unordered_map<int64_t, size_t> rows_to_id;
     for (size_t i = 0; i < merge_rows.size(); ++i) {

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index c4fccdbf862..b87c9461e88 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -266,7 +266,8 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
 template <typename T>
 struct MergeAdd<platform::CUDADeviceContext, T> {
   framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
-                                     const framework::SelectedRows& input) {
+                                     const framework::SelectedRows& input,
+                                     const bool sorted_result = false) {
     framework::SelectedRows out;
     (*this)(context, input, &out);
     return out;

diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
index b7b19f130e5..222d761ef91 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -78,23 +78,19 @@ namespace scatter {
 // functors for manipulating SelectedRows data
 template <typename DeviceContext, typename T>
 struct MergeAdd {
-  MergeAdd() : sorted_result_(false) {}
-
-  explicit MergeAdd(bool sorted_result) : sorted_result_(sorted_result) {}
-
   // unary functor, merge by adding duplicated rows in
   // the input SelectedRows object.
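// A conceptual, standalone sketch of what the MergeAdd functor declared just
// below computes (not the Paddle implementation): duplicated row ids are
// collapsed by accumulating their values, and sorted_result additionally
// orders the merged row ids ascendingly, which the sparse Adam kernel in
// this same patch relies on.
#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

std::pair<std::vector<int64_t>, std::vector<float>> MergeRowsByAdd(
    const std::vector<int64_t>& rows, const std::vector<float>& values,
    int64_t row_width, bool sorted_result) {
  std::unordered_map<int64_t, std::vector<float>> merged;
  for (size_t i = 0; i < rows.size(); ++i) {
    auto& acc = merged[rows[i]];
    acc.resize(row_width, 0.f);
    for (int64_t j = 0; j < row_width; ++j) {
      acc[j] += values[i * row_width + j];
    }
  }
  std::vector<int64_t> out_rows;
  out_rows.reserve(merged.size());
  for (auto& kv : merged) out_rows.push_back(kv.first);
  if (sorted_result) {
    std::sort(out_rows.begin(), out_rows.end());
  }
  std::vector<float> out_values;
  for (int64_t r : out_rows) {
    auto& acc = merged[r];
    out_values.insert(out_values.end(), acc.begin(), acc.end());
  }
  return {out_rows, out_values};
}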
framework::SelectedRows operator()(const DeviceContext& context, - const framework::SelectedRows& input); + const framework::SelectedRows& input, + const bool sorted_result = false); void operator()(const DeviceContext& context, const framework::SelectedRows& input, - framework::SelectedRows* output); + framework::SelectedRows* output, + const bool sorted_result = false); void operator()(const DeviceContext& context, const std::vector& inputs, - framework::SelectedRows* output); - - private: - bool sorted_result_; + framework::SelectedRows* output, + const bool sorted_result = false); }; enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index c2bf7040d77..c9e27b75472 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -157,6 +157,9 @@ struct AdamFunctor { } }; +template +struct SparseAdamFunctor; + template struct SparseAdamFunctor { T beta1_; @@ -283,6 +286,7 @@ struct SparseAdamFunctor { // Calculation if (i == *(rows_ + j)) { + T g = grad_[j * row_numel_]; mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; ++j; @@ -388,12 +392,12 @@ class AdamOpKernel : public framework::OpKernel { } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor - scatter::MergeAdd merge_func(true); + scatter::MergeAdd merge_func; auto* grad_merge_var = const_cast(ctx.scope()) .Var() ->GetMutable(); merge_func(ctx.template device_context(), grad, - grad_merge_var); + grad_merge_var, true); grad_merge_ptr = grad_merge_var; } -- GitLab From 67b555d3d3c98d571dffe5b2b8e1c0bae59bd80d Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Fri, 14 Dec 2018 11:31:00 +0100 Subject: [PATCH 0243/2367] Enable ngraph tests for a ngraph engine (#14800) * Enable ngraph tests for a ngraph engine test=develop * Move the test structure to other place test=develop * Add USE_NGRAPH flag, simple structure test=develop --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 10 +++++++--- .../fluid/tests/unittests/ngraph/CMakeLists.txt | 6 ++++++ .../paddle/fluid/tests/unittests/ngraph/__init__.py | 13 +++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/ngraph/__init__.py diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a4089ba3ca0..6d6fe245d8a 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -63,9 +63,9 @@ function(py_test_modules TARGET_NAME) set(multiValueArgs MODULES DEPS ENVS) cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} - ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} + ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (py_test_modules_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() @@ -111,3 +111,7 @@ 
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executo if(NOT APPLE) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) endif() + +if (WITH_NGRAPH) + add_subdirectory(ngraph) +endif() diff --git a/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt new file mode 100644 index 00000000000..5ed2d0aa80c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS FLAGS_use_ngraph=true) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/ngraph/__init__.py b/python/paddle/fluid/tests/unittests/ngraph/__init__.py new file mode 100644 index 00000000000..b94a21a7e40 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -- GitLab From 5e97be7ba774c5b109c6674ba41d8ba1bfd18229 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Dec 2018 10:10:41 +0000 Subject: [PATCH 0244/2367] enable jitkernel mkl vexp, vsigmoid and vtanh --- paddle/fluid/operators/jit/gen/jitcode.h | 4 -- paddle/fluid/operators/jit/helper.h | 4 -- paddle/fluid/operators/jit/kernel_base.h | 1 + paddle/fluid/operators/jit/macro.h | 32 +++++++++++++++ .../operators/jit/more/mkl/CMakeLists.txt | 3 ++ paddle/fluid/operators/jit/more/mkl/mkl.cc | 31 ++++++++++++++ paddle/fluid/operators/jit/more/mkl/mkl.h | 33 +++++++++++++++ paddle/fluid/operators/jit/test.cc | 2 +- paddle/fluid/operators/math/jit_kernel_exp.cc | 41 ------------------- 9 files changed, 101 insertions(+), 50 deletions(-) create mode 100644 paddle/fluid/operators/jit/macro.h diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 765952fc352..64126e3f61a 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -56,10 +56,6 @@ typedef enum { identity } operand_type; -#define XMM_FLOAT_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define ZMM_FLOAT_BLOCK 16 - #define DECLARE_JIT_CODE(codename) \ const char* name() const override { return #codename; } diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 3431c22111f..44952fb9079 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -27,10 +27,6 @@ namespace paddle { namespace operators { namespace jit { -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 - template inline typename std::enable_if< std::is_same::value && diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 00d583c60bf..f10d9f3fdd6 100644 --- 
a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -13,6 +13,7 @@ * limitations under the License. */ #pragma once +#include "paddle/fluid/operators/jit/macro.h" #include "paddle/fluid/platform/macros.h" namespace paddle { diff --git a/paddle/fluid/operators/jit/macro.h b/paddle/fluid/operators/jit/macro.h new file mode 100644 index 00000000000..b2622eba8b7 --- /dev/null +++ b/paddle/fluid/operators/jit/macro.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include + +namespace paddle { +namespace operators { +namespace jit { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 + +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index ffecb732975..3ecb520392e 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -6,3 +6,6 @@ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE USE_JITKERNEL_MORE(vmul, mkl) USE_JITKERNEL_MORE(vadd, mkl) USE_JITKERNEL_MORE(vscal, mkl) +USE_JITKERNEL_MORE(vexp, mkl) +USE_JITKERNEL_MORE(vsigmoid, mkl) +USE_JITKERNEL_MORE(vtanh, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 3d963cbf1dd..42f6df576b1 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -62,6 +62,16 @@ void VScal(const double* a, const double* x, double* y, int n) { } } +template <> +void VExp(const float* x, float* y, int n) { + platform::dynload::vsExp(n, x, y); +} + +template <> +void VExp(const double* x, double* y, int n) { + platform::dynload::vdExp(n, x, y); +} + // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> bool VMulKernel::UseMe(int d) const { @@ -78,6 +88,21 @@ bool VScalKernel::UseMe(int d) const { return platform::MayIUse(platform::avx512f) && d > 512; } +template <> +bool VExpKernel::UseMe(int d) const { + return d > 7; +} + +template <> +bool VSigmoidKernel::UseMe(int d) const { + return d > 7; +} + +template <> +bool VTanhKernel::UseMe(int d) const { + return d > 7; +} + #define AWALYS_USE_ME_WITH_DOUBLE(func) \ template <> \ bool func##Kernel::UseMe(int d) const { \ @@ -87,6 +112,9 @@ bool VScalKernel::UseMe(int d) const { AWALYS_USE_ME_WITH_DOUBLE(VMul); AWALYS_USE_ME_WITH_DOUBLE(VAdd); AWALYS_USE_ME_WITH_DOUBLE(VScal); +AWALYS_USE_ME_WITH_DOUBLE(VExp); +AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); +AWALYS_USE_ME_WITH_DOUBLE(VTanh); #undef AWALYS_USE_ME_WITH_DOUBLE } // namespace mkl @@ -104,5 +132,8 @@ namespace mkl = paddle::operators::jit::more::mkl; REGISTER_MKL_KERNEL(vmul, VMul); REGISTER_MKL_KERNEL(vadd, VAdd); 
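// The vsigmoid and vtanh kernels registered below are composed from VExp.
// A scalar sketch of that composition: the input is clipped to the
// SIGMOID_THRESHOLD_MIN/MAX range defined in macro.h before exp, and tanh
// reuses sigmoid via the identity tanh(x) = 2 * sigmoid(2x) - 1.
#include <algorithm>
#include <cmath>

float SigmoidClipped(float x) {
  const float kMin = -40.f, kMax = 13.f;  // SIGMOID_THRESHOLD_MIN / _MAX
  x = std::min(std::max(x, kMin), kMax);
  return 1.f / (1.f + std::exp(-x));
}

float TanhViaSigmoid(float x) { return 2.f * SigmoidClipped(2.f * x) - 1.f; }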
REGISTER_MKL_KERNEL(vscal, VScal); +REGISTER_MKL_KERNEL(vexp, VExp); +REGISTER_MKL_KERNEL(vsigmoid, VSigmoid); +REGISTER_MKL_KERNEL(vtanh, VTanh); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 84a93f408f5..bf209d2f9d2 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -32,6 +32,34 @@ void VAdd(const T* x, const T* y, T* z, int n); template void VScal(const T* a, const T* x, T* y, int n); +template +void VExp(const T* x, T* y, int n); + +template +void VSigmoid(const T* x, T* y, int n) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(0) - y[i]; + } + VExp(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } +} + +template +void VTanh(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + #define DECLARE_MKL_KERNEL(name, tuples) \ template \ class name##Kernel : public KernelImpl> { \ @@ -47,6 +75,11 @@ DECLARE_MKL_KERNEL(VAdd, XYZNTuples); // AXYN DECLARE_MKL_KERNEL(VScal, AXYNTuples); +// XYN +DECLARE_MKL_KERNEL(VExp, XYNTuples); +DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); +DECLARE_MKL_KERNEL(VTanh, XYNTuples); + #undef DECLARE_MKL_KERNEL } // namespace mkl diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 62d4cdc19ae..e211276d189 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -312,7 +312,7 @@ void TestXYNKernel() { std::vector x(d), yref(d); std::vector xinp(d); // inplace test - RandomVec(d, x.data()); + RandomVec(d, x.data(), -2.f, 2.f); std::copy(x.begin(), x.end(), xinp.begin()); const T* x_data = x.data(); diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index 7945cfb253a..1f97ed1e62c 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -30,47 +30,6 @@ namespace operators { namespace math { namespace jitkernel { -#ifdef PADDLE_WITH_MKLML -// try to use MKL to speedup -template -void VExpMKL(const T* x, T* y, int n); - -template <> -void VExpMKL(const float* x, float* y, int n) { - platform::dynload::vsExp(n, x, y); -} - -template <> -void VExpMKL(const double* x, double* y, int n) { - platform::dynload::vdExp(n, x, y); -} - -template -void VSigmoidMKL(const T* x, T* y, int n) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = static_cast(0) - y[i]; - } - VExpMKL(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } -} - -template -void VTanhMKL(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoidMKL(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} -#endif - /* VExp JitKernel */ template class VExpKernelImpl : public VExpKernel { -- GitLab From a985949be99266a12003071bcadec2d9f7785d58 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 14 Dec 2018 19:21:40 +0800 Subject: [PATCH 0245/2367] Fea/fuse conv elementwise add fuse (#14669) --- paddle/fluid/framework/ir/CMakeLists.txt | 2 + .../ir/conv_elementwise_add2_act_fuse.cc | 106 ++++++++++++++++ .../ir/conv_elementwise_add2_act_fuse_pass.cc | 105 ++++++++++++++++ .../ir/conv_elementwise_add2_act_fuse_pass.h | 33 +++++ .../ir/conv_elementwise_add_act_fuse_pass.cc | 104 ++++++++++++++++ .../ir/conv_elementwise_add_act_fuse_pass.h | 33 +++++ .../framework/ir/graph_pattern_detector.cc | 113 +++++++++++++++++- .../framework/ir/graph_pattern_detector.h | 45 +++++++ .../api/analysis_predictor_tester.cc | 7 +- .../fluid/inference/api/paddle_pass_builder.h | 5 +- paddle/fluid/inference/io.cc | 2 +- .../inference/tests/api/trt_models_tester.cc | 25 +++- .../operators/controlflow/CMakeLists.txt | 2 +- paddle/fluid/operators/conv_op.cc | 4 +- paddle/fluid/platform/device_context.cc | 1 + 15 files changed, 580 insertions(+), 7 deletions(-) create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 883575e41db..be4151b54b6 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -42,6 +42,8 @@ pass_library(multi_batch_merge_pass base) pass_library(conv_bn_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(is_test_pass base) +pass_library(conv_elementwise_add_act_fuse_pass inference) +pass_library(conv_elementwise_add2_act_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc new file mode 100644 index 00000000000..6e9905b7ecd --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
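// What the fused operator produced by these passes computes, written as a
// plain reference loop. This is a sketch of the math only, not of the
// cuDNN-backed kernel, and all tensors are assumed pre-broadcast to the
// output shape: out = act(conv_out + bias [+ residual]).
#include <cstddef>

void FusedConvEpilogue(const float* conv_out, const float* bias,
                       const float* residual,  // may be null (no ResidualData)
                       float* out, size_t n, bool use_relu) {
  for (size_t i = 0; i < n; ++i) {
    float v = conv_out[i] + bias[i] + (residual != nullptr ? residual[i] : 0.f);
    out[i] = use_relu ? (v > 0.f ? v : 0.f) : v;
  }
}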
+ +#include +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // ResidualData + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {conv_op, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out}); + }; + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc new file mode 100644 index 00000000000..23f343f6316 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+#define GET_NODES                      \
+  GET_IR_NODE(conv_op);                \
+  GET_IR_NODE(conv_out);               \
+  GET_IR_NODE(conv_filter);            \
+  GET_IR_NODE(elementwise_add_op);     \
+  GET_IR_NODE(elementwise_add_in_y);   \
+  GET_IR_NODE(elementwise_add_out);    \
+  GET_IR_NODE(elementwise_add_op_1);   \
+  GET_IR_NODE(elementwise_add_in_y_1); \
+  GET_IR_NODE(elementwise_add_out_1);  \
+  GET_IR_NODE(act_op);                 \
+  GET_IR_NODE(act_out);
+
+// Inherit the basic information from `base_desc`, and modify some fields.
+framework::proto::OpDesc PrepareOpDesc(
+    const framework::proto::OpDesc& base_desc, const std::string& bias,
+    const std::string& bias1, const std::string& activation,
+    const std::string& output) {
+  auto proto = base_desc;
+  framework::OpDesc desc(proto, nullptr);
+  desc.SetInput("Bias", {bias});
+  desc.SetInput("ResidualData", {bias1});
+  desc.SetAttr("activation", activation);
+  desc.SetOutput("Output", {output});
+  desc.SetAttr("is_test", true);
+
+  return *desc.Proto();
+}
+
+std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name = "conv_elementwise_add_act_fuse";
+  FusePassBase::Init(pattern_name, graph.get());
+
+  GraphPatternDetector gpd;
+  auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
+      "conv2d", "Input");
+
+  patterns::ConvElementwiseadd2Act pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(x);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_NODES;
+
+    auto base_op_desc = *conv_op->Op()->Proto();
+    std::string bias_name = elementwise_add_in_y->Name();
+    std::string bias1_name = elementwise_add_in_y_1->Name();
+    std::string act_op_type = act_op->Op()->Type();
+    std::string act_op_out = act_out->Name();
+
+    auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name,
+                                      act_op_type, act_op_out);
+    framework::OpDesc new_op_desc(new_op_proto, nullptr);
+
+    // Create a new node for the fused op.
+    auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
+
+    // Link inputs and outputs.
+    PADDLE_ENFORCE(subgraph.count(x));
+    auto* conv_in_node = subgraph.at(x);
+
+    IR_NODE_LINK_TO(conv_in_node, new_conv_op);            // Input
+    IR_NODE_LINK_TO(conv_filter, new_conv_op);             // Filter
+    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);    // Bias
+    IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op);  // ResidualData
+    IR_NODE_LINK_TO(new_conv_op, act_out);                 // Output
+
+    // Delete the unneeded nodes.
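+    // Note: the set below removes only the matched operator nodes and the
+    // first add's output; conv_out, elementwise_add_out_1 and the act node
+    // are left in the graph, presumably for later cleanup passes to collect.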
+    GraphSafeRemoveNodes(graph.get(),
+                         {conv_op, elementwise_add_op, elementwise_add_op_1,
+                          elementwise_add_out});
+  };
+  gpd(graph.get(), handler);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
new file mode 100644
index 00000000000..3b40a5a9266
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class ConvElementwiseAdd2ActFusePass : public FusePassBase {
+ public:
+  virtual ~ConvElementwiseAdd2ActFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
new file mode 100644
index 00000000000..fe3b4fca79f
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
@@ -0,0 +1,104 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+#define GET_NODES                    \
+  GET_IR_NODE(conv_op);              \
+  GET_IR_NODE(conv_out);             \
+  GET_IR_NODE(conv_filter);          \
+  GET_IR_NODE(elementwise_add_op);   \
+  GET_IR_NODE(elementwise_add_in_y); \
+  GET_IR_NODE(elementwise_add_out);  \
+  GET_IR_NODE(act_op);               \
+  GET_IR_NODE(act_out);
+
+// Inherit the basic information from `base_desc`, and modify some fields.
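+// Concretely: the op type becomes conv2d_fusion, Bias and the activation
+// attribute are filled in from the matched subgraph (ResidualData is left
+// empty here), Output is redirected to the activation's output, and the
+// is_test / use_cudnn attributes are pinned for inference.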
+framework::proto::OpDesc PrepareOpDesc(
+    const framework::proto::OpDesc& base_desc, const std::string& bias,
+    const std::string& activation, const std::string& output) {
+  auto proto = base_desc;
+  framework::OpDesc desc(proto, nullptr);
+  desc.SetType("conv2d_fusion");
+  desc.SetInput("Bias", {bias});
+  desc.SetInput("ResidualData", {});
+  desc.SetAttr("activation", activation);
+  desc.SetOutput("Output", {output});
+  desc.SetAttr("is_test", true);
+  desc.SetAttr("use_cudnn", false);
+  desc.Flush();
+  return *desc.Proto();
+}
+
+std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name = "conv_elementwise_add_act_fuse";
+  FusePassBase::Init(pattern_name, graph.get());
+
+  GraphPatternDetector gpd;
+  auto* x = gpd.mutable_pattern()
+                ->NewNode("x")
+                ->assert_is_op_input("conv2d", "Input")
+                ->AsInput();
+
+  patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(x);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_NODES;
+
+    auto base_op_desc = *conv_op->Op()->Proto();
+    std::string bias_name = elementwise_add_in_y->Name();
+    std::string act_op_type = act_op->Op()->Type();
+    std::string act_op_out = act_out->Name();
+
+    auto new_op_proto =
+        PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out);
+    framework::OpDesc new_op_desc(new_op_proto, nullptr);
+
+    // Create a new node for the fused op.
+    auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
+
+    // Link inputs and outputs.
+    PADDLE_ENFORCE(subgraph.count(x));
+    auto* conv_in_node = subgraph.at(x);
+
+    IR_NODE_LINK_TO(conv_in_node, new_conv_op);          // Input
+    IR_NODE_LINK_TO(conv_filter, new_conv_op);           // Filter
+    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);  // Bias
+    IR_NODE_LINK_TO(new_conv_op, act_out);               // Output
+
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op,
+                                       elementwise_add_out, act_op});
+  };
+
+  gpd(graph.get(), handler);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAddActFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
new file mode 100644
index 00000000000..ac69aa6458f
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
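+//
+// A rough sketch of how a registered pass like this one is driven (the exact
+// driver API is an assumption here; the pass name is as registered in the
+// .cc file):
+//
+//   auto pass = ir::PassRegistry::Instance().Get(
+//       "conv_elementwise_add_act_fuse_pass");
+//   graph = pass->Apply(std::move(graph));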
+
+#pragma once
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class ConvElementwiseAddActFusePass : public FusePassBase {
+ public:
+  virtual ~ConvElementwiseAddActFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 0118019df2f..bf12d12459c 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
+#include "graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
@@ -25,6 +26,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/string/pretty_log.h"
 #include "paddle/fluid/string/printf.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -104,7 +106,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
   for (auto &node : GraphTraits::DFS(graph)) {
     for (const auto &pdnode : pattern_.nodes()) {
       if (pdnode->Tell(&node)) {
-        VLOG(4) << "pdnode " << pdnode->name() << " marked";
+        VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name();
         pdnodes2nodes_[pdnode.get()].insert(&node);
       }
     }
@@ -1099,6 +1101,115 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
   return out_var;
 }
+
+std::unordered_set<std::string> conv_act_set({"identity", "sigmoid", "relu",
+                                              "relu6", "relux", "tanh",
+                                              "band_pass"});
+
+PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
+  conv_in->AsInput();
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  auto conv_out = pattern->NewNode(conv_out_repr())
+                      ->assert_is_op_output("conv2d")
+                      ->assert_is_op_input("elementwise_add", "X")
+                      ->AsIntermediate();
+  auto conv_filter = pattern->NewNode(conv_filter_repr())
+                         ->assert_is_op_input("conv2d", "Filter")
+                         ->AsInput();
+  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
+                                ->assert_is_op("elementwise_add");
+  auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr())
+                                  ->assert_is_op_input("elementwise_add", "Y")
+                                  ->AsInput();
+  auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
+                                 ->assert_is_op_output("elementwise_add")
+                                 ->AsIntermediate();
+
+  auto act_op = pattern->NewNode(act_op_repr())
+                    ->assert_is_op()
+                    ->assert_more([&](Node *node) {
+                      auto op_type = node->Name();
+                      return conv_act_set.count(op_type);
+                    });
+
+  auto act_out = pattern->NewNode(act_out_repr())
+                     ->assert_is_var()
+                     // is activation op's output.
+                     ->assert_more([&](Node *node) {
+                       for (auto *in_op : node->inputs) {
+                         if (conv_act_set.count(in_op->Name())) {
+                           return true;
+                         }
+                       }
+                       return false;
+                     })
+                     ->AsOutput();
+
+  conv_op->LinksFrom({conv_in, conv_filter});
+  conv_out->LinksFrom({conv_op});
+  elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
+      .LinksTo({elementwise_add_out});
+  act_op->LinksFrom({elementwise_add_out}).LinksTo({act_out});
+
+  return act_out;
+}
+
+PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  auto conv_filter = pattern->NewNode(conv_filter_repr())
+                         ->assert_is_op_input("conv2d", "Filter")
+                         ->AsInput();
+  auto conv_out = pattern->NewNode(conv_out_repr())
+                      ->assert_is_op_output("conv2d")
+                      ->assert_is_op_input("elementwise_add", "X")
+                      ->AsIntermediate();
+  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
+                                ->assert_is_op("elementwise_add");
+  auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr())
+                                  ->assert_is_op_input("elementwise_add", "Y")
+                                  ->AsInput();
+  auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
+                                 ->assert_is_op_output("elementwise_add")
+                                 ->assert_is_op_input("elementwise_add", "X")
+                                 ->AsIntermediate();
+
+  auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr())
+                                  ->assert_is_op("elementwise_add");
+  auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr())
+                                    ->assert_is_op_input("elementwise_add", "Y")
+                                    ->AsInput();
+  auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr())
+                                   ->assert_is_op_output("elementwise_add")
+                                   ->AsIntermediate();
+
+  auto act_op = pattern->NewNode(act_op_repr())
+                    ->assert_is_op()
+                    ->assert_more([&](Node *node) {
+                      auto op_type = node->Name();
+                      return conv_act_set.count(op_type);
+                    });
+  auto act_out = pattern->NewNode(act_out_repr())
+                     ->assert_is_var()
+                     // is activation op's output.
+                     ->assert_more([&](Node *node) {
+                       for (auto *in_op : node->inputs) {
+                         if (conv_act_set.count(in_op->Name())) {
+                           return true;
+                         }
+                       }
+                       return false;
+                     })
+                     ->AsOutput();
+
+  conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out});
+  elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
+      .LinksTo({elementwise_add_out});
+  elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1})
+      .LinksTo({elementwise_add_out_1});
+  act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out});
+  return act_out;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index d044802f22d..0fee2f1c185 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -671,6 +671,51 @@ struct ElementwiseAdd : public PatternBase {
   PATTERN_DECL_NODE(elementwise_add_y);
   PATTERN_DECL_NODE(elementwise_add_out);
 };
+
+// Conv + ElementwiseAdd + an activation
+// This pattern can further fuse the conv related ops after the conv+bn fusion.
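+// The matched subgraphs look roughly like:
+//   ConvElementwiseaddAct:  in -> conv2d -> elementwise_add -> act -> out
+//   ConvElementwiseadd2Act: in -> conv2d -> elementwise_add
+//                                        -> elementwise_add -> act -> out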
+struct ConvElementwiseaddAct : public PatternBase {
+  ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {}
+
+  PDNode* operator()(PDNode* conv_in);
+
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(conv_filter);
+
+  PATTERN_DECL_NODE(elementwise_add_op);
+  PATTERN_DECL_NODE(elementwise_add_in_y);  // input
+  PATTERN_DECL_NODE(elementwise_add_out);
+
+  PATTERN_DECL_NODE(act_op);
+  PATTERN_DECL_NODE(act_out);
+};
+
+// Conv + ElementwiseAdd + ElementwiseAdd + Activation
+struct ConvElementwiseadd2Act : public PatternBase {
+  ConvElementwiseadd2Act(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope,
+                    "conv_elementwiseadd2_elementwiseadd_act") {}
+
+  PDNode* operator()(PDNode* conv_in);
+
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_filter);
+  PATTERN_DECL_NODE(conv_out);
+
+  PATTERN_DECL_NODE(elementwise_add_op);
+  PATTERN_DECL_NODE(elementwise_add_in_y);  // input
+  PATTERN_DECL_NODE(elementwise_add_out);
+
+  PATTERN_DECL_NODE(elementwise_add_op_1);
+  PATTERN_DECL_NODE(elementwise_add_in_y_1);  // input
+  PATTERN_DECL_NODE(elementwise_add_out_1);
+
+  PATTERN_DECL_NODE(act_op);
+  PATTERN_DECL_NODE(act_out);
+};
+
 }  // namespace patterns
 
 // Link two ir::Nodes from each other.
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index d67305670c9..a361b34437a 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -55,7 +55,12 @@ TEST(AnalysisPredictor, analysis_off) {
 }
 
 TEST(AnalysisPredictor, analysis_on) {
-  AnalysisConfig config(false);
+#ifdef PADDLE_WITH_CUDA
+  AnalysisConfig config(true);
+  config.fraction_of_gpu_memory = 0.15;
+#else
+  AnalysisConfig config;
+#endif
   config.model_dir = FLAGS_dirname;
   config.enable_ir_optim = true;
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index bc5139a7e54..e6e7de24783 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -118,7 +118,10 @@ class GpuPassStrategy : public PassStrategy {
  public:
   GpuPassStrategy() : PassStrategy({}) {
     passes_.assign({
-        "infer_clean_graph_pass", "conv_bn_fuse_pass",
+        "infer_clean_graph_pass",              //
+        "conv_bn_fuse_pass",                   //
+        "conv_elementwise_add_act_fuse_pass",  //
+        "conv_elementwise_add2_act_fuse_pass",  //
     });
   }
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 24d15f12f9c..ae72a74acce 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -79,7 +79,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
 
   for (auto* var : global_block.AllVars()) {
     if (IsPersistable(var)) {
-      VLOG(3) << "persistable variable's name: " << var->Name();
+      VLOG(4) << "persistable variable's name: " << var->Name();
       framework::VarDesc* new_var = load_block->Var(var->Name());
       new_var->SetShape(var->GetShape());
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index 9eb3fb5da10..d3bd035c1c4 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -78,6 +78,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
   std::vector<PaddleTensor> outputs;
   if (use_analysis || use_tensorrt) {
     contrib::AnalysisConfig config(true);
+    config.pass_builder()->TurnOnDebug();
     SetConfig(&config, model_dir, true, use_tensorrt, FLAGS_batch_size);
     TestPrediction(reinterpret_cast(&config),
@@ -141,9 +142,31 @@ TEST(TensorRT_resnext50, profile) {
   profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
 }
 
+TEST(resnext50, compare_analysis_native) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  compare(model_dir, false /*use tensorrt*/);
+}
+
 TEST(TensorRT_mobilenet, analysis) {
   std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
-  compare(model_dir, /* use_tensorrt */ false);
+  compare(model_dir, false /* use_tensorrt */);
+}
+
+TEST(AnalysisPredictor, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  AnalysisConfig config(true);
+  config.model_dir = model_dir;
+  config.fraction_of_gpu_memory = 0.15;
+  config.pass_builder()->TurnOnDebug();
+
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  auto predictor = CreatePaddlePredictor(config);
+  SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+
+  std::vector<PaddleTensor> outputs;
+  for (auto& input : inputs_all) {
+    ASSERT_TRUE(predictor->Run(input, &outputs));
+  }
 }
 
 }  // namespace inference
diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
index b1c2ee22951..b614e9b0350 100644
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -1,4 +1,4 @@
 include(operators)
-register_operators()
+register_operators(DEPS naive_executor)
 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index d7b87662885..b09e527b905 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -44,7 +44,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
 
   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
-                 "Conv intput should be 4-D or 5-D tensor.");
+                 "Conv input should be 4-D or 5-D tensor, got %u",
+                 in_dims.size());
+
   PADDLE_ENFORCE_EQ(
       in_dims.size(), filter_dims.size(),
       "Conv input dimension and filter dimension should be the same.");
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index bd81d4dd1f1..d2e23d80f43 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -3,6 +3,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
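Taken together, this patch makes the two fuse passes part of the default GPU pass list, so any GPU AnalysisPredictor picks them up. A minimal sketch of driving them through the C++ inference API, mirroring the new TEST(AnalysisPredictor, use_gpu) above (the model directory is a placeholder):

#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> BuildGpuPredictor(
    const std::string& model_dir) {
  paddle::contrib::AnalysisConfig config(true /*use_gpu*/);
  config.model_dir = model_dir;          // placeholder model directory
  config.fraction_of_gpu_memory = 0.15;
  config.pass_builder()->TurnOnDebug();  // dump the graph between passes
  // GpuPassStrategy now schedules conv_bn_fuse_pass followed by the two
  // conv_elementwise_add*_act fuse passes added in this patch.
  return paddle::CreatePaddlePredictor(config);
}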
--
GitLab

From 787d837f503a43f5bd2d8dfe5e5c2417a55084c7 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 14 Dec 2018 19:35:48 +0800
Subject: [PATCH 0246/2367] fix test=develop

---
 paddle/scripts/paddle_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index a1c1886c7f8..0fc43f33d09 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -518,7 +518,7 @@ function assert_api_spec_approvals() {
     fi
   done
 
-  HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast`
+  HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast || true`
   if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
     APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
       python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
--
GitLab

From fd0a954fbf081021b6b71c03d274ee3ea870ec6c Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 14 Dec 2018 11:47:43 +0000
Subject: [PATCH 0247/2367] enable blas jitcode vmul, vadd, vaddrelu, vscal and
 vaddbias

---
 paddle/fluid/operators/jit/gen/CMakeLists.txt |  5 +++
 paddle/fluid/operators/jit/gen/blas.cc        | 38 +++++++++++++------
 paddle/fluid/operators/jit/gen/blas.h         | 25 +++++++++---
 3 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
index 98d9231faa6..ef74a7118be 100644
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -10,3 +10,8 @@ endfunction()
 
 # use gen jitcode kernel by name
 USE_JITKERNEL_GEN(vmul)
+USE_JITKERNEL_GEN(vadd)
+#USE_JITKERNEL_GEN(vsub)  # TODO(TJ): enable me
+USE_JITKERNEL_GEN(vaddrelu)
+USE_JITKERNEL_GEN(vscal)
+USE_JITKERNEL_GEN(vaddbias)
diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc
index 3e5ce540647..b24f44c9f3b 100644
--- a/paddle/fluid/operators/jit/gen/blas.cc
+++ b/paddle/fluid/operators/jit/gen/blas.cc
@@ -104,18 +104,28 @@ void VXXJitCode::genCode() {
   ret();
 }
 
-class VMulCreator : public JitCodeCreator<int> {
- public:
-  bool UseMe(const int& attr) const override {
-    return platform::MayIUse(platform::avx);
+#define DECLARE_BLAS_CREATOR(name)                                           \
+  class name##Creator : public JitCodeCreator<int> {                         \
+   public:                                                                   \
+    bool UseMe(const int& attr) const override {                             \
+      return platform::MayIUse(platform::avx);                               \
+    }                                                                        \
+    size_t CodeSize(const int& d) const override {                           \
+      return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                               \
+    }                                                                        \
+    std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
+      return make_unique<name##JitCode>(attr, CodeSize(attr));               \
+    }                                                                        \
+  }
-  size_t CodeSize(const int& d) const override {
-    return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
-  }
-  std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
-    return make_unique<VMulJitCode>(attr, CodeSize(attr));
-  }
-};
+DECLARE_BLAS_CREATOR(VMul);
+DECLARE_BLAS_CREATOR(VAdd);
+DECLARE_BLAS_CREATOR(VSub);
+DECLARE_BLAS_CREATOR(VAddRelu);
+DECLARE_BLAS_CREATOR(VScal);
+DECLARE_BLAS_CREATOR(VAddBias);
+
+#undef DECLARE_BLAS_CREATOR
 }  // namespace gen
 }  // namespace jit
@@ -125,3 +135,9 @@
 namespace gen = paddle::operators::jit::gen;
 
 REGISTER_JITKERNEL_GEN(vmul, gen::VMulCreator);
+REGISTER_JITKERNEL_GEN(vadd, gen::VAddCreator);
+// TODO(TJ): enable sub
+// REGISTER_JITKERNEL_GEN(vsub, gen::VSubCreator);
+REGISTER_JITKERNEL_GEN(vaddrelu,
gen::VAddReluCreator); +REGISTER_JITKERNEL_GEN(vscal, gen::VScalCreator); +REGISTER_JITKERNEL_GEN(vaddbias, gen::VAddBiasCreator); diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index 60f32805678..5a2192052f8 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -15,6 +15,7 @@ #pragma once #include +#include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" namespace paddle { @@ -33,6 +34,9 @@ class VXXJitCode : public JitCode { type_(type), scalar_index_(scalar_index), with_relu_(with_relu) { + if (!(type_ == operand_type::mul || type_ == operand_type::add)) { + LOG(FATAL) << "Do not support this operand type: " << type_; + } this->genCode(); } @@ -78,11 +82,22 @@ class VXXJitCode : public JitCode { ymm_t ymm_zero = ymm_t(3); }; -class VMulJitCode : public VXXJitCode { - public: - explicit VMulJitCode(int d, size_t code_size, void* code_ptr = nullptr) - : VXXJitCode(d, operand_type::mul, 0, false, code_size, code_ptr) {} -}; +#define DECLARE_BLAS_JITCODE(name, op_type, scalar_idx, with_relu) \ + class name##JitCode : public VXXJitCode { \ + public: \ + explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ + : VXXJitCode(d, op_type, scalar_idx, with_relu, code_size, code_ptr) { \ + } \ + }; + +DECLARE_BLAS_JITCODE(VMul, operand_type::mul, 0, false); +DECLARE_BLAS_JITCODE(VAdd, operand_type::add, 0, false); +DECLARE_BLAS_JITCODE(VSub, operand_type::sub, 0, false); +DECLARE_BLAS_JITCODE(VAddRelu, operand_type::add, 0, true); +DECLARE_BLAS_JITCODE(VScal, operand_type::mul, 1, false); +DECLARE_BLAS_JITCODE(VAddBias, operand_type::add, 1, false); + +#undef DECLARE_BLAS_JITCODE } // namespace gen } // namespace jit -- GitLab From 37c2e24511a29a2b23e18869b51f8edf805cead3 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 14 Dec 2018 19:48:34 +0800 Subject: [PATCH 0248/2367] Update README.md --- README.md | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/README.md b/README.md index c535e9514e1..32a302cc543 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,15 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. +欢迎来到 PaddlePaddle GitHub + +PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 + +我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 + +跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) + + ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` @@ -34,6 +43,23 @@ pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` + +### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### 安装最新稳定版本: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==1.2.0.post87 +# Linux GPU cuda8cudnn5 +pip install paddlepaddle-gpu==1.2.0.post85 + +# 其他平台上的安装指引请参考 http://paddlepaddle.org/ +``` + + ## Features - **Flexibility** @@ -74,10 +100,38 @@ pip install paddlepaddle-gpu==1.2.0.post85 Baidu and it has achieved a significant impact. 
We hope you can also explore the capability of PaddlePaddle to make an impact on your product. +## 特点 + +- **灵活性** + + PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 + +- **高效性** + + 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: + + - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 + - 通过MKL-DNN库优化CNN网络 + - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 + - 针对高维稀疏数据模型,优化了局部和分布式训练。 + + +- **稳定性** + + 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 + +- **连接产品** + + 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 + ## Installation It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. +## 安装 + +推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) + ## Documentation We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and @@ -99,10 +153,37 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte We appreciate your contributions! +## 文档 + +我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 +[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 + +- [深度学习101](https://github.com/PaddlePaddle/book) + + 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 + +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) + + 可以在MPI集群上运行分布式训练任务 + +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) + + 新的API支持代码更少更简洁的程序 + +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) + + 欢迎您的贡献! ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). +## 答疑 + +欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 + ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). + +## 版权和许可证 +PaddlePaddle由[Apache-2.0 license](LICENSE)提供 -- GitLab From 0b1c7d838cfeb2e2000839f173a9be2d641f3d47 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 14 Dec 2018 20:01:00 +0800 Subject: [PATCH 0249/2367] Add brpc serialization support. 
(#11430) --- benchmark/fluid/fluid_benchmark.py | 4 +- cmake/external/brpc.cmake | 20 +- cmake/external/gtest.cmake | 10 +- cmake/external/leveldb.cmake | 4 +- paddle/fluid/framework/CMakeLists.txt | 9 +- paddle/fluid/framework/details/CMakeLists.txt | 11 +- paddle/fluid/framework/executor.cc | 6 +- .../operators/distributed/CMakeLists.txt | 31 +- .../operators/distributed/brpc_client.cc | 371 +++++++++++++++--- .../fluid/operators/distributed/brpc_client.h | 99 ++++- .../operators/distributed/brpc_rdma_pool.cc | 84 ++++ .../operators/distributed/brpc_rdma_pool.h | 56 +++ .../distributed/brpc_sendrecvop_utils.cc | 196 +++++++++ .../distributed/brpc_sendrecvop_utils.h | 49 +++ .../operators/distributed/brpc_serde_test.cc | 175 +++++++++ .../operators/distributed/brpc_server.cc | 264 +++++++++++-- .../distributed/brpc_variable_response.cc | 73 ++++ .../distributed/brpc_variable_response.h | 67 ++++ .../operators/distributed/grpc_client.cc | 3 +- .../fluid/operators/distributed/grpc_serde.cc | 7 - .../fluid/operators/distributed/rpc_server.h | 4 + .../operators/distributed/sendrecvop_utils.cc | 2 +- .../operators/distributed/sendrecvop_utils.h | 7 + .../operators/distributed_ops/CMakeLists.txt | 4 +- .../distributed_ops/listen_and_serv_op.cc | 7 +- .../operators/distributed_ops/send_op.cc | 2 + paddle/fluid/pybind/pybind.cc | 9 + python/paddle/fluid/__init__.py | 1 + 28 files changed, 1422 insertions(+), 153 deletions(-) create mode 100644 paddle/fluid/operators/distributed/brpc_rdma_pool.cc create mode 100644 paddle/fluid/operators/distributed/brpc_rdma_pool.h create mode 100644 paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc create mode 100644 paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h create mode 100644 paddle/fluid/operators/distributed/brpc_serde_test.cc create mode 100644 paddle/fluid/operators/distributed/brpc_variable_response.cc create mode 100644 paddle/fluid/operators/distributed/brpc_variable_response.h diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 5f3ce300acc..10b633a4fc1 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -81,9 +81,11 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog): # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") - config = distribute_transpiler.DistributeTranspilerConfig() + config = fluid.DistributeTranspilerConfig() config.slice_var_up = not args.no_split_var + config.min_block_size = 1048576 t = distribute_transpiler.DistributeTranspiler(config=config) + t.transpile( trainer_id, # NOTE: *MUST* use train_prog, for we are using with guard to diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 30b227b6452..6b50cff7a66 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -14,14 +14,16 @@ INCLUDE(ExternalProject) -find_library(SSL_LIBRARY NAMES ssl) +find_package(OpenSSL REQUIRED) + +message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) +message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY}) + ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY}) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY}) -find_library(CRYPTO_LIBRARY NAMES crypto) ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY}) - +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY}) SET(BRPC_SOURCES_DIR 
${THIRD_PARTY_PATH}/brpc) SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) @@ -31,14 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args -set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib") +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} + # TODO(gongwb): change to de newst repo when they changed. GIT_REPOSITORY "https://github.com/gongweibao/brpc" - GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a" + GIT_TAG "e9b67ec1b7458f2af5fae76451afe1e27e01b4b4" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -50,7 +53,7 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_PREFIX_PATH=${prefix_path} - -DBRPC_WITH_GLOG=ON + -DWITH_GLOG=ON -DIOBUF_WITH_HUGE_BLOCK=ON -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} ${EXTERNAL_OPTIONAL_ARGS} @@ -65,5 +68,6 @@ ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ADD_DEPENDENCIES(brpc extern_brpc) +add_definitions(-DBRPC_WITH_GLOG) LIST(APPEND external_project_dependencies brpc) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 4fe9c13fb7f..9be625b6202 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -IF(WITH_TESTING) - ENABLE_TESTING() +#FIXME:(gongwb) Move brpc's gtest dependency. 
+IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) + IF(WITH_TESTING) + ENABLE_TESTING() + ENDIF(WITH_TESTING) + INCLUDE(ExternalProject) SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest) @@ -76,4 +80,4 @@ IF(WITH_TESTING) ADD_DEPENDENCIES(gtest_main extern_gtest) LIST(APPEND external_project_dependencies gtest gtest_main) -ENDIF(WITH_TESTING) +ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake index fb5091731da..0df61b01ab6 100644 --- a/cmake/external/leveldb.cmake +++ b/cmake/external/leveldb.cmake @@ -24,8 +24,8 @@ ExternalProject_Add( extern_leveldb ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${LEVELDB_SOURCES_DIR} - URL "https://github.com/google/leveldb/archive/v1.18.tar.gz" - URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0" + GIT_REPOSITORY "https://github.com/google/leveldb" + GIT_TAG v1.18 CONFIGURE_COMMAND "" BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d7a69c8c9e..cea4a448574 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -169,9 +169,12 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog + lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper) + + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + else() if(WITH_NGRAPH) if(NOT WIN32) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a927a3afcdd..97f7713d974 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -12,12 +12,19 @@ cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) +if(WITH_DISTRIBUTE) + if(NOT WITH_GRPC) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + endif() +endif() + if(WITH_GPU) nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) if(WITH_DISTRIBUTE) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim dynload_cuda selected_rows_functor sendrecvop_grpc) + ddim dynload_cuda selected_rows_functor sendrecvop_rpc) else() 
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda selected_rows_functor) @@ -30,7 +37,7 @@ else() variable_visitor) if(WITH_DISTRIBUTE) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim selected_rows_functor sendrecvop_grpc) + ddim selected_rows_functor sendrecvop_rpc) else() cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim selected_rows_functor) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 0c4bd336c5b..8c3912120b5 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -157,9 +157,9 @@ void Executor::Close() { #ifdef PADDLE_WITH_DISTRIBUTE // TODO(typhoonzero): complete message will need to use real trainer_id, // except 0. - ::paddle::operators::distributed::RPCClient::GetInstance< - ::paddle::operators::distributed::GRPCClient>(0) - ->SendComplete(); + auto client = + paddle::operators::distributed::RPCClient::GetInstance(0); + client->SendComplete(); #endif } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 101dbe9c896..eab4297c737 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -12,7 +12,7 @@ configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @O set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") if(WITH_GRPC) - grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc + grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows_functor memory) @@ -20,36 +20,43 @@ if(WITH_GRPC) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor selected_rows_functor scope math_function SERIAL) endif() - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) else() - set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc - brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) 
+ set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc + brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc + collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc + brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc + brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) - set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) + set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor + proto_desc lookup_sparse_table_op snappystream snappy zlib) - cc_test(brpc_server_test SRCS rpc_server_test.cc + cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${brpc_test_depends} SERIAL) cc_test(brpc_serde_test SRCS brpc_serde_test.cc DEPS ${brpc_test_depends} SERIAL) + + if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL) + endif() endif() diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc index 350969f74be..62e32977b8c 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -14,135 +14,316 @@ #include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { namespace distributed { -DEFINE_int32(brpc_channel_num, 24, - "Number of channels to send requests connected to one server"); DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); BRPCClient::~BRPCClient() { Wait(); } -void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response) { +void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { // std::unique_ptr makes sure cntl/response will be deleted before returning. std::unique_ptr cntl_guard(cntl); std::unique_ptr response_guard(response); + // this channel can be used by other now. 
+ ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + LOG(FATAL) << "Fail to send SendVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); return; } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleSendResponse"; } -bool BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); + const std::string method = "SendRPC"; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); - framework::AsyncIO( - [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); + auto* var = p_scope->FindVar(var_name_val); + sendrecv::VariableMessage request; + distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, + &cntl->request_attachment(), "", false, + trainer_id_); - google::protobuf::Closure* done = - brpc::NewCallback(&HandleSendResponse, cntl, response); + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - sendrecv::VariableMessage request; - ch_ctx->stub->SendVariable(cntl, &request, response, done); - }); + platform::RecordRPCEvent record_event(method, p_ctx); + + ch_ctx->stub->SendVariable(cntl, &request, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; } +void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + // this channel can be used other now. 
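+  // (Each endpoint keeps a small BlockingQueue of pre-created channel
+  // contexts; callbacks return their context to the queue before inspecting
+  // the RPC status so other pending requests are not blocked -- see
+  // GetChannel() below.)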
+ ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { + LOG(FATAL) << "Fail to get HandleFetchBarrierResponse: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); + return; + } + + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleFetchBarrierResponse"; +} void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response) { + sendrecv::VariableMessage* response, VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, + BRPCClient* cls) { // std::unique_ptr makes sure cntl/response will be deleted before returning. std::unique_ptr cntl_guard(cntl); std::unique_ptr response_guard(response); + // this channel can be used other now. + ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + LOG(FATAL) << "Fail to GetVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + cls->DecreaseReqCount(); + var_h->Finish(false); return; } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; - // framework::Variable* outvar = nullptr; - // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); + VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + + framework::Variable* outvar = nullptr; + int trainer_id; + distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), + *var_h->ctx(), var_h->scope(), &outvar, + &trainer_id); + VLOG(4) << "Finish HandleGetResponse"; + cls->DecreaseReqCount(); + var_h->Finish(true); } -bool BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& method_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); + const auto ch_ptr = GetChannel(ep_val); + const std::string method = "GetRPC"; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); - framework::AsyncIO( - [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_trainer_id(trainer_id_); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + platform::RecordRPCEvent record_event(method, p_ctx); + + if (method_name == "GetMonomerVariable") { + ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else { + ch_ctx->stub->GetVariable(cntl, &req, response, done); + } + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; +} + +VarHandlePtr 
BRPCClient::AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); +} + +VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, + const std::string& var_name, + int64_t time_out) { + return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); } -bool BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); +} + +VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; + const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); + const auto ch_ptr = GetChannel(ep_val); + + const std::string method = "PrefetchRPC"; + + VarHandlePtr var_h( + new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + auto* var = p_scope->FindVar(in_var_name_val); + sendrecv::VariableMessage req; + distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, + &cntl->request_attachment(), out_var_name_val, + false, 0, table_name_val); + + platform::RecordRPCEvent record_event(method, p_ctx); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] {}); + ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; } -void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - req_count_++; +VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + time_out); } -void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + sendrecv::VariableMessage req; + req.set_varname(FETCH_BARRIER_MESSAGE); + + const std::string method = "FetchBarrierRPC"; + // var handle + VarHandlePtr var_h( + new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + + 
platform::RecordRPCEvent record_event(method, nullptr); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + ch_ctx->stub->GetVariable(cntl, &req, response, done); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; } -void BRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +bool BRPCClient::Wait() { + VLOG(9) << "begin to brpcclient wait"; + { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); + } + VLOG(9) << "end to brpcclient wait"; + return true; } ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + VLOG(4) << "begin to GetChannel:" << ep; { std::lock_guard guard(chan_mutex_); auto it = channels_.find(ep); if (it != channels_.end()) { + VLOG(4) << "end to GetChannel:" << ep; return it->second; } } @@ -150,12 +331,20 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { ChannelQueuePtr q(new framework::BlockingQueue()); brpc::ChannelOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif options.protocol = "baidu_std"; - options.connection_type = "pooled"; - options.connect_timeout_ms = 100; + // don't use pooled type. the server can't afford that. + options.connection_type = "single"; + options.connect_timeout_ms = 1000; options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; options.max_retry = FLAGS_max_retry; - for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { + + VLOG(1) << "create " << brpc_channel_num_per_server_ + << " brpc channels to pserver:" << ep; + + for (int i = 0; i < brpc_channel_num_per_server_; ++i) { std::shared_ptr c(new ChannelContext()); if (c->channel.Init(ep.c_str(), &options) != 0) { LOG(FATAL) << "Fail to initialize channel"; @@ -172,9 +361,75 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { channels_[ep] = q; } + VLOG(4) << "end to GetChannel:" << ep; return q; } +VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); +} + +void BRPCClient::SendComplete() { + for (auto& kv : channels_) { + AsyncSendComplete(kv.first); + } +} + +VarHandlePtr BRPCClient::AsyncSendVarMessage( + const std::string& ep, const std::string& method_name, + const sendrecv::VariableMessage& req, int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + platform::RecordRPCEvent record_event(method_name, nullptr); + + VarHandlePtr var_h( + new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + if (method_name == "CheckPointNotifyRPC") { + ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); + } else if (method_name == "GetMonomerBarrier") { + ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); + } else { + ch_ctx->stub->SendVariable(cntl, &req, response, done); + } + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; +} + +VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, + int64_t time_out) { + 
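  /* AsyncSendMessage carries control messages (BATCH_BARRIER_MESSAGE,
   * COMPLETE_MESSAGE, the monomer barrier) as a VariableMessage whose only
   * populated field is varname; AsyncSendVarMessage() then routes on
   * method_name to the matching stub call, so no tensor payload or IOBuf
   * attachment is involved. */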
sendrecv::VariableMessage req; + req.set_varname(message); + + return AsyncSendVarMessage(ep, method_name, req, time_out); +} + +VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(CHECKPOINT_SAVE_MESSAGE); + req.set_out_varname(dir); + + return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); +} + } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h index 8ff1f0a6076..80cc81bff37 100644 --- a/paddle/fluid/operators/distributed/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc_client.h @@ -31,6 +31,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN @@ -53,33 +55,94 @@ class BRPCClient : public RPCClient { BRPCClient() {} virtual ~BRPCClient(); - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) override; - void Wait() override; + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncCheckpointNotify( + 
const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool Wait() override; + + void SendComplete() override; private: + VarHandlePtr _AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& method_name, + int64_t time_out = FLAGS_rpc_deadline); + void Proceed(); ChannelQueuePtr GetChannel(const std::string& ep); + VarHandlePtr AsyncSendComplete(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline); + + VarHandlePtr AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, int64_t time_out); + + VarHandlePtr AsyncSendVarMessage(const std::string& ep, + const std::string& method_name, + const sendrecv::VariableMessage& req, + int64_t time_out); + + friend void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, + BRPCClient* cls); + void DecreaseReqCount() { + if (--req_count_ <= 0) { + sync_cond_.notify_all(); + } + } + private: std::unordered_map channels_; @@ -88,6 +151,8 @@ class BRPCClient : public RPCClient { std::condition_variable sync_cond_; std::atomic req_count_{0}; + static constexpr int brpc_channel_num_per_server_ = 4; + // mutex for GetChannel thread safety std::mutex chan_mutex_; DISABLE_COPY_AND_ASSIGN(BRPCClient); diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc new file mode 100644 index 00000000000..e1be5673dfb --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
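// A note on the client synchronization above: every Async* call does
// req_count_++, each response callback ends in DecreaseReqCount(), and
// Wait() blocks on sync_cond_ until the counter drains to zero. A minimal
// standalone sketch of that counted-wait primitive (illustrative only, not
// code from this patch):
//
//   #include <atomic>
//   #include <condition_variable>
//   #include <mutex>
//
//   class RequestCounter {
//    public:
//     void Add() { ++count_; }
//
//     // Called from each RPC completion callback.
//     void Done() {
//       std::lock_guard<std::mutex> lk(mu_);
//       if (--count_ <= 0) cv_.notify_all();
//     }
//
//     // Blocks until every outstanding request has called Done().
//     void Wait() {
//       std::unique_lock<std::mutex> lk(mu_);
//       cv_.wait(lk, [this] { return count_.load() <= 0; });
//     }
//
//    private:
//     std::atomic<int> count_{0};
//     std::mutex mu_;
//     std::condition_variable cv_;
//   };
//
// Unlike the patch, the sketch decrements under the mutex; the real
// DecreaseReqCount() decrements the atomic outside sync_mutex_, which
// leaves a narrow window where the final notify_all() can race a waiter
// between its predicate check and its sleep.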
+
+#ifdef PADDLE_WITH_BRPC_RDMA
+
+#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
+#include "brpc/channel.h"
+#include "brpc/rdma/rdma_helper.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+RdmaMemPool& RdmaMemPool::Instance() {
+  static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool();
+  return *g_rdma_mem_pool;
+}
+
+void* RdmaMemPool::Find(const std::string& varname, int64_t size) {
+  pthread_rwlock_rdlock(&access_);
+  auto it = pool_.find(varname);
+  if (it == pool_.end()) {
+    pthread_rwlock_unlock(&access_);
+    return nullptr;
+  }
+
+  auto info = it->second;
+  if (info.data_size != size) {
+    pthread_rwlock_unlock(&access_);
+    PADDLE_ENFORCE(false, "var:%s size:%ld != %ld", varname, size,
+                   info.data_size);
+    return nullptr;
+  }
+
+  pthread_rwlock_unlock(&access_);
+  return info.data;
+}
+
+void RdmaMemPool::Register(const std::string& varname, void* data,
+                           int64_t data_size) {
+  void* old = Find(varname, data_size);
+  if (old != nullptr) {
+    if (data != old) {
+      PADDLE_ENFORCE(false, "var:%s data:%p != %p", varname, data, old);
+    }
+    VLOG(7) << "already registered on rdma:" << varname << " data:" << data
+            << " data_size:" << data_size;
+    return;
+  }
+
+  VarInfo info;
+  info.data = data;
+  info.data_size = data_size;
+
+  pthread_rwlock_wrlock(&access_);
+  pool_[varname] = info;
+  pthread_rwlock_unlock(&access_);
+
+  if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) {
+    LOG(FATAL) << "register " << varname << " data:" << data
+               << " data_size:" << data_size << " error";
+  }
+
+  VLOG(4) << "register on rdma:" << varname << " data:" << data
+          << " data_size:" << data_size;
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc_rdma_pool.h
new file mode 100644
index 00000000000..156a93ec578
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#ifdef PADDLE_WITH_BRPC_RDMA
+
+#include <pthread.h>  // NOLINT
+#include <string>
+#include <unordered_map>
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+/*
+ * This class is used to avoid duplicated registration of brpc::rdma.
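 *
 * Registration is idempotent: Register() first calls Find() and returns
 * early when the buffer is already present, so a per-serialization call
 * such as the one in AppendRdmaZeroCopy (shown simplified),
 *
 *   RdmaMemPool::Instance().Register(varname, data, data_size);
 *
 * pins each buffer for RDMA exactly once.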
+ */ +class RdmaMemPool { + public: + static RdmaMemPool& Instance(); + RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} + + virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } + + void Register(const std::string& varname, void* data, int64_t size); + void* Find(const std::string& varname, int64_t size); + + private: + struct VarInfo { + void* data; + int64_t data_size; + + VarInfo() : data(nullptr), data_size(0) {} + }; + + private: + std::unordered_map pool_; + pthread_rwlock_t access_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc new file mode 100644 index 00000000000..6fed9ba92c1 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include // NOLINT + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class IOBufWriter { + public: + static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) { + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + iobuf->append(v, vlen); + } + + static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, + int64_t vlen, bool in_cuda_pinned, + void (*destroy)(void*), void* user_data) { + VLOG(7) << "AppendTCPZeroCopy " + << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + // FIXME(gongwb): use append_zerocopy + /* + if (in_cuda_pinned) { + iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); + } else { + iobuf->append_zerocopy(v, vlen, nullptr); + } + */ + iobuf->append(v, vlen); + destroy(user_data); + } + +#ifdef PADDLE_WITH_BRPC_RDMA + static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { + VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + RdmaMemPool::Instance().Register( + varname, static_cast(const_cast(v)), vlen); + + // FIXME(gongwb): use append_zerocopy + // iobuf->append_zerocopy(v, vlen, nullptr); + 
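    /* Every append path in this writer emits the same length-prefixed frame
     * per field:
     *
     *   [4 bytes: field number][8 bytes: payload length][payload bytes]
     *
     * written with raw memory appends, while BRPCVariableResponse::Parse
     * reads the prefix back via ReadLittleEndian32()/ReadLittleEndian64(),
     * so the two sides only agree on little-endian hosts. A sketch of a
     * reader for one frame header (hypothetical helper, for clarity only):
     *
     *   bool ReadFrameHeader(google::protobuf::io::CodedInputStream* in,
     *                        int* field, uint64_t* len) {
     *     uint32_t k = 0;
     *     if (!in->ReadLittleEndian32(&k)) return false;   // field number
     *     if (!in->ReadLittleEndian64(len)) return false;  // payload size
     *     *field = static_cast<int>(k);
     *     return true;  // caller then consumes *len payload bytes
     *   }
     */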
iobuf->append(v, vlen); + destroy(user_data); + return; + } +#endif + + static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { +#ifdef PADDLE_WITH_BRPC_RDMA + IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, + destroy, user_data); +#else + IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, + user_data); +#endif + } +}; + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, int trainer_id, + const std::string& table_name) { + std::unique_ptr payload; + + request->set_varname(name); + request->set_trainer_id(trainer_id); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. + if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request->set_profile(platform::kEnableProfiler); + } else { + request->set_profile(platform::kDisableProfiler); + } + } + if (!out_varname.empty()) { + request->set_out_varname(out_varname); + } + if (!table_name.empty()) { + request->set_table_name(table_name); + } + if (var->IsType()) { + request->set_type(::sendrecv::LOD_TENSOR); + payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); + } else if (var->IsType()) { + request->set_type(::sendrecv::SELECTED_ROWS); + payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); +#ifdef PADDLE_WITH_CUDA + } else if (var->IsType()) { + request->set_type(::sendrecv::NCCL_ID); + const ncclUniqueId& uid = var->Get(); + // TODO(gongwb): use append_zero to avoid data copy. + IOBufWriter::Append(iobuf, + sendrecv::VariableMessage::kSerializedFieldNumber, + uid.internal, NCCL_UNIQUE_ID_BYTES); + return; +#endif + } else { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + } + + PADDLE_ENFORCE_NOT_NULL(payload); + + // FIXME(gongwb): it seems that can use zero copy. 
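  /* Ownership note for the branches below: when AppendZeroCopy() is used,
   * the TensorPayload is handed over as the (SerializeDestroyCallback,
   * payload.get()) pair and the unique_ptr is release()d, so the buffer is
   * freed exactly once, by the callback once the writer is done with the
   * bytes, rather than at end of scope. With the current copying append the
   * callback fires immediately; with real zero-copy (see the FIXMEs above)
   * it would fire only after transmission. */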
+ if (var_is_not_stable) { + IOBufWriter::Append( + iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size()); + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + true, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); +#endif + } else { + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + false, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); + } + } + + if (var->IsType()) { + auto* slr = var->GetMutable(); + size_t rows_memory_size = + slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + + IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, + reinterpret_cast(slr->rows().data()), + static_cast(rows_memory_size)); + } +} + +void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, + const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id) { + operators::distributed::BRPCVariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(iobuf, meta) == 0, "parse iobuf to tensor error!"); + *var = resp.GetVar(); + *trainer_id = resp.GetTrainerId(); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h new file mode 100644 index 00000000000..ffaf4422242 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, const int trainer_id = 0, + const std::string& table_name = std::string()); + +void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc_serde_test.cc new file mode 100644 index 00000000000..2a2dc72150a --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_serde_test.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include // NOLINT + +#include "brpc/channel.h" +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 564 * 128; + + // serialize var to IOBuf + { + framework::Variable var; + auto* slr = var.GetMutable(); + slr->set_height(1000); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({564, 128})); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 564; ++i) rows->push_back(i); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // desrialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto* slr2 = var2->GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + for (size_t i = 0; i < rows2->size(); ++i) { + EXPECT_EQ(rows_data2[i], static_cast(i)); + } + EXPECT_EQ(slr2->height(), 1000); + } +} + +void RunTestLodTensor(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 512 * 8 * 4 * 2; + { + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({512, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // check sendrecv::VariableMessage meta data + { + EXPECT_EQ(msg.varname(), "myvar"); + EXPECT_EQ(msg.type(), 0); + EXPECT_EQ(msg.dims()[0], 512); + EXPECT_EQ(msg.dims()[1], 8); + EXPECT_EQ(msg.dims()[2], 4); + EXPECT_EQ(msg.dims()[3], 2); + EXPECT_EQ(msg.lod_level(), 1); + EXPECT_EQ(msg.lod(0).lod_data(0), 1); + 
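    // (This lod_data check, together with the two just below, confirms that
    //  the {1, 3, 8} offset vector set on the source tensor survives the
    //  IOBuf round-trip unchanged.)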
EXPECT_EQ(msg.lod(0).lod_data(1), 3); + EXPECT_EQ(msg.lod(0).lod_data(2), 8); + } + + // deserialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto tensor2 = var2->Get(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2.data()); + } + + for (int i = 0; i < tensor_numel; ++i) + EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); + } +} + +TEST(LodTensor, Run) { + platform::CPUPlace place; + RunTestLodTensor(place); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu(0); + RunTestLodTensor(gpu); +#endif +} + +TEST(SelectedRows, Run) { + platform::CPUPlace place; + RunSerdeTestSelectedRows(place); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu; + RunSerdeTestSelectedRows(gpu); +#endif +} diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc index 862167f0208..78d41aeac50 100644 --- a/paddle/fluid/operators/distributed/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc_server.cc @@ -13,84 +13,287 @@ // limitations under the License. #include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/request_handler.h" namespace sendrecv { -typedef std::unordered_map +namespace distributed = paddle::operators::distributed; + +typedef std::unordered_map HandlerMap; class BRPCServiceImpl : public SendRecvService { public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map) - : request_send_h_(nullptr), - request_get_h_(nullptr), - request_prefetch_h_(nullptr) { - auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, + distributed::RPCServer* rpc_server) + : rpc_server_(rpc_server) { + VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); + auto it = rpc_call_map.find(distributed::kRequestSend); if (it != rpc_call_map.end()) { request_send_h_ = it->second; + send_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestSend))); } - it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + it = rpc_call_map.find(distributed::kRequestGet); if (it != rpc_call_map.end()) { request_get_h_ = it->second; + get_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGet))); } - it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch); + it = rpc_call_map.find(distributed::kRequestPrefetch); if (it != rpc_call_map.end()) { request_prefetch_h_ = it->second; + prefetch_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); + } + + it = rpc_call_map.find(distributed::kRequestCheckpoint); + if (it != rpc_call_map.end()) { + request_checkpoint_h_ = it->second; + checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); + } + + it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); + if (it != 
rpc_call_map.end()) { + request_get_monomer_handler_h_ = it->second; + } + + it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); + if (it != rpc_call_map.end()) { + request_get_monomer_barrier_handler_h_ = it->second; } } virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VoidMessage* response, google::protobuf::Closure* done) override { + send_threads_->Run( + [=] { _SendVariable(cntl_butil, request, response, done); }); + } + + void _SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_send_h_ != nullptr, "RequestSend handler should be registed first!"); brpc::ClosureGuard done_guard(done); - - paddle::framework::Scope* local_scope = request_send_h_->scope(); - paddle::framework::Variable* outvar = nullptr; - paddle::framework::Variable* invar = nullptr; + brpc::Controller* cntl = static_cast(cntl_butil); std::string varname = request->varname(); + VLOG(3) << "RequestSend var_name:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); - if (!request_send_h_->sync_mode()) { - local_scope = &request_send_h_->scope()->NewScope(); - invar = local_scope->Var(varname); - } else { - invar = local_scope->FindVar(varname); - } + distributed::BRPCVariableResponse resp(request_send_h_->scope(), + request_send_h_->dev_ctx(), + !request_send_h_->sync_mode()); + PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0, + "parse iobuf to tensor error!"); - request_send_h_->Handle(varname, local_scope, invar, &outvar); + auto scope = resp.GetMutableLocalScope(); + auto invar = resp.GetVar(); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; - if (!request_send_h_->sync_mode()) { - request_send_h_->scope()->DeleteScope(local_scope); - } + request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); } void GetVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) override { + get_threads_->Run( + [=] { _GetVariable(cntl_butil, request, response, done); }); + } + + void _GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_get_h_ != nullptr, "RequestGet handler should be registed first!"); - } + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + VLOG(3) << "RequestGet varname:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + auto scope = request_get_h_->scope(); + auto invar = scope->FindVar(varname); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); + + if (outvar) { + distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(), + response, &cntl->response_attachment(), "", + false); + } + } void PrefetchVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) override { + prefetch_threads_->Run( + [=] { _PrefetchVariable(cntl_butil, request, response, done); }); + } + + void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, 
+ const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_prefetch_h_ != nullptr, "kRequestPrefetch handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + // prefetch process... + std::string in_var_name = request->varname(); + std::string out_var_name = request->out_varname(); + VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name + << ", out_var_name: " << out_var_name + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + distributed::BRPCVariableResponse resp( + request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); + + PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0, + "parse iobuf to tensor error!"); + + auto scope = resp.GetMutableLocalScope(); + auto invar = scope->FindVar(in_var_name); + std::string table_name = request->table_name(); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = scope->Var(out_var_name); + + request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, + out_var_name, table_name); + + distributed::SerializeToIOBuf(out_var_name, outvar, + *request_prefetch_h_->dev_ctx(), response, + &cntl->response_attachment(), "", true); + } + + void CheckpointNotify(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + checkpoint_notify_threads_->Run( + [=] { _CheckpointNotify(cntl_butil, request, response, done); }); + } + + void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE( + request_checkpoint_h_ != nullptr, + "kRequestCheckpointNotify handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), + request_checkpoint_h_->dev_ctx()); + + auto scope = resp.GetMutableLocalScope(); + + std::string checkpoint_notify = request->varname(); + std::string checkpoint_dir = request->out_varname(); + int trainer_id = request->trainer_id(); + + VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, + trainer_id, checkpoint_dir); + } + + void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE( + request_get_monomer_handler_h_ != nullptr, + "kRequestGetMonomerVariable handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + // proc request. 
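    /* Unlike SendVariable/GetVariable/PrefetchVariable/CheckpointNotify
     * above, the monomer RPCs are not re-posted onto a ThreadPool: they run
     * inline on the brpc worker and may block in WaitVarCond() until the
     * variable has been registered with the RPCServer. The pooled handlers
     * all share one dispatch shape, e.g. for Get:
     *
     *   get_threads_->Run(
     *       [=] { _GetVariable(cntl_butil, request, response, done); });
     *
     * where the _GetVariable body immediately takes ownership of `done` via
     * brpc::ClosureGuard, so the response is sent when the worker lambda
     * finishes. */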
+ std::string varname = request->varname(); + VLOG(3) << "GetMonomerVariable " << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + rpc_server_->WaitVarCond(varname); + distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); + + auto scope = h.scope_; + auto invar = scope->FindVar(varname); + paddle::framework::Variable* outvar = nullptr; + + request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, + request->trainer_id()); + + if (outvar) { + distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, + &cntl->response_attachment(), "", false); + } + } + + void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE( + request_get_monomer_barrier_handler_h_ != nullptr, + "RequestGetMonomerBarrier handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + rpc_server_->WaitVarCond(varname); + distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); + + paddle::framework::Scope* scope = nullptr; + paddle::framework::Variable* invar = nullptr; + paddle::framework::Variable* outvar = nullptr; + + request_get_monomer_barrier_handler_h_->Handle( + varname, scope, invar, &outvar, request->trainer_id()); } private: - paddle::operators::distributed::RequestHandler* request_send_h_; - paddle::operators::distributed::RequestHandler* request_get_h_; - paddle::operators::distributed::RequestHandler* request_prefetch_h_; + distributed::RequestHandler* request_send_h_{nullptr}; + distributed::RequestHandler* request_get_h_{nullptr}; + distributed::RequestHandler* request_prefetch_h_{nullptr}; + distributed::RequestHandler* request_checkpoint_h_{nullptr}; + distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; + distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; + + distributed::RPCServer* rpc_server_{nullptr}; + + // FIXME(gongwb): brpc should support process one rpce use one threadpool. + std::unique_ptr send_threads_; + std::unique_ptr get_threads_; + std::unique_ptr prefetch_threads_; + std::unique_ptr checkpoint_notify_threads_; }; } // namespace sendrecv @@ -100,7 +303,7 @@ namespace distributed { void AsyncBRPCServer::StartServer() { // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_); + sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); // Add the service into server. Notice the second parameter, because the // service is put on stack, we don't want server to delete it, otherwise @@ -111,6 +314,9 @@ void AsyncBRPCServer::StartServer() { } brpc::ServerOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif options.idle_timeout_sec = idle_timeout_s_; options.max_concurrency = max_concurrency_; if (server_.Start(bind_address_.c_str(), &options) != 0) { diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc_variable_response.cc new file mode 100644 index 00000000000..75306d72334 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_variable_response.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+namespace pb = ::google::protobuf;
+using vr = ::sendrecv::VariableMessage;
+
+int BRPCVariableResponse::Parse(Source* source) {
+  pb::io::ZeroCopyInputStream* input_stream = source->contents();
+  pb::io::CodedInputStream input(input_stream);
+  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
+
+  while (1) {
+    unsigned int tag = 0;
+    if (!input.ReadLittleEndian32(&tag)) {
+      break;
+    }
+
+    uint64_t num_bytes = 0;
+    if (!input.ReadLittleEndian64(&num_bytes)) {
+      break;
+    }
+
+    int field = static_cast<int>(tag);
+    int ret = field == 0 ? -1 : field;
+    switch (field) {
+      case vr::kSerializedFieldNumber: {
+        if (!ProcSerializedField(field, &input, num_bytes)) {
+          return ret;
+        }
+        break;
+      }
+      case vr::kRowsFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info must be received first!");
+
+        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
+          return ret;
+        }
+        break;
+      }
+      default: {
+        PADDLE_ENFORCE(false, "unsupported field number: %u", field);
+        return ret;
+      }
+    }
+  }
+
+  return 0;
+}
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc_variable_response.h
new file mode 100644
index 00000000000..b0b91a42a01
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_variable_response.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
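// Both transports share a single VariableResponse parser; this header only
// adapts butil::IOBuf to that parser's Source interface through
// BRPCSourceWrapper. Server-side use then collapses to a few lines, as
// seen in brpc_server.cc above:
//
//   distributed::BRPCVariableResponse resp(request_send_h_->scope(),
//                                          request_send_h_->dev_ctx(),
//                                          !request_send_h_->sync_mode());
//   PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
//                  "parse iobuf to tensor error!");
//   auto invar = resp.GetVar();  // deserialized into the (local) scope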
+ +#pragma once + +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/distributed/variable_response.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class BRPCSourceWrapper : public Source { + public: + explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { + return &source_; + } + + private: + butil::IOBufAsZeroCopyInputStream source_; +}; + +class BRPCVariableResponse : public VariableResponse { + public: + BRPCVariableResponse(const framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + bool create_scope = false) + : VariableResponse(scope, dev_ctx, create_scope) {} + + virtual ~BRPCVariableResponse() {} + + // parse attachment from iobuf + int Parse(Source* source) override; + int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { + BRPCSourceWrapper wrapper(iobuf); + return VariableResponse::Parse(&wrapper, meta); + } +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index f14dfcdb238..78956c9ea49 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -293,8 +293,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); const std::string method = "SendMonomerFetchBarrierRPC"; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); s->Prepare(h, time_out); VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 31fac2133cf..1f797ea91d3 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -32,13 +32,6 @@ namespace paddle { namespace operators { namespace distributed { -static void SerializeDestroyCallback(void* payload) { - if (payload != nullptr) { - auto* shared_payload = reinterpret_cast(payload); - delete shared_payload; - } -} - void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg, const std::string& out_name, diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index 45d1d3479ce..8c7b7f1d7ee 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -75,6 +75,10 @@ class RPCServer { void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, int thread_num = 5); + int GetThreadNum(const std::string& rpc_name) { + return rpc_thread_num_[rpc_name]; + } + // Wait util all the clients have reached the barrier for one // rpc method. 
This function should be called in the // RequestHandler if you want to run the server/client in a diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 6ba883ba01f..5aadbcf220f 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include // NOLINT #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/port.h" @@ -45,7 +46,6 @@ static TensorPayload GetCommunicationAllocationFromTensor( memory::Copy(cuda_pinned, result->ptr(), boost::get(tensor.place()), tensor.data(), copy_size, gpu_dev_ctx.stream()); - ctx.Wait(); return TensorPayload(result); #else diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 523e56fe3e4..1a32ffdbeca 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -50,6 +50,13 @@ class TensorPayload final { size_t memory_size_; }; +inline void SerializeDestroyCallback(void* payload) { + if (payload != nullptr) { + auto* shared_payload = reinterpret_cast(payload); + delete shared_payload; + } +} + TensorPayload GetTensorPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request); diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 28bb90af567..3c0b7ff24f9 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index ab92ad4506d..20870ea07eb 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -26,10 +26,11 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" +#include "paddle/fluid/platform/profiler.h" -DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send"); -DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get"); -DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch"); +DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send"); +DEFINE_int32(rpc_get_thread_num, 12, "number of threads for rpc get"); +DEFINE_int32(rpc_prefetch_thread_num, 12, "number of threads for rpc prefetch"); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 58a3ca82722..0bf4bebbc90 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -58,7 +58,9 @@ class SendOp : public framework::OperatorBase { } if (sync_send) { for (size_t i = 0; i < rets.size(); i++) { + VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; } } } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 74b4f2e937b..d590c3a3c6b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -81,6 +81,14 @@ bool IsCompiledWithCUDA() { #endif } +bool IsCompiledWithBrpc() { +#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA) + return true; +#else + return false; +#endif +} + bool IsCompiledWithDIST() { #ifdef PADDLE_WITH_DISTRIBUTE return true; @@ -631,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle. [](bool init_p2p) { framework::InitDevices(init_p2p); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_dist", IsCompiledWithDIST); #ifdef PADDLE_WITH_CUDA m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index e0bb0d1152b..2dea71d7af9 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -152,6 +152,7 @@ def __bootstrap__(): 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'selected_gpus' ] + core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) -- GitLab From 96216052d5ab74c367f439555da982be43f5b3ba Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 14 Dec 2018 12:24:02 +0000 Subject: [PATCH 0250/2367] 1. 
fix trt multi thread bug --- .../inference/tensorrt/convert/op_converter.h | 2 + .../fluid/operators/tensorrt/CMakeLists.txt | 2 +- .../operators/tensorrt/tensorrt_engine_op.cc | 4 +- .../tensorrt/tensorrt_engine_op.cu.cc | 24 --- .../operators/tensorrt/tensorrt_engine_op.h | 172 ++++++++---------- .../tensorrt/tensorrt_engine_op_test.cc | 2 - 6 files changed, 81 insertions(+), 125 deletions(-) delete mode 100644 paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index d61d635ed70..91670ba8ac5 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -103,6 +103,7 @@ class OpConverter { void ConvertBlock(const framework::proto::BlockDesc& block, const std::unordered_set& parameters, const framework::Scope& scope, TensorRTEngine* engine) { + std::unique_lock lk(mut_); for (int i = 0; i < block.ops_size(); i++) { const auto& op = block.ops(i); ConvertOp(op, parameters, scope, engine); @@ -125,6 +126,7 @@ class OpConverter { std::unordered_map converters_; // fluid inference scope framework::Scope* scope_{nullptr}; + std::mutex mut_; }; } // namespace tensorrt diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt index eee0b90fbae..6b551d13f1d 100644 --- a/paddle/fluid/operators/tensorrt/CMakeLists.txt +++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt @@ -1,5 +1,5 @@ op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) -file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n") +file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n") nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc DEPS tensorrt_engine_op analysis) diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index 3cf2ce3c7ef..f1ab59e3972 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -21,8 +21,6 @@ namespace paddle { -DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); - namespace operators { class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { @@ -50,6 +48,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, - ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); + ops::TensorRTEngineOpMaker); #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc deleted file mode 100644 index cbe1b426f65..00000000000 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - tensorrt_engine, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 6eef4c98c48..c19c315f798 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -27,8 +27,6 @@ namespace paddle { -DECLARE_int32(tensorrt_engine_batch_size); - namespace operators { using FluidDT = framework::proto::VarType_Type; @@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) { return TRT_DT::kINT32; } -nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { +nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { PADDLE_ENFORCE_GT(shape.size(), 1UL, "TensorRT' tensor input requires at least 2 dimensions"); PADDLE_ENFORCE_LE(shape.size(), 4UL, @@ -63,131 +61,121 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { } // namespace // NOLINT using inference::Singleton; -using inference::tensorrt::TRT_EngineManager; +using inference::tensorrt::TensorRTEngine; + +class TensorRTEngineOp : public framework::OperatorBase { + private: + std::string engine_name_; + std::vector input_names_; + std::unordered_set param_names_; + mutable std::unique_ptr trt_engine_; + int max_batch_size_; + int workspace_size_; -class TensorRTEngineOp : public framework::OperatorWithKernel { public: - using framework::OperatorWithKernel::OperatorWithKernel; + TensorRTEngineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) { + engine_name_ = Attr("engine_uniq_key"); + input_names_ = Inputs("Xs"); + max_batch_size_ = Attr("max_batch_size"); + workspace_size_ = Attr("workspace_size"); + + auto params = Attr>("parameters"); + for (const auto ¶m : params) { + param_names_.insert(param); + } + } protected: - void InferShape(framework::InferShapeContext* ctx) const override {} - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input0 = ctx.Inputs("Xs").front(); - framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType(ctx.scope() - .FindVar(input0) - ->GetMutable() - ->type()), - ctx.GetPlace()); - return kt; + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunTrt(scope, dev_place); } -}; -template -class TensorRTEngineKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto engine_name = context.Attr("engine_uniq_key"); - int max_batch_size = context.Attr("max_batch_size"); - if (!Singleton::Global().HasEngine(engine_name)) { - Prepare(context); + void RunTrt(const framework::Scope &scope, + const platform::Place &dev_place) const { + int runtime_batch = 1; + if (trt_engine_.get() == nullptr) { + trt_engine_.reset(new TensorRTEngine( + max_batch_size_, workspace_size_, nullptr, + boost::get(dev_place).device)); + Prepare(scope, dev_place, trt_engine_.get()); } - auto* engine = Singleton::Global().Get(engine_name); - auto input_names = context.op().Inputs("Xs"); - PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - 
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size); + + auto *engine = trt_engine_.get(); + PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); std::vector output_maps = - context.Attr>("output_name_mapping"); + Attr>("output_name_mapping"); - auto params = context.Attr>("parameters"); - std::unordered_set parameters; - for (const auto& param : params) { - parameters.insert(param); - } // Convert input tensor from fluid to engine. - for (const auto& x : context.Inputs("Xs")) { - if (parameters.count(x)) continue; + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; // convert input and copy to TRT engine's buffer - auto& t = inference::analysis::GetFromScope( - context.scope(), x); + auto &t = + inference::analysis::GetFromScope(scope, x); + auto t_shape = framework::vectorize(t.dims()); + runtime_batch = t_shape[0]; if (platform::is_cpu_place(t.place())) { - engine->SetInputFromCPU(x, static_cast(t.data()), + engine->SetInputFromCPU(x, static_cast(t.data()), t.memory_size()); } else { - engine->SetInputFromGPU(x, static_cast(t.data()), + engine->SetInputFromGPU(x, static_cast(t.data()), t.memory_size()); } } + + PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); // Execute the engine. - PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); - engine->Execute(FLAGS_tensorrt_engine_batch_size); + engine->Execute(runtime_batch); // Convert output tensor from engine to fluid int output_index = 0; VLOG(4) << "TensorRT Engine Op Outputs:"; - for (const auto& y : context.Outputs("Ys")) { + for (const auto &y : Outputs("Ys")) { VLOG(4) << y; // convert output and copy to fluid. - nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); + nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); // Use the output ITensor's dims to reshape the Fluid Tensor. // The ITensor doesn't contain the batch size dim. std::vector ddim; - ddim.push_back(FLAGS_tensorrt_engine_batch_size); + ddim.push_back(runtime_batch); for (int i = 0; i < dims.nbDims; i++) { ddim.push_back(dims.d[i]); } - auto* fluid_v = context.scope().FindVar(y); + auto *fluid_v = scope.FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); - auto* fluid_t = fluid_v->GetMutable(); + auto *fluid_t = fluid_v->GetMutable(); fluid_t->Resize(framework::make_ddim(ddim)); - // TODO(Superjomn) find some way to determine which device to output the - // tensor. - // if (platform::is_cpu_place(fluid_t->place())) { // TODO(Superjomn) change this float to dtype size. - auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * - FLAGS_tensorrt_engine_batch_size; + auto size = + inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch; engine->GetOutputInGPU( output_maps[output_index], fluid_t->mutable_data(platform::CUDAPlace( - boost::get(context.GetPlace()).device)), + boost::get(dev_place).device)), size * sizeof(float)); - output_index += 1; } cudaStreamSynchronize(*engine->stream()); } - protected: - void Prepare(const framework::ExecutionContext& context) const { + void Prepare(const framework::Scope &scope, const platform::Place &dev_place, + TensorRTEngine *engine) const { VLOG(4) << "Prepare engine"; - // Get the ProgramDesc and pass to convert. 
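// A minimal sketch of the shape handling the rewritten Prepare()/RunTrt()
// path relies on, assuming a LoDTensor input named "x" (a hypothetical name)
// already lives in `scope`; the batch size is read from the runtime tensor
// instead of the removed FLAGS_tensorrt_engine_batch_size global:
//
//   auto &t =
//       inference::analysis::GetFromScope<framework::LoDTensor>(scope, "x");
//   auto t_shape = framework::vectorize(t.dims());  // e.g. {8, 3, 224, 224}
//   int runtime_batch = t_shape[0];                 // real batch, from data
//
// Prepare() below then declares the engine input with Vec2TRT_Dims(t_shape),
// while RunTrt() above enforces runtime_batch <= max_batch_size_ before
// calling Execute(runtime_batch).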
framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch_size = context.Attr("max_batch_size"); - int workspace_size = context.Attr("workspace_size"); - - auto params = context.Attr>("parameters"); - std::unordered_set parameters; - for (const auto& param : params) { - parameters.insert(param); - } + block_desc.ParseFromString(Attr("subgraph")); std::vector output_maps = - context.Attr>("output_name_mapping"); - - // TODO(Superjomn) replace this with a different stream - auto* engine = Singleton::Global().Create( - max_batch_size, workspace_size, nullptr /*engine hold its own stream*/, - context.Attr("engine_uniq_key"), - boost::get(context.GetPlace()).device); + Attr>("output_name_mapping"); engine->InitNetwork(); @@ -195,39 +183,33 @@ class TensorRTEngineKernel : public framework::OpKernel { VLOG(4) << "parsed var size " << block.AllVars().size(); // Add inputs VLOG(4) << "declare inputs"; - for (auto& input : context.Inputs("Xs")) { - if (parameters.count(input)) continue; + for (auto &input : Inputs("Xs")) { + if (param_names_.count(input)) continue; VLOG(4) << "declare input " << input; - auto* var = block.FindVar(input); + + auto &t = + inference::analysis::GetFromScope(scope, input); + auto t_shape = framework::vectorize(t.dims()); + + auto *var = block.FindVar(input); // TensorRT engine need to create parameters. The parameter's description // should be set in PADDLE_ENFORCE(var, "no variable called %s", input); PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, "TensorRT engine only takes LoDTensor as input"); - auto shape = var->GetShape(); - // For the special batch_size placeholder -1, drop it and pass the real - // shape of data. - // TODO(Superjomn) fix this with batch broadcast, or it can't handle - // variational batch size. - if (shape[0] == -1) { - shape[0] = FLAGS_tensorrt_engine_batch_size; - } + engine->DeclareInput( input, FluidDataType2TRT( var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(shape)); + Vec2TRT_Dims(t_shape)); } - inference::Singleton::Global() - .ConvertBlock(block_desc, parameters, context.scope(), engine); + .ConvertBlock(block_desc, param_names_, scope, engine); // Add outputs - for (auto& output : output_maps) { - if (!engine->HasDeclared(output)) { - engine->DeclareOutput(output); - } + for (auto &output : output_maps) { + engine->DeclareOutput(output); } - engine->FreezeNetwork(); } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 56bdd6c2f28..6f8adb00edd 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -24,8 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_CUDA_ONLY_OP(tensorrt_engine); - namespace paddle { namespace operators { -- GitLab From 80766bcb829cab075ea7c1746d2493f6fc81b422 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Dec 2018 11:51:33 +0000 Subject: [PATCH 0251/2367] enable act jitcode vexp, vrelu, vsigmoid and vtanh --- paddle/fluid/operators/jit/gen/CMakeLists.txt | 5 + paddle/fluid/operators/jit/gen/act.cc | 134 ++++++++ paddle/fluid/operators/jit/gen/act.h | 312 ++++++++++++++++++ 3 files changed, 451 insertions(+) create mode 100644 paddle/fluid/operators/jit/gen/act.cc create mode 100644 paddle/fluid/operators/jit/gen/act.h diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index ef74a7118be..2be750a4d86 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -15,3 +15,8 @@ USE_JITKERNEL_GEN(vadd) USE_JITKERNEL_GEN(vaddrelu) USE_JITKERNEL_GEN(vscal) USE_JITKERNEL_GEN(vaddbias) +USE_JITKERNEL_GEN(vrelu) +USE_JITKERNEL_GEN(videntity) +USE_JITKERNEL_GEN(vexp) +USE_JITKERNEL_GEN(vsigmoid) +USE_JITKERNEL_GEN(vtanh) diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc new file mode 100644 index 00000000000..f3332cbefa7 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -0,0 +1,134 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/operators/jit/gen/act.h" +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f), + REPEAT_8TIMES(2.f), + REPEAT_8TIMES(0.5f), + REPEAT_8TIMES(EXP_HIG), + REPEAT_8TIMES(EXP_LOW), + REPEAT_8TIMES(CEPHES_LOG2EF), + REPEAT_8TIMES(CEPHES_EXP_C1), + REPEAT_8TIMES(CEPHES_EXP_C2), + REPEAT_8TIMES(CEPHES_EXP_P0), + REPEAT_8TIMES(CEPHES_EXP_P1), + REPEAT_8TIMES(CEPHES_EXP_P2), + REPEAT_8TIMES(CEPHES_EXP_P3), + REPEAT_8TIMES(CEPHES_EXP_P4), + REPEAT_8TIMES(CEPHES_EXP_P5), + REPEAT_8TIMES(EXP_MAX_INPUT), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; + +const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; +int g_tmp_mem[16] ALIGN32 = {0}; + +void VActJitCode::genCode() { + int offset = 0; + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + vmovups(ymm_src, ptr[param1 + offset]); + act(ymm_dst, ymm_src, type_); + vmovups(ptr[param2 + offset], ymm_dst); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + int rest = num_ % YMM_FLOAT_BLOCK; + while (rest > 0) { + int block = XMM_FLOAT_BLOCK; + if (rest >= 4) { + block = 4; + vmovups(xmm_src, ptr[param1 + offset]); + } else if (rest >= 2) { + block = 2; + vmovq(xmm_src, ptr[param1 + offset]); + } else { + block = 1; + vmovss(xmm_src, ptr[param1 + offset]); + } + act(xmm_dst, xmm_src, type_); + if (rest >= 4) { + vmovups(ptr[param2 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param2 + offset], xmm_dst); + } else { + vmovss(ptr[param2 + offset], xmm_dst); + } + offset += sizeof(float) * block; + rest -= block; + } + ret(); +} + +#define DECLARE_ACT_CREATOR(name) \ + class name##Creator : public JitCodeCreator { \ + public: \ + bool UseMe(const int& attr) const override { \ + return platform::MayIUse(platform::avx); \ + } \ + size_t CodeSize(const int& d) const override; \ + std::unique_ptr CreateJitCode(const int& attr) const override { \ + return make_unique(attr, CodeSize(attr)); \ + } \ + } + +DECLARE_ACT_CREATOR(VRelu); +DECLARE_ACT_CREATOR(VIdentity); +DECLARE_ACT_CREATOR(VExp); +DECLARE_ACT_CREATOR(VSigmoid); +DECLARE_ACT_CREATOR(VTanh); + +// TODO(TJ): tuning use me +size_t VReluCreator::CodeSize(const int& d) const { + return 96 /* init size */ + + (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * + 8 /* average bytes for each instruction */; +} + +size_t VIdentityCreator::CodeSize(const int& d) const { + return 96 + (d / YMM_FLOAT_BLOCK + 3) * 4 * 8; +} + +size_t VExpCreator::CodeSize(const int& d) const { + return 96 + (d / YMM_FLOAT_BLOCK + 3) * 70 * 8; +} + +size_t VSigmoidCreator::CodeSize(const int& d) const { + return 96 + (d / YMM_FLOAT_BLOCK + 3) * 82 * 8; +} + +size_t VTanhCreator::CodeSize(const int& d) const { + return 96 + (d / YMM_FLOAT_BLOCK + 3) * 84 * 8; +} + +#undef DECLARE_ACT_CREATOR + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(vrelu, gen::VReluCreator); +REGISTER_JITKERNEL_GEN(videntity, gen::VIdentityCreator); +REGISTER_JITKERNEL_GEN(vexp, gen::VExpCreator); +REGISTER_JITKERNEL_GEN(vsigmoid, gen::VSigmoidCreator); +REGISTER_JITKERNEL_GEN(vtanh, gen::VTanhCreator); diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h new file mode 100644 index 00000000000..63dee7bc0dd --- /dev/null +++ 
b/paddle/fluid/operators/jit/gen/act.h @@ -0,0 +1,312 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +extern const float exp_float_consts[]; +extern const int exp_int_0x7f[]; +extern int g_tmp_mem[]; + +#define ALIGN32 __attribute__((aligned(32))) +#define EXP_HIG 88.3762626647949f +#define EXP_LOW -88.3762626647949f +#define CEPHES_LOG2EF 1.44269504088896341 +#define CEPHES_EXP_C1 0.693359375 +#define CEPHES_EXP_C2 -2.12194440e-4 +#define CEPHES_EXP_P0 1.9875691500E-4 +#define CEPHES_EXP_P1 1.3981999507E-3 +#define CEPHES_EXP_P2 8.3334519073E-3 +#define CEPHES_EXP_P3 4.1665795894E-2 +#define CEPHES_EXP_P4 1.6666665459E-1 +#define CEPHES_EXP_P5 5.0000001201E-1 + +#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val + +#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float) + +class VActJitCode : public JitCode { + public: + explicit VActJitCode(int d, operand_type type, size_t code_size, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), num_(d), type_(type) { + if (!(type_ == operand_type::relu || type_ == operand_type::exp || + type_ == operand_type::sigmoid || type_ == operand_type::tanh || + type_ == operand_type::identity)) { + LOG(FATAL) << "Do not support this operand type: " << type_; + } + this->genCode(); + } + + const char* name() const override { + std::string base = "VActJitCode"; + switch (type_) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + return base.c_str(); + } + void genCode() override; + + protected: + // compute relu with ymm, xmm + template + void 
relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) { // NOLINT + JMM zero = JMM(zero_idx); + vxorps(zero, zero, zero); + vmaxps(dst, src, zero); + } + + // compute exp with ymm, xmm + template + void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT + int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { + using namespace platform; // NOLINT + // check all idx can not equal + JMM jmm_src = JMM(src_idx); + JMM jmm_fx = JMM(fx_idx); + JMM jmm_fy = JMM(fy_idx); + JMM jmm_mask = JMM(mask_idx); + JMM jmm_tmp = JMM(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + vmovaps(jmm_src, src); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); + vminps(jmm_src, jmm_src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); + vmaxps(jmm_src, jmm_src, jmm_tmp); + // express exp(x) as exp(g + n*log(2)) + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); + vmulps(jmm_fx, jmm_src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); + vaddps(jmm_fx, jmm_fx, jmm_tmp); + vroundps(jmm_fy, jmm_fx, 0x01); + // if greater, substract 1 + vcmpgtps(jmm_mask, jmm_fy, jmm_fx); + vmovaps(jmm_tmp, ptr[reg_ptr_global]); + vandps(jmm_mask, jmm_mask, jmm_tmp); + vsubps(jmm_fx, jmm_fy, jmm_mask); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); + vmulps(jmm_fy, jmm_fx, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); + JMM ymm_z = JMM(jmm_mask.getIdx()); + vmulps(ymm_z, jmm_fx, jmm_tmp); + vsubps(jmm_src, jmm_src, jmm_fy); + vsubps(jmm_src, jmm_src, ymm_z); + vmulps(ymm_z, jmm_src, jmm_src); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); + vmulps(dst, jmm_src, jmm_tmp); + for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; + i += (YMM_FLOAT_BLOCK * sizeof(float))) { + vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 + vaddps(dst, dst, jmm_tmp); + vmulps(dst, dst, jmm_src); + } + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); + vaddps(dst, dst, jmm_tmp); + vmulps(dst, dst, ymm_z); + vaddps(dst, dst, jmm_src); + vmovaps(jmm_tmp, ptr[reg_ptr_global]); + vaddps(dst, dst, jmm_tmp); + // build 2^n + JMM ymm_int = jmm_fx; + vcvttps2dq(ymm_int, jmm_fx); + mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); + vmovdqa(jmm_tmp, ptr[reg_ptr_global]); + if (MayIUse(avx2) || std::is_same::value) { + vpaddd(ymm_int, ymm_int, jmm_tmp); + vpslld(ymm_int, ymm_int, 23); + } else if (MayIUse(avx)) { + xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); + xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx()); + reg64_t reg_ptr_tmp = reg_ptr_global; + mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); + vmovdqa(ptr[reg_ptr_tmp], ymm_int); + vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp], xtmp1); + // next 128bits + vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]); + vmovdqa(xtmp2, ptr[reg_ptr_tmp + + (YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1); + // load out + vmovdqa(ymm_int, ptr[reg_ptr_tmp]); + } + vmulps(dst, dst, ymm_int); + pop(reg_ptr_global); + } + + // compute sigmoid with ymm, xmm + template + void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { + // y = 1 / (1 + e^-x) + JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_src = JMM(src_idx); + 
reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + vmovaps(jmm_src, src); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); + vminps(jmm_src, jmm_src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); + vmaxps(jmm_src, jmm_src, jmm_tmp); + vxorps(jmm_tmp, jmm_tmp, jmm_tmp); + vsubps(jmm_src, jmm_tmp, jmm_src); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(dst, dst, jmm_tmp); + vdivps(dst, jmm_tmp, dst); + pop(reg_ptr_global); + } + + // compute tanh with ymm, xmm + template + void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { + // y = 2 / (1 + e^(-2x)) - 1 + JMM jmm_src = JMM(src_idx); + JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_zero = JMM(mask_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + vmovaps(jmm_src, src); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vxorps(jmm_zero, jmm_zero, jmm_zero); + vsubps(jmm_tmp, jmm_zero, jmm_tmp); + vmulps(jmm_src, jmm_src, jmm_tmp); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(dst, dst, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vdivps(dst, jmm_tmp, dst); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vsubps(dst, dst, jmm_tmp); + pop(reg_ptr_global); + } + + // compute identity with ymm, xmm + template + void identity_jmm(JMM& dst, JMM& src, int zero_idx) { // NOLINT + JMM zero = JMM(zero_idx); + vxorps(zero, zero, zero); + vaddps(dst, src, zero); + // TODO(TJ): use below + // dst.setIdx(src.getIdx()); + } + + template + void act(JMM& dst, JMM& src, operand_type type) { // NOLINT + // use 11~15 + switch (type) { + case operand_type::relu: + relu_jmm(dst, src, 15); + break; + case operand_type::exp: + exp_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::sigmoid: + sigmoid_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::tanh: + tanh_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::identity: + identity_jmm(dst, src, 15); + break; + default: + LOG(FATAL) << "Do not support this operand type: " << type_; + break; + } + } + + protected: + int num_; + operand_type type_; + reg64_t param1{abi_param1}; + reg64_t param2{abi_param2}; + + xmm_t xmm_src = xmm_t(0); + ymm_t ymm_src = ymm_t(0); + + xmm_t xmm_dst = xmm_t(1); + ymm_t ymm_dst = ymm_t(1); +}; + +#define DECLARE_ACT_JITCODE(name, op_type) \ + class name##JitCode : public VActJitCode { \ + public: \ + explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ + : VActJitCode(d, op_type, code_size, code_ptr) {} \ + }; + +DECLARE_ACT_JITCODE(VRelu, operand_type::relu); +DECLARE_ACT_JITCODE(VIdentity, operand_type::identity); +DECLARE_ACT_JITCODE(VExp, operand_type::exp); +DECLARE_ACT_JITCODE(VSigmoid, operand_type::sigmoid); +DECLARE_ACT_JITCODE(VTanh, operand_type::tanh); + +#undef DECLARE_ACT_JITCODE + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle -- GitLab From 514648665ae7093f4531579246e5fec7c1849f07 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Sat, 15 Dec 2018 08:01:27 +0000 Subject: [PATCH 0252/2367] fix trt_op test test=develop --- paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc | 1 + 1 file changed, 1 
insertion(+) diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 6f8adb00edd..287b0edc96e 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" +USE_NO_KERNEL_OP(tensorrt_engine); namespace paddle { namespace operators { -- GitLab From 2ebf12f340a82e1512f5f889d37b41e76b9eb3f7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 16 Dec 2018 09:52:58 +0800 Subject: [PATCH 0253/2367] fix test=develop --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0fc43f33d09..a0da89d3195 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -518,7 +518,7 @@ function assert_api_spec_approvals() { fi done - HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast || true` + HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true` if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` -- GitLab From 4e4a777243fe023f08959602b702bfec1babf469 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Sun, 16 Dec 2018 06:32:46 +0000 Subject: [PATCH 0254/2367] add conv+elementwiseadd pass test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/graph_pattern_detector.cc | 28 ++++++++++++++++++- .../framework/ir/graph_pattern_detector.h | 18 ++++++++++++ .../fluid/inference/api/paddle_pass_builder.h | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index be4151b54b6..b7f7e2ee8ef 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -44,6 +44,7 @@ pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) +pass_library(conv_elementwise_add_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index bf12d12459c..13d752e5167 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -17,7 +17,6 @@ #include #include -#include "graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_traits.h" @@ -1210,6 +1209,33 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { return act_out; } +PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { + conv_in->AsInput(); + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + 
->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}); + conv_out->LinksFrom({conv_op}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + + return elementwise_add_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0fee2f1c185..eaedd9d08e0 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -716,6 +716,24 @@ struct ConvElementwiseadd2Act : public PatternBase { PATTERN_DECL_NODE(act_out); }; +// Conv + ElementwiseAdd +// This pattern should be used after ConvElementwiseadd2Act or +// ConvElementwiseadd pass +struct ConvElementwiseadd : public PatternBase { + ConvElementwiseadd(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_elementwiseadd") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(conv_filter); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); + PATTERN_DECL_NODE(elementwise_add_out); +}; + } // namespace patterns // Link two ir::Nodes from each other. 
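For readers following the pattern above: a fuse pass feeds patterns::ConvElementwiseadd into a GraphPatternDetector and rewrites each matched subgraph in a handler. A condensed sketch of that usage; the full pass arrives as conv_elementwise_add_fuse_pass.cc in a later commit of this series, and the name-scope string here is illustrative:

    GraphPatternDetector gpd;
    auto* x = gpd.mutable_pattern()
                  ->NewNode("x")
                  ->assert_is_op_input("conv2d", "Input")
                  ->AsInput();
    patterns::ConvElementwiseadd pattern(gpd.mutable_pattern(),
                                         "conv_elementwise_add_fuse");
    pattern(x);
    gpd(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
                         Graph* g) {
      GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, pattern);
      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_in_y, elementwise_add_in_y,
                                pattern);
      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
                                pattern);
      // replace {conv_op, elementwise_add} with one conv2d_fusion op here
    });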
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index e6e7de24783..40ca0d287cc 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -122,6 +122,7 @@ class GpuPassStrategy : public PassStrategy { "conv_bn_fuse_pass", // "conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // }); } -- GitLab From f2b92d77b59f6bcb55f33ee69d640b9b9b77c348 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sun, 16 Dec 2018 20:02:44 +0800 Subject: [PATCH 0255/2367] remove clock time in WIN32 mode --- paddle/fluid/framework/async_executor.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index a82e9415596..95c8472b2f3 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -34,9 +34,13 @@ namespace paddle { namespace framework { inline double current_realtime() { +#if !defined(_WIN32) struct timespec tp; clock_gettime(CLOCK_REALTIME, &tp); return tp.tv_sec + tp.tv_nsec * 1e-9; +#else + return 0.0; +#endif } inline std::default_random_engine& local_random_engine() { -- GitLab From 66522046ad9c8659b80dc3be6d0c50c3d56f17fa Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sun, 16 Dec 2018 20:02:44 +0800 Subject: [PATCH 0256/2367] remove clock time in WIN32 mode test=develop --- paddle/fluid/framework/async_executor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 95c8472b2f3..7accc4cb57b 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -39,7 +39,7 @@ inline double current_realtime() { clock_gettime(CLOCK_REALTIME, &tp); return tp.tv_sec + tp.tv_nsec * 1e-9; #else - return 0.0; + return 0; #endif } -- GitLab From 4c0a769d1d70f2d4f86f3369d65eb2fb6bd6981f Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sun, 16 Dec 2018 20:16:14 +0800 Subject: [PATCH 0257/2367] avoid clock time in WIN32 mode test=develop --- paddle/fluid/framework/async_executor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 7accc4cb57b..95c8472b2f3 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -39,7 +39,7 @@ inline double current_realtime() { clock_gettime(CLOCK_REALTIME, &tp); return tp.tv_sec + tp.tv_nsec * 1e-9; #else - return 0; + return 0.0; #endif } -- GitLab From c0c9fcd9c721fdadfb8412e3e47d207bf5e4766e Mon Sep 17 00:00:00 2001 From: nhzlx Date: Sun, 16 Dec 2018 13:02:33 +0000 Subject: [PATCH 0258/2367] add source file test=develop --- .../ir/conv_elementwise_add_fuse_pass.cc | 91 +++++++++++++++++++ .../ir/conv_elementwise_add_fuse_pass.h | 33 +++++++ 2 files changed, 124 insertions(+) create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc new file mode 100644 index 00000000000..476c9dbc353 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); + +std::unique_ptr ConvElementwiseAddFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("conv2d", "Input") + ->AsInput(); + + patterns::ConvElementwiseadd pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string output_name = elementwise_add_out->Name(); + + std::string act_type = "identity"; + framework::OpDesc new_op_desc(base_op_desc, nullptr); + new_op_desc.SetType("conv2d_fusion"); + new_op_desc.SetInput("Bias", {bias_name}); + new_op_desc.SetInput("ResidualData", {}); + new_op_desc.SetAttr("activation", act_type); + new_op_desc.SetOutput("Output", {output_name}); + new_op_desc.SetAttr("is_test", true); + new_op_desc.SetAttr("use_cudnn", false); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, elementwise_add_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op}); + }; + + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_fuse_pass, + paddle::framework::ir::ConvElementwiseAddFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h new file mode 100644 index 00000000000..f234603f585 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle -- GitLab From 6445cf1e91f6e9ac169f6834d4b3471136d9bd38 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 16 Dec 2018 21:22:03 +0800 Subject: [PATCH 0259/2367] fix test=develop --- python/paddle/fluid/tests/book/test_recognize_digits.py | 2 +- python/paddle/fluid/tests/book/test_word2vec.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 54936519ce0..3b2c4af8ae5 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -260,7 +260,7 @@ def inject_all_tests(): for use_cuda in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - for parallel in (False, True): + for parallel in (False, ): for nn_type in ('mlp', 'conv'): inject_test_method(use_cuda, parallel, nn_type, True) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 08f70c9cabc..e24a9aa989b 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -250,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): for use_cuda in (False, True): for is_sparse in (False, True): - for is_parallel in (False, True): + for is_parallel in (False, ): inject_test_method(use_cuda, is_sparse, is_parallel) if __name__ == '__main__': -- GitLab From 8a6b53a4943f671b456a2ab20d85e62c83f56ed6 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sun, 16 Dec 2018 20:16:14 +0800 Subject: [PATCH 0260/2367] avoid clock time in WIN32 mode test=develop --- python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements.txt b/python/requirements.txt index 5d64674fe02..36313333b2b 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -9,4 +9,4 @@ Pillow nltk>=3.2.2 graphviz six -mpi4py=3.0.0 +mpi4py==3.0.0 -- GitLab From 7c1f3ad6eb18d40310e8933a937ef83b3342a532 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Dec 2018 14:13:44 +0000 Subject: [PATCH 0261/2367] enable jitcode lstm --- paddle/fluid/operators/jit/README.md | 2 +- paddle/fluid/operators/jit/gen/CMakeLists.txt | 2 + paddle/fluid/operators/jit/gen/jitcode.h | 2 +- paddle/fluid/operators/jit/gen/lstm.cc | 142 ++++++++++++++++++ paddle/fluid/operators/jit/gen/lstm.h | 119 +++++++++++++++ paddle/fluid/operators/jit/test.cc | 10 +- 6 files changed, 268 insertions(+), 9 deletions(-) create mode 100644 paddle/fluid/operators/jit/gen/lstm.cc create mode 100644 paddle/fluid/operators/jit/gen/lstm.h diff --git 
a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index 28d21f40af3..ce31f18b63c 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -46,7 +46,7 @@ PaddlePaddle/Paddle/paddle/fluid/ - Add `your_key` to `KernelType`. - Implement the Reference logic; a Reference implementation is mandatory for every jitkernel and must not depend on any third-party library. Also add `USE_JITKERNEL_REFER(your_key)` in `refer/CMakeLists.txt`. - (optional) Implement more algorithms under the `more` directory, which may depend on third-party libraries such as mkl, openblas, or mkldnn. -- (optional) Implement Xbyak-based code generation under the `gen` directory. +- (optional) Implement Xbyak-based code generation under the `gen` directory. A jitcode needs to implement its own `JitCodeCreator` and register it on the KernelType. - Add new `KernelTuples` when necessary, using `XYZNTuples` as a reference; a newly added Attr type needs a specialization of the `JitCodeKey` method. - Add unit tests covering both float and double. - Add a benchmark to make sure the kernel obtained from Get is the fastest. diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 2be750a4d86..81a6314bd25 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -20,3 +20,5 @@ USE_JITKERNEL_GEN(videntity) USE_JITKERNEL_GEN(vexp) USE_JITKERNEL_GEN(vsigmoid) USE_JITKERNEL_GEN(vtanh) +USE_JITKERNEL_GEN(lstmctht) +USE_JITKERNEL_GEN(lstmc1h1) diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 64126e3f61a..898d7df3451 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -62,7 +62,7 @@ typedef enum { class JitCode : public GenBase, public Xbyak::CodeGenerator { public: explicit JitCode(size_t code_size, void* code_ptr = nullptr) - : Xbyak::CodeGenerator(code_size, code_ptr) {} + : Xbyak::CodeGenerator((code_size < 4096 ? 4096 : code_size), code_ptr) {} virtual const char* name() const = 0; virtual void genCode() = 0; diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc new file mode 100644 index 00000000000..7e5a7773f83 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/lstm.cc @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
*/ + +#include "paddle/fluid/operators/jit/gen/lstm.h" +#include // offsetof +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void LSTMJitCode::genCode() { + if (use_peephole_) { + preCode(); + } + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ct_1 = r9; + reg64_t reg_ptr_ct = r10; + reg64_t reg_ptr_ht = r11; + reg64_t reg_ptr_wp = r12; + mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); + mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); + mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + if (use_peephole_) { + mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]); + } + + int offset = 0; + int d = num_ * sizeof(float); + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + /* gates: W_ch, W_ih, W_fh, W_oh */ + ymm_t ymm_c = ymm_t(0); + ymm_t ymm_i = ymm_t(1); + ymm_t ymm_f = ymm_t(2); + ymm_t ymm_o = ymm_t(3); + ymm_t ymm_ct_1 = ymm_t(4); + ymm_t ymm_wp0 = ymm_t(5); + ymm_t ymm_wp1 = ymm_t(6); + ymm_t ymm_wp2 = ymm_t(7); + vmovups(ymm_c, ptr[reg_ptr_gates + offset]); + vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]); + vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]); + vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]); + if (!compute_c1h1_) { + vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]); + } + if (use_peephole_) { + vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]); + vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]); + vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]); + } + /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */ + // act_cand(c) + act(ymm_c, ymm_c, act_cand_); + // act_gate(i) or act_gate(ct_1 * wp0 + i) + if (!compute_c1h1_ && use_peephole_) { + vmulps(ymm_wp0, ymm_ct_1, ymm_wp0); + vaddps(ymm_i, ymm_i, ymm_wp0); + } + act(ymm_i, ymm_i, act_gate_); + vmulps(ymm_c, ymm_c, ymm_i); + if (!compute_c1h1_) { + // act_gate(f) or act_gate(ct_1 * wp1 + f) + if (use_peephole_) { + vmulps(ymm_wp1, ymm_ct_1, ymm_wp1); + vaddps(ymm_f, ymm_f, ymm_wp1); + } + act(ymm_f, ymm_f, act_gate_); + // ct + vmulps(ymm_f, ymm_f, ymm_ct_1); + vaddps(ymm_f, ymm_f, ymm_c); + } + /* H_t = act_cell(C_t) * act_gate(o) */ + // act_cell(C_t) + ymm_t ymm_ct = compute_c1h1_ ? 
ymm_c : ymm_f; + ymm_t ymm_tmp = ymm_i; + act(ymm_tmp, ymm_ct, act_cell_); + // act_gate(o) or act_gate(ct * wp2 + o) + if (use_peephole_) { + vmulps(ymm_wp2, ymm_ct, ymm_wp2); + vaddps(ymm_o, ymm_o, ymm_wp2); + } + act(ymm_o, ymm_o, act_gate_); + // ht + vmulps(ymm_o, ymm_o, ymm_tmp); + // save ct and ht + vmovups(ptr[reg_ptr_ct + offset], ymm_ct); + vmovups(ptr[reg_ptr_ht + offset], ymm_o); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + + if (use_peephole_) { + postCode(); + } else { + ret(); + } +} + +#define DECLARE_LSTM_CREATOR(name) \ + class name##Creator : public JitCodeCreator { \ + public: \ + /* TODO(TJ): enable more */ \ + bool UseMe(const lstm_attr_t& attr) const override { \ + return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ + } \ + size_t CodeSize(const lstm_attr_t& attr) const override { \ + return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \ + } \ + std::unique_ptr CreateJitCode( \ + const lstm_attr_t& attr) const override { \ + return make_unique(attr, CodeSize(attr)); \ + } \ + } + +DECLARE_LSTM_CREATOR(LSTMCtHt); +DECLARE_LSTM_CREATOR(LSTMC1H1); + +#undef DECLARE_LSTM_CREATOR + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(lstmctht, gen::LSTMCtHtCreator); +REGISTER_JITKERNEL_GEN(lstmc1h1, gen::LSTMC1H1Creator); diff --git a/paddle/fluid/operators/jit/gen/lstm.h b/paddle/fluid/operators/jit/gen/lstm.h new file mode 100644 index 00000000000..cb8705c6d95 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/lstm.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/act.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class LSTMJitCode : public VActJitCode { + public: + explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr, + size_t code_size, void* code_ptr = nullptr) + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + compute_c1h1_(compute_c1h1) { + auto typeExchange = [](KernelType type) -> gen::operand_type { + if (type == KernelType::vsigmoid) { + return operand_type::sigmoid; + } else if (type == KernelType::vrelu) { + return operand_type::relu; + } else if (type == KernelType::vtanh) { + return operand_type::tanh; + } else if (type == KernelType::videntity) { + return operand_type::identity; + } else { + LOG(FATAL) << "Do not support this jit::KernelType: " << type; + } + return operand_type::identity; + }; + num_ = attr.d; + use_peephole_ = attr.use_peephole; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + act_cell_ = typeExchange(attr.act_cell); + + this->genCode(); + } + + const char* name() const override { + std::string base = "LSTMJitCode"; + if (use_peephole_) { + base += "_Peephole"; + } + if (compute_c1h1_) { + base += "_C1H1"; + } + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + AddTypeStr(act_cell_); + return base.c_str(); + } + void genCode() override; + + protected: + int num_; + bool compute_c1h1_; + bool use_peephole_; + operand_type act_gate_; + operand_type act_cand_; + operand_type act_cell_; + reg64_t param1{abi_param1}; +}; + +#define DECLARE_LSTM_JITCODE(name, compute_c1h1) \ + class name##JitCode : public LSTMJitCode { \ + public: \ + explicit name##JitCode(const lstm_attr_t& attr, size_t code_size, \ + void* code_ptr = nullptr) \ + : LSTMJitCode(compute_c1h1, attr, code_size, code_ptr) {} \ + }; + +DECLARE_LSTM_JITCODE(LSTMCtHt, false); +DECLARE_LSTM_JITCODE(LSTMC1H1, true); + +#undef DECLARE_LSTM_JITCODE + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index e211276d189..36f8eb6e7b6 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -236,7 +236,7 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { } } // test result from Get function - VLOG(10) << "Test Get function "; + // VLOG(10) << "Test Get function "; auto tgt = jit::Get(attr); test(tgt, args...); } @@ -338,9 +338,6 @@ void TestLSTMKernel() { for (auto& act_gate : all_acts) { for (auto& act_cand : all_acts) { for (auto& act_cell : all_acts) { - std::string info = act_gate + act_cand + act_cell + - (use_peephole ? 
"peephole_" : "") + "size_" + - std::to_string(d); const jit::lstm_attr_t attr( d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), jit::to_kerneltype(act_cell), use_peephole); @@ -370,7 +367,7 @@ void TestLSTMKernel() { step.checked = checked_data; } ref(&step, &attr); - + VLOG(10) << attr; TestAllImpls, PlaceType, std::vector, std::vector, std::vector, std::vector, std::vector>(attr, xsrc, wp, ct_1, ct_ref, ht_ref, @@ -390,7 +387,6 @@ void TestGRUKernel() { for (int d : TestSizes()) { for (auto& act_gate : all_acts) { for (auto& act_cand : all_acts) { - std::string info = act_gate + act_cand + "size_" + std::to_string(d); const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); auto ref = jit::GetRefer>(); @@ -409,7 +405,7 @@ void TestGRUKernel() { step.ht_1 = ht_1_data; step.ht = ht_ref_data; ref(&step, &attr); - + VLOG(10) << attr; TestAllImpls, PlaceType, std::vector, std::vector, std::vector>(attr, xsrc, ht_1, ht_ref, attr); -- GitLab From 3713d08d40840c1c9b3485a2cea3facce012d0d2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sun, 16 Dec 2018 15:34:20 +0000 Subject: [PATCH 0262/2367] enable jitcode gru --- paddle/fluid/operators/jit/gen/CMakeLists.txt | 3 + paddle/fluid/operators/jit/gen/gru.cc | 116 ++++++++++++++++++ paddle/fluid/operators/jit/gen/gru.h | 116 ++++++++++++++++++ 3 files changed, 235 insertions(+) create mode 100644 paddle/fluid/operators/jit/gen/gru.cc create mode 100644 paddle/fluid/operators/jit/gen/gru.h diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 81a6314bd25..8ad9587b5ef 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -22,3 +22,6 @@ USE_JITKERNEL_GEN(vsigmoid) USE_JITKERNEL_GEN(vtanh) USE_JITKERNEL_GEN(lstmctht) USE_JITKERNEL_GEN(lstmc1h1) +USE_JITKERNEL_GEN(gruh1) +USE_JITKERNEL_GEN(gruhtpart1) +USE_JITKERNEL_GEN(gruhtpart2) diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc new file mode 100644 index 00000000000..ec89880a0c3 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/gru.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/operators/jit/gen/gru.h" +#include // offsetof +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void GRUJitCode::genCode() { + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ht_1 = r9; + reg64_t reg_ptr_ht = r10; + mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]); + mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]); + ymm_t ymm_one = ymm_t(0); + + if (id_ == 2) { + reg64_t reg_ptr_tmp = r11; + mov(reg_ptr_tmp, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]); + } + int offset = 0; + int d = num_ * sizeof(float); + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + ymm_t ymm_u = ymm_t(1); + ymm_t ymm_r = ymm_t(2); + ymm_t ymm_s = ymm_t(3); + ymm_t ymm_ht_1 = ymm_t(4); + // W: {W_update, W_reset; W_state} + if (id_ == 0 || id_ == 2) { + vmovups(ymm_u, ptr[reg_ptr_gates + offset]); + vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]); + } + if (id_ == 1) { + vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]); + } + if (id_ == 1 || id_ == 2) { + vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]); + } + + if (id_ == 0) { + // ht = act_gate(u) * act_cand(s) + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_s); + } else if (id_ == 1) { + // ht = act_gate(r) * ht_1 + act(ymm_r, ymm_r, act_gate_); + vmulps(ymm_r, ymm_r, ymm_ht_1); + vmovups(ptr[reg_ptr_ht + offset], ymm_r); + } else if (id_ == 2) { + // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 + ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx()); + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vsubps(ymm_u, ymm_one_inner, ymm_u); + vmulps(ymm_u, ymm_ht_1, ymm_u); + vaddps(ymm_u, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_u); + } + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + ret(); +} + +#define DECLARE_GRU_CREATOR(name) \ + class name##Creator : public JitCodeCreator { \ + public: \ + /* TODO(TJ): enable more */ \ + bool UseMe(const gru_attr_t& attr) const override { \ + return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ + } \ + size_t CodeSize(const gru_attr_t& attr) const override { \ + return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \ + } \ + std::unique_ptr CreateJitCode( \ + const gru_attr_t& attr) const override { \ + return make_unique(attr, CodeSize(attr)); \ + } \ + } + +DECLARE_GRU_CREATOR(GRUH1); +DECLARE_GRU_CREATOR(GRUHtPart1); +DECLARE_GRU_CREATOR(GRUHtPart2); + +#undef DECLARE_GRU_CREATOR + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(gruh1, gen::GRUH1Creator); +REGISTER_JITKERNEL_GEN(gruhtpart1, gen::GRUHtPart1Creator); +REGISTER_JITKERNEL_GEN(gruhtpart2, gen::GRUHtPart2Creator); diff --git a/paddle/fluid/operators/jit/gen/gru.h b/paddle/fluid/operators/jit/gen/gru.h new file mode 100644 index 00000000000..bab1c6a4eee --- /dev/null +++ b/paddle/fluid/operators/jit/gen/gru.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/act.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class GRUJitCode : public VActJitCode { + public: + explicit GRUJitCode(int id, const gru_attr_t& attr, size_t code_size, + void* code_ptr = nullptr) + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + id_(id) { + auto typeExchange = [](KernelType type) -> gen::operand_type { + if (type == KernelType::vsigmoid) { + return operand_type::sigmoid; + } else if (type == KernelType::vrelu) { + return operand_type::relu; + } else if (type == KernelType::vtanh) { + return operand_type::tanh; + } else if (type == KernelType::videntity) { + return operand_type::identity; + } else { + LOG(FATAL) << "Do not support this jit::KernelType: " << type; + } + return operand_type::identity; + }; + num_ = attr.d; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + + this->genCode(); + } + + const char* name() const override { + std::string base = "GRUJitCode"; + if (id_ == 0) { + base += "_H1"; + } else if (id_ == 1) { + base += "_HtPart1"; + } else if (id_ == 2) { + base += "_HtPart2"; + } + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + return base.c_str(); + } + void genCode() override; + + protected: + int id_; + int num_; + operand_type act_gate_; + operand_type act_cand_; + reg64_t param1{abi_param1}; +}; + +#define DECLARE_GRU_JITCODE(name, id) \ + class name##JitCode : public GRUJitCode { \ + public: \ + explicit name##JitCode(const gru_attr_t& attr, size_t code_size, \ + void* code_ptr = nullptr) \ + : GRUJitCode(id, attr, code_size, code_ptr) {} \ + }; + +DECLARE_GRU_JITCODE(GRUH1, 0); +DECLARE_GRU_JITCODE(GRUHtPart1, 1); +DECLARE_GRU_JITCODE(GRUHtPart2, 2); + +#undef DECLARE_GRU_JITCODE + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle -- GitLab From 29c772663a7905b64fec66e452d77cd6b7ec9449 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 00:12:03 +0800 Subject: [PATCH 0263/2367] refine import path for ps_instance.py test=develop --- python/paddle/fluid/distributed/ps_instance.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index 6b44d0cd16f..91f53102b6c 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -11,8 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and -import helper as dist_helper -import sys +from .helper import MPIHelper class PaddlePSInstance(object): @@ -26,7 +25,7 @@ class PaddlePSInstance(object): """ def __init__(self, server_worker_mode, proc_per_node): - self.dh = dist_helper.MPIHelper() + self.dh = MPIHelper() self._rankid = self.dh.get_rank() self._server_worker_mode = server_worker_mode self._proc_per_node = proc_per_node -- GitLab From 178c47c074ba0c1294dd8e0e8f38faa0a5e17ab3 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 00:12:03 +0800 Subject: [PATCH 0264/2367] refine import path for ps_instance.py test=develop --- python/paddle/fluid/distributed/helper.py | 7 ++++--- python/requirements.txt | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index ca6dd5dabfa..999c8d77b83 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from mpi4py import MPI import ps_pb2 as pslib @@ -59,7 +58,7 @@ class FileSystem(object): class MPIHelper(object): """ - MPIHelper is a wrapper of mpi4py, supprot get_rank get_size etc. + MPIHelper is a wrapper of mpi4py, support get_rank get_size etc. Args: No params Examples: @@ -68,7 +67,9 @@ class MPIHelper(object): """ def __init__(self): + from mpi4py import MPI self.comm = MPI.COMM_WORLD + self.MPI = MPI def get_rank(self): return self.comm.Get_rank() @@ -86,4 +87,4 @@ class MPIHelper(object): return socket.gethostname() def finalize(self): - MPI.Finalize() + self.MPI.Finalize() diff --git a/python/requirements.txt b/python/requirements.txt index 36313333b2b..2f81d85df06 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -9,4 +9,3 @@ Pillow nltk>=3.2.2 graphviz six -mpi4py==3.0.0 -- GitLab From 43028f655d44eb524fc988a1645b993cefd08e6a Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 00:12:03 +0800 Subject: [PATCH 0265/2367] refine import path for ps_instance.py test=develop --- python/paddle/fluid/distributed/helper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 999c8d77b83..cdde5403cda 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
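The import-path commits in this group share one goal: `import paddle.fluid` must keep working on machines without MPI, so the module-level imports of mpi4py and of the generated ps_pb2 module are pushed down into the code paths that actually use them. A minimal sketch of the deferred-import pattern being applied here (the class name follows the patch; the comments are ours):

    class MPIHelper(object):
        def __init__(self):
            # Deferred import: a missing mpi4py now fails only when the
            # distributed helper is actually constructed, not when
            # paddle.fluid itself is imported.
            from mpi4py import MPI
            self.comm = MPI.COMM_WORLD
            self.MPI = MPI  # keep a handle so finalize() can reach it

        def finalize(self):
            self.MPI.Finalize()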
-import ps_pb2 as pslib - class FileSystem(object): """ @@ -37,6 +35,7 @@ class FileSystem(object): assert user != None assert passwd != None assert hadoop_bin != None + import ps_pb2 as pslib self.fs_client = pslib.FsClientParameter() #if fs_type == "afs": # fs_client.fs_type = pslib.FsApiType.AFS -- GitLab From 921b7f452a2a4bf26f3aa288365401c35719903f Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 10:24:46 +0800 Subject: [PATCH 0266/2367] add API.spec test=develop --- paddle/fluid/API.spec | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 26113ee7e90..e156945147f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -38,7 +38,15 @@ paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], va paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) +paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) -- GitLab From fcde2b2725566a9cde0c8930d2e80e6a044d6784 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 10:29:59 +0800 Subject: [PATCH 0267/2367] add ForRangeIn --- paddle/fluid/operators/optimizers/adam_op.h | 7 ++- paddle/fluid/platform/for_range.h | 55 +++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 5870557bb7b..e8b977e2d96 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -359,14 +359,17 @@ class AdamOpKernel : public framework::OpKernel { param_out.template 
mutable_data<T>(ctx.GetPlace()), rows, row_numel,
           grad_merge.rows().size(), lazy_mode);
       if (lazy_mode) {
+        std::vector<int64_t> id_vector;
         size_t row_count = grad_merge.rows().size();
         for (size_t row_index = 0; row_index < row_count; ++row_index) {
           for (size_t offset = 0; offset < row_numel; ++offset) {
             size_t i = rows[row_index] * row_numel + offset;
-            T g = grad_data[row_index * row_numel + offset];
-            functor.adam_update(i, g);
+            id_vector.push_back(i);
           }
         }
+        platform::ForRangeIn<DeviceContext> for_range_in(
+            static_cast<const DeviceContext&>(ctx.device_context()),
+            id_vector);
+        for_range_in(functor);
       } else {
         platform::ForRange<DeviceContext> for_range(
             static_cast<const DeviceContext&>(ctx.device_context()),
diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h
index c153e80fe42..9fbaa36723b 100644
--- a/paddle/fluid/platform/for_range.h
+++ b/paddle/fluid/platform/for_range.h
@@ -13,11 +13,38 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#include <vector>
+
+#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace platform {
 
+template <typename DeviceContext>
+struct ForRangeIn {
+  ForRangeIn(const DeviceContext& dev_ctx, std::vector<int64_t> range);
+
+  template <typename Function>
+  void operator()(Function func) const;
+};
+
+template <>
+struct ForRangeIn<CPUDeviceContext> {
+  ForRangeIn(const CPUDeviceContext& dev_ctx, std::vector<int64_t> range)
+      : range_(range) {}
+
+  template <typename Function>
+  void operator()(Function func) const {
+    for (auto i : range_) {
+      func(i);
+    }
+  }
+
+  std::vector<int64_t> range_;
+};
+
 template <typename DeviceContext>
 struct ForRange {
   ForRange(const DeviceContext& dev_ctx, size_t limit);
@@ -79,6 +106,34 @@ struct ForRange {
   int limit_;
 };
 
+template <typename T, typename Function>
+__global__ static void ForRangeInElemwiseOp(Function func, T* vector,
+                                            int vector_size) {
+  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
+  if (idx < vector_size) {
+    func(vector[idx]);
+  }
+}
+
+template <>
+struct ForRangeIn<CUDADeviceContext> {
+  ForRange(const CUDADeviceContext& dev_ctx, std::vector<int64_t> range)
+      : dev_ctx_(dev_ctx), range_(range) {}
+
+  template <typename Function>
+  inline void operator()(Function func) const {
+    constexpr int num_threads = 1024;
+    int block_size = range_.size() <= num_threads ? limit_ : num_threads;
+    int grid_size = (range_.size() + num_threads - 1) / num_threads;
+
+    ForRangeInElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
+        func, range_.data(), range_.size());
+  }
+
+  const CUDADeviceContext& dev_ctx_;
+  framework::Vector<int64_t> range_;
+};
+
 #endif
 
 }  // namespace platform
--
GitLab


From 01dd9061a007dea207d304d1099cfe012a47fcb7 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Mon, 17 Dec 2018 10:42:21 +0800
Subject: [PATCH 0268/2367] add avx support for windows

test=develop
---
 CMakeLists.txt                                |   2 -
 paddle/fluid/operators/math/cpu_vec.h         |   3 -
 .../math/detail/activation_functions.h        |   6 +-
 .../operators/math/detail/avx_functions.cc    |   4 +-
 .../fluid/operators/math/detail/avx_mathfun.h | 731 ++++++++++++++++++
 paddle/fluid/operators/math/jit_code.cc       |  39 +-
 paddle/fluid/operators/math/jit_code.h        |   1 -
 .../operators/math/jit_kernel_crf_decode.cc   |   7 +-
 .../operators/math/jit_kernel_layer_norm.cc   |   9 +-
 9 files changed, 757 insertions(+), 43 deletions(-)
 create mode 100644 paddle/fluid/operators/math/detail/avx_mathfun.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1594e798a2b..653ae4ffe53 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,8 +131,6 @@ if (APPLE OR WIN32)
 endif()
 
 if (WIN32)
-    set(WITH_AVX OFF CACHE STRING
-        "Disable AVX when compiling for Windows" FORCE)
     set(WITH_DSO OFF CACHE STRING
         "Disable DSO when compiling for Windows" FORCE)
     set(WITH_MKL OFF CACHE STRING
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index e1e4d168db3..57726956cfb 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -18,9 +18,6 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
index 2b3d38d95a1..24df1f93edd 100644
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -15,14 +15,10 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <string>
-
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc
index 5641f914523..022ffc53377 100644
--- a/paddle/fluid/operators/math/detail/avx_functions.cc
+++ b/paddle/fluid/operators/math/detail/avx_functions.cc
@@ -14,10 +14,8 @@ limitations under the License. */
 
 #ifdef __AVX__
 
-#include <immintrin.h>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
-// TODO(qingqing) refine this dependence
-#include "paddle/legacy/cuda/src/avx_mathfun.h"
+#include "paddle/fluid/operators/math/detail/avx_mathfun.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/detail/avx_mathfun.h b/paddle/fluid/operators/math/detail/avx_mathfun.h
new file mode 100644
index 00000000000..d7cf91134e4
--- /dev/null
+++ b/paddle/fluid/operators/math/detail/avx_mathfun.h
@@ -0,0 +1,731 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + AVX implementation of sin, cos, sincos, exp and log + + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + + Copyright (C) 2012 Giovanni Garberoglio + Interdisciplinary Laboratory for Computational Science (LISC) + Fondazione Bruno Kessler and University of Trento + via Sommarive, 18 + I-38123 Trento (Italy) + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#include "paddle/fluid/platform/cpu_info.h" + +/* __m128 is ugly to write */ +typedef __m256 v8sf; // vector of 8 float (avx) +typedef __m256i v8si; // vector of 8 int (avx) +typedef __m128i v4si; // vector of 8 int (avx) + +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, \ + Val, Val} + +_PI32AVX_CONST(1, 1); +_PI32AVX_CONST(inv1, ~1); +_PI32AVX_CONST(2, 2); +_PI32AVX_CONST(4, 4); + +/* declare some AVX constants -- why can't I figure a better way to do that? 
*/ +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} + +_PS256_CONST(1, 1.0f); +_PS256_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST256(0, 0); +_PI32_CONST256(1, 1); +_PI32_CONST256(inv1, ~1); +_PI32_CONST256(2, 2); +_PI32_CONST256(4, 4); +_PI32_CONST256(0x7f, 0x7f); + +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2); +_PS256_CONST(cephes_log_p1, -1.1514610310E-1); +_PS256_CONST(cephes_log_p2, 1.1676998740E-1); +_PS256_CONST(cephes_log_p3, -1.2420140846E-1); +_PS256_CONST(cephes_log_p4, +1.4249322787E-1); +_PS256_CONST(cephes_log_p5, -1.6668057665E-1); +_PS256_CONST(cephes_log_p6, +2.0000714765E-1); +_PS256_CONST(cephes_log_p7, -2.4999993993E-1); +_PS256_CONST(cephes_log_p8, +3.3333331174E-1); +_PS256_CONST(cephes_log_q1, -2.12194440e-4); +_PS256_CONST(cephes_log_q2, 0.693359375); + +#ifndef __AVX2__ + +typedef union imm_xmm_union { + v8si imm; + v4si xmm[2]; +} imm_xmm_union; + +#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ + { \ + imm_xmm_union ALIGN32_BEG u ALIGN32_END; \ + u.imm = imm_; \ + xmm0_ = u.xmm[0]; \ + xmm1_ = u.xmm[1]; \ + } + +#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ + { \ + imm_xmm_union ALIGN32_BEG u ALIGN32_END; \ + u.xmm[0] = xmm0_; \ + u.xmm[1] = xmm1_; \ + imm_ = u.imm; \ + } + +#define AVX2_BITOP_USING_SSE2(fn) \ + static inline v8si avx2_mm256_##fn(v8si x, int a) { \ + /* use SSE2 instruction to perform the bitop AVX2 */ \ + v4si x1, x2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + x1 = _mm_##fn(x1, a); \ + x2 = _mm_##fn(x2, a); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } + +//#warning "Using SSE2 to perform AVX2 bitshift ops" +AVX2_BITOP_USING_SSE2(slli_epi32) +AVX2_BITOP_USING_SSE2(srli_epi32) + +#define AVX2_INTOP_USING_SSE2(fn) \ + static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \ + /* use SSE2 instructions to perform the AVX2 integer operation */ \ + v4si x1, x2; \ + v4si y1, y2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + COPY_IMM_TO_XMM(y, y1, y2); \ + x1 = _mm_##fn(x1, y1); \ + x2 = _mm_##fn(x2, y2); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } + +//#warning "Using SSE2 to perform AVX2 integer ops" +AVX2_INTOP_USING_SSE2(and_si128) +AVX2_INTOP_USING_SSE2(andnot_si128) +AVX2_INTOP_USING_SSE2(cmpeq_epi32) +AVX2_INTOP_USING_SSE2(sub_epi32) +AVX2_INTOP_USING_SSE2(add_epi32) +#define avx2_mm256_and_si256 avx2_mm256_and_si128 +#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128 +#else +#define avx2_mm256_slli_epi32 _mm256_slli_epi32 +#define avx2_mm256_srli_epi32 _mm256_srli_epi32 +#define avx2_mm256_and_si256 _mm256_and_si256 +#define avx2_mm256_andnot_si256 _mm256_andnot_si256 +#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32 +#define avx2_mm256_sub_epi32 _mm256_sub_epi32 +#define avx2_mm256_add_epi32 _mm256_add_epi32 +#endif /* __AVX2__ */ + +/* natural logarithm computed for 8 
simultaneous float + return NaN for x <= 0 +*/ +v8sf log256_ps(v8sf x) { + v8si imm0; + v8sf one = *(v8sf *)_ps256_1; + + // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); + + x = _mm256_max_ps( + x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); + + /* keep only the fractional part */ + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); + + // this is again another AVX2 instruction + imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); + v8sf e = _mm256_cvtepi32_ps(imm0); + + e = _mm256_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); + v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf tmp = _mm256_and_ps(x, mask); + x = _mm256_sub_ps(x, one); + e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); + x = _mm256_add_ps(x, tmp); + + v8sf z = _mm256_mul_ps(x, x); + + v8sf y = *(v8sf *)_ps256_cephes_log_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); + y = _mm256_mul_ps(y, x); + + y = _mm256_mul_ps(y, z); + + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); + y = _mm256_add_ps(y, tmp); + + tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + y = _mm256_sub_ps(y, tmp); + + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); + x = _mm256_add_ps(x, y); + x = _mm256_add_ps(x, tmp); + x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); + +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS256_CONST(cephes_exp_C1, 0.693359375); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); + +v8sf exp256_ps(v8sf x) { + v8sf tmp = _mm256_setzero_ps(), fx; + v8si imm0; + v8sf one = *(v8sf *)_ps256_1; + + x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); + + /* how to perform a floorf with SSE: just below */ + // imm0 = _mm256_cvttps_epi32(fx); + // tmp = _mm256_cvtepi32_ps(imm0); + + tmp = _mm256_floor_ps(fx); + + /* if greater, substract 1 */ + // v8sf mask = _mm256_cmpgt_ps(tmp, fx); + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm256_and_ps(mask, one); + fx = _mm256_sub_ps(tmp, mask); + + tmp = _mm256_mul_ps(fx, *(v8sf 
*)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); + x = _mm256_sub_ps(x, tmp); + x = _mm256_sub_ps(x, z); + + z = _mm256_mul_ps(x, x); + + v8sf y = *(v8sf *)_ps256_cephes_exp_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, x); + y = _mm256_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm256_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); + imm0 = avx2_mm256_slli_epi32(imm0, 23); + v8sf pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} + +_PS256_CONST(minus_cephes_DP1, -0.78515625); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS256_CONST(sincof_p0, -1.9515295891E-4); +_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p2, -1.6666654611E-1); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p1, -1.388731625493765E-003); +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + +/* evaluation of 8 sines at onces using AVX intrisics + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + +*/ +v8sf sin256_ps(v8sf x) { // any x + v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; + v8si imm0, imm2; + +#ifndef __AVX2__ + v4si imm0_1, imm0_2; + v4si imm2_1, imm2_2; +#endif + + sign_bit = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + +/* + Here we start a series of integer operations, which are in the + realm of AVX2. 
+ If we don't have AVX, let's perform them using SSE2 directives +*/ + +#ifdef __AVX2__ + /* store the integer part of y in mm0 */ + imm2 = _mm256_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + y = _mm256_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4= 256 diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index e2b47614355..6d22bf67572 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -47,7 +47,6 @@ extern const float exp_float_consts[]; extern const int exp_int_0x7f[]; extern int g_tmp_mem[]; -#define ALIGN32 __attribute__((aligned(32))) #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f #define CEPHES_LOG2EF 1.44269504088896341 diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index eeb305a88be..ac2d29f1c18 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -16,9 +16,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" -#ifdef __AVX__ -#include -#endif namespace paddle { namespace operators { @@ -133,8 +130,8 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { /* AVX instructions.*/ \ __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); \ __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); \ - __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); \ - __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); \ + __m128i lo_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 0); \ + __m128i hi_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 1); \ lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); \ hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); \ lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); \ diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc index cb49e66488b..e21092037a2 100644 --- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc +++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc @@ -13,9 +13,6 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" -#ifdef __AVX__ -#include -#endif namespace paddle { namespace operators { @@ -121,7 +118,7 @@ class LayerNormKernelImpl : public LayerNormKernel { if (rest_ != 0) { \ j = offset + this->num_ - block; \ tmp = _mm256_loadu_ps((const float*)x + j); \ - tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec); \ + tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \ sum = _mm256_add_ps(sum, tmp); \ } \ hi = _mm256_extractf128_ps(sum, 1); \ @@ -145,7 +142,7 @@ class LayerNormKernelImpl : public LayerNormKernel { j = offset + this->num_ - block; \ tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \ tmp = _mm256_mul_ps(tmp, tmp); \ - tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec); \ + tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \ sum = _mm256_add_ps(sum, tmp); \ } \ hi = _mm256_extractf128_ps(sum, 1); \ -- GitLab From e196fa367bc6087f08bfce44bdc194ed426c69cf Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 17 Dec 2018 10:52:05 +0800 Subject: [PATCH 0269/2367] update ut, test=develop --- .../unittests/test_nce_remote_table_op.py | 271 ++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py new file mode 100644 index 00000000000..f08b270d89b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -0,0 +1,271 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
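A note on the cast changes in the jit_kernel_crf_decode.cc and jit_kernel_layer_norm.cc hunks above: GCC and Clang accept C-style conversions between AVX vector types, e.g. `(__m256)mask_vec`, as a vector extension, but MSVC does not, which is why the Windows port switches to reading the bits back through a reinterpreted pointer. A self-contained illustration of the portable idiom (the helper name is ours, not Paddle's):

    #include <immintrin.h>

    // Bit-reinterpret an integer AVX register as a float register.
    // MSVC rejects "(__m256)ivec"; both forms below compile on MSVC
    // and on GCC/Clang, and neither emits any instructions.
    static inline __m256 as_ps(const __m256i& ivec) {
      return *reinterpret_cast<const __m256*>(&ivec);  // what the patch does
      // equivalently: return _mm256_castsi256_ps(ivec);
    }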
+ +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
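+                # os.stat doubles as a readiness probe: it keeps raising
+                # OSError (caught as os.error below) until the pserver has
+                # finished initialization and written out its chosen port.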
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_nce_op_one_pserver(self, place, port): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port)] + table_names = ['table'] + height_sections = [2] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i != 3: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), 0).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def _run_nce_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + 
label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i < 2: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), i + 9).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def test_nce_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self._run_nce_op_one_pserver(place, port0) + self._run_nce_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() -- GitLab From 5553c0b0da5884a062e5b7b136c30eb12a7d4d6b Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 10:24:46 +0800 Subject: [PATCH 0270/2367] add API.spec test=develop --- paddle/fluid/API.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e156945147f..fe2ee3f98d4 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -37,14 +37,14 @@ paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=Non paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')) paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) paddle.fluid.AsyncExecutor.get_instance 
ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) -- GitLab From 763e8fdf02ebe00b845680b264b7a5c6a56b61ae Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 11:17:10 +0800 Subject: [PATCH 0271/2367] fix compile error --- paddle/fluid/platform/for_range.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 9fbaa36723b..a767bf92993 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -117,17 +117,18 @@ __global__ static void ForRangeInElemwiseOp(Function func, T* vector, template <> struct ForRangeIn { - ForRange(const CUDADeviceContext& dev_ctx, std::vector range) + ForRangeIn(const CUDADeviceContext& dev_ctx, std::vector range) : dev_ctx_(dev_ctx), range_(range) {} template inline void operator()(Function func) const { constexpr int num_threads = 1024; - int block_size = range_.size() <= num_threads ? limit_ : num_threads; + int range_size = range_.size(); + int block_size = range_size <= num_threads ? range_size : num_threads; int grid_size = (range_.size() + num_threads - 1) / num_threads; ForRangeInElemwiseOp<<>>( - func, range_.data(), range_.size()); + func, range_.data(), range_size); } const CUDADeviceContext& dev_ctx_; -- GitLab From e439257ef7880e9b0b19d7b0c7ef8965fc180279 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Dec 2018 11:24:19 +0800 Subject: [PATCH 0272/2367] Fix include style test=develop --- paddle/fluid/framework/tensor.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 6ddc07af9a7..6a1cbe5cd56 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -14,15 +14,14 @@ limitations under the License. 
*/
 
 #pragma once
 
-#include <cstdint>
 #include <cstring>
 #include <memory>
 #include <typeindex>
 #include <utility>
 #include <vector>
-
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
--
GitLab


From d519fd69441be6bc1bf1d921d23144094a90bfb8 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Mon, 17 Dec 2018 11:38:54 +0800
Subject: [PATCH 0273/2367] test=develop

---
 paddle/fluid/platform/cpu_info.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 55dba545ff1..70224f94671 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -16,6 +16,32 @@ limitations under the License. */
 
 #include <stddef.h>
 
+#ifdef _WIN32
+#if defined(__AVX2__)
+#include <intrin.h>  //avx2
+#elif defined(__AVX__)
+#include <intrin.h>  //avx
+#endif  // AVX
+#else   // WIN32
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+#endif  // WIN32
+
+#if defined(_WIN32)
+#define ALIGN32_BEG __declspec(align(32))
+#define ALIGN32_END
+#else
+#define ALIGN32_BEG
+#define ALIGN32_END __attribute__((aligned(32)))
+#endif  // _WIN32
+
+#if defined(_WIN32)
+#if defined(__AVX2__) || defined(__AVX__)
+inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); }
+#endif
+#endif
+
 namespace paddle {
 namespace platform {
--
GitLab


From bd0067b26cd899b63bbf314aa3ba5f3ac22327e6 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Mon, 17 Dec 2018 12:39:33 +0800
Subject: [PATCH 0274/2367] Polish code

test=develop
---
 paddle/fluid/operators/math/selected_rows_functor.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index b87c9461e88..0d63f641c86 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -275,7 +275,8 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
 
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::SelectedRows& input,
-                  framework::SelectedRows* output) {
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
     framework::Vector<int64_t> input_rows(input.rows());
     if (input_rows.size() == 0) {
      return;
@@ -313,7 +314,8 @@ void operator()(const platform::CUDADeviceContext& context,
                   const std::vector<const framework::SelectedRows*>& inputs,
-                  framework::SelectedRows* output) {
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
     if (inputs.size() == 0) {
       VLOG(3) << "no input! return";
--
GitLab


From fd144954ed06128bab0b8b99cdb6722cc52881ba Mon Sep 17 00:00:00 2001
From: Yancey1989
Date: Mon, 17 Dec 2018 13:13:53 +0800
Subject: [PATCH 0275/2367] redefine api

test=develop
---
 paddle/fluid/API.spec                          |  1 -
 .../fluid/framework/details/build_strategy.cc  |  4 +-
 .../fluid/framework/details/build_strategy.h   |  2 +
 .../framework/details/execution_strategy.h     |  2 +-
 .../details/parallel_ssa_graph_executor.cc     |  1 -
 paddle/fluid/framework/ir/node.h               |  1 -
 paddle/fluid/framework/parallel_executor.cc    | 29 +++++-----
 paddle/fluid/pybind/pybind.cc                  | 43 +++++++-------
 .../unittests/parallel_executor_test_base.py   | 41 +++++++-------
 .../unittests/test_parallel_executor_mnist.py  | 56 +++++++++++--------
 .../test_parallel_executor_seresnext.py        | 10 ++--
 .../test_parallel_executor_transformer.py      |  4 +-
 12 files changed, 101 insertions(+), 93 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index da3b2f63474..8e6482ca981 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -26,7 +26,6 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara
 paddle.fluid.DistributeTranspilerConfig.__init__ 
 paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
 paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
-paddle.fluid.ExecutionStrategy.ExecutorType.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy.ExecutorType, arg0: int) -> None
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
 paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index d8526b3f249..e9688ea2763 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -26,7 +26,9 @@ namespace framework {
 namespace details {
 
 static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
-  return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1);
+  return (!strategy.enable_sequential_execution_ &&
+          strategy.num_trainers_ > 1) ||
+         strategy.enable_parallel_graph_;
 }
 
 class ParallelExecutorPassBuilder : public ir::PassBuilder {
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index c97be169575..f66ecd80f17 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -73,6 +73,8 @@ struct BuildStrategy {
 
   bool fuse_broadcast_op_{false};
 
+  bool enable_parallel_graph_{false};
+
   int num_trainers_{1};
   int trainer_id_{0};
   std::vector<std::string> trainers_endpoints_;
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index d3d5b6bf541..15c496130c2 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++
b/paddle/fluid/framework/details/execution_strategy.h @@ -20,7 +20,7 @@ namespace framework { namespace details { struct ExecutionStrategy { - enum ExecutorType { kDefault = 0, kExperimental = 1, kParallelGraph = 2 }; + enum ExecutorType { kDefault = 0, kExperimental = 1 }; size_t num_threads_{0}; bool use_cuda_{true}; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 214c2f76255..845c4379e6f 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -29,7 +29,6 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( graphs_(std::move(graphs)) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); // do not use threadpool for each graph execution. - strategy_.num_threads_ = 1UL; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f19..10ae3a1c748 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -49,7 +49,6 @@ class Node { public: virtual ~Node() { if (!wrapper_.empty()) { - VLOG(4) << "ir::Node deleting a wrapper node " << Name(); wrapper_deleter_(); } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 63f3ef0eacc..152b9b27025 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -199,7 +199,7 @@ ParallelExecutor::ParallelExecutor( "the number of places must be greater than 1."); } - if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + if (build_strategy.enable_parallel_graph_) { PADDLE_ENFORCE( member_->use_all_reduce_, "build_strategy.reduce should be `AllReduce` if you want to use" @@ -231,7 +231,7 @@ ParallelExecutor::ParallelExecutor( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; - if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + if (build_strategy.enable_parallel_graph_) { // parallel graph mode should initialize nccl by ncclCommInitRank since // it call nccl operator per device per thread. if (nccl_id_var == nullptr) { @@ -265,7 +265,7 @@ ParallelExecutor::ParallelExecutor( // ncclOp std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + if (build_strategy.enable_parallel_graph_) { for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, params, @@ -287,9 +287,8 @@ ParallelExecutor::ParallelExecutor( #endif auto max_memory_size = GetEagerDeletionThreshold(); - // TODO(Yancey1989): fix gc failed on ParallelGraph executor. - if (max_memory_size >= 0 && - exec_strategy.type_ != ExecutionStrategy::kParallelGraph) { + // TODO(Yancey1989): fix gc failed on ParallelGraph strategy. 
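+  // With enable_parallel_graph_ there is one graph per place (built in the
+  // loop above), while PrepareGCAndRefCnts below only handles graphs[0],
+  // so eager deletion is skipped in that mode until the TODO is resolved.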
+ if (max_memory_size >= 0 && !build_strategy.enable_parallel_graph_) { graphs[0] = member_->PrepareGCAndRefCnts( std::move(graphs[0]), static_cast(max_memory_size)); } @@ -323,18 +322,20 @@ ParallelExecutor::ParallelExecutor( } } - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); - } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + if (build_strategy.enable_parallel_graph_) { member_->executor_.reset(new details::ParallelSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, std::move(graphs))); } else { - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } else { + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1fa91114a8c..866a5137de2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -761,11 +761,6 @@ All parameter, weight, gradient are variables in Paddle. )DOC"); - py::enum_(exec_strategy, "ExecutorType") - .value("Default", ExecutionStrategy::ExecutorType::kDefault) - .value("Experimental", ExecutionStrategy::ExecutorType::kExperimental) - .value("ParallelGraph", ExecutionStrategy::ExecutorType::kParallelGraph); - exec_strategy.def(py::init()) .def_property( "num_threads", @@ -823,25 +818,17 @@ All parameter, weight, gradient are variables in Paddle. [](const ExecutionStrategy &self) { return self.dry_run_; }, [](ExecutionStrategy &self, bool dry_run) { self.dry_run_ = dry_run; - }) - .def_property( - "executor_type", - [](const ExecutionStrategy &self) { return self.type_; }, - [](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) { - self.type_ = type; - }, - R"DOC(The type is ExecutorType which is the enum ranging from Default, -ParallelGraph and Experiment: - -Default: Compile the main_program into a multi-devices graph, - and execute this graph on multi-devices with multiple threads which - specified by build_strategy.num_threads. -ParallelGraph: Compile the main_program into multiple graphs, and execute each of the graphs on one - device with one thread. Please note, this mode only supports all-reduce mode and use_cuda=True. - This approach can achieve better performance in some scenarios. -Experimental: Compile the main_program into a multi-devices graph, - and executor this graph with a faster execution mode than the Default, - this approach is on the experiments.)DOC"); + }); + + exec_strategy.def_property( + "use_experimental_executor", + [](const ExecutionStrategy &self) { + return self.type_ == ExecutionStrategy::kExperimental; + }, + [](ExecutionStrategy &self, bool experimental) { + self.type_ = experimental ? 
ExecutionStrategy::kExperimental + : ExecutionStrategy::kDefault; + }); py::class_ build_strategy(pe, "BuildStrategy", R"DOC( BuildStrategy allows the user to more preciously control how to @@ -964,6 +951,14 @@ Experimental: Compile the main_program into a multi-devices graph, R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") + .def_property( + "enable_parallel_graph", + [](const BuildStrategy &self) { return self.enable_parallel_graph_; }, + [](BuildStrategy &self, bool b) { self.enable_parallel_graph_ = b; }, + R"DOC(The type is BOOL, if set True, ParallelExecutor would build the main_program into multiple graphs, + each of the graphs would run with one device. This approach can achieve better performance in + some scenarios. Please note, this approach only supports all-reduce mode + on GPU device)DOC") .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 73b8fb74fa3..4e50614515b 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -26,26 +26,24 @@ import sys __all__ = ['TestParallelExecutorBase'] -ExecutorType = fluid.ExecutionStrategy().ExecutorType - class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence( - self, - method, - use_cuda=True, - memory_opt=True, - iter=50, - batch_size=None, - allow_op_delay=False, - feed_dict=None, - seed=None, - use_parallel_executor=True, - use_reduce=False, - fuse_elewise_add_act_ops=False, - optimizer=fluid.optimizer.Adam, - exec_type=fluid.ExecutionStrategy().ExecutorType.Default, - enable_sequential_execution=False): + def check_network_convergence(self, + method, + use_cuda=True, + memory_opt=True, + iter=50, + batch_size=None, + allow_op_delay=False, + feed_dict=None, + seed=None, + use_parallel_executor=True, + use_reduce=False, + use_parallel_graph=False, + fuse_elewise_add_act_ops=False, + optimizer=fluid.optimizer.Adam, + use_fast_executor=False, + enable_sequential_execution=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -61,8 +59,8 @@ class TestParallelExecutorBase(unittest.TestCase): startup = fluid.Program() startup.random_seed = 1 # Fix random seed main.random_seed = 1 - scope = fluid.Scope() - with fluid.scope_guard(scope): + self.scope = fluid.Scope() + with fluid.scope_guard(self.scope): with fluid.program_guard(main, startup): if seed is not None: startup.random_seed = seed @@ -80,13 +78,14 @@ class TestParallelExecutorBase(unittest.TestCase): startup_exe.run(startup) exec_strategy = fluid.ExecutionStrategy() exec_strategy.allow_op_delay = allow_op_delay - exec_strategy.executor_type = exec_type + exec_strategy.use_experimental_executor = use_fast_executor build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.enable_sequential_execution = enable_sequential_execution + build_strategy.enable_parallel_graph = use_parallel_graph if use_cuda and 
core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index fffe8bee580..c8ac6a90c1b 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -20,7 +20,7 @@ import numpy as np import paddle.fluid.core as core import os import paddle.fluid as fluid -from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType +from parallel_executor_test_base import TestParallelExecutorBase def simple_fc_net(use_feed): @@ -79,30 +79,32 @@ class TestMNIST(TestParallelExecutorBase): return img, label = self._init_data() - + """ all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=False) + """ reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=True) - + """ for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) for loss in zip(all_reduce_last_loss, reduce_last_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) + """ # simple_fc def check_simple_fc_convergence(self, use_cuda, use_reduce=False, - exec_type=ExecutorType.Default): + use_parallel_graph=False): if use_cuda and not core.is_compiled_with_cuda(): return @@ -114,20 +116,24 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_reduce=use_reduce, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) - def test_simple_fc(self): + def notest_simple_fc(self): # use_cuda - self.check_simple_fc_convergence(True, ExecutorType.Default) - self.check_simple_fc_convergence(True, ExecutorType.ParallelGraph) + if core.is_compiled_with_cuda(): + self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence( + True, use_reduce=False, use_parallel_graph=True) self.check_simple_fc_convergence(False) - def test_simple_fc_with_new_strategy(self): + def notest_simple_fc_with_new_strategy(self): # use_cuda, use_reduce self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) - def check_simple_fc_parallel_accuracy(self, use_cuda, exec_type): + def check_simple_fc_parallel_accuracy(self, + use_cuda, + use_parallel_graph=False): if use_cuda and not core.is_compiled_with_cuda(): return @@ -140,7 +146,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_parallel_executor=False, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1, @@ -148,7 +154,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_parallel_executor=True, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) self.assertAlmostEquals( np.mean(parallel_first_loss), @@ -157,17 +163,20 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals( np.mean(parallel_last_loss), single_last_loss, delta=1e-6) - def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(True, ExecutorType.Default) - self.check_simple_fc_parallel_accuracy(True, ExecutorType.ParallelGraph) + def notest_simple_fc_parallel_accuracy(self): + if 
core.is_compiled_with_cuda(): + self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy( + True, use_parallel_graph=True) # FIXME(Yancey1989): ParallelGraph executor type does not support CPU mode yet - self.check_simple_fc_parallel_accuracy(False, ExecutorType.Default) + self.check_simple_fc_parallel_accuracy(False) - def check_batchnorm_fc_convergence(self, use_cuda, exec_type): + def check_batchnorm_fc_convergence(self, + use_cuda, + use_fast_executor, + use_parallel_graph=False): if use_cuda and not core.is_compiled_with_cuda(): return - if not use_cuda and exec_type == ExecutorType.ParallelGraph: - return img, label = self._init_data() @@ -176,13 +185,14 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - exec_type=exec_type) + use_fast_executor=use_fast_executor, + use_parallel_graph=use_parallel_graph) def test_batchnorm_fc(self): for use_cuda in (False, True): - for exec_type in (ExecutorType.Default, ExecutorType.Experimental, - ExecutorType.ParallelGraph): - self.check_batchnorm_fc_convergence(use_cuda, exec_type) + for use_fast_executor in (False, True): + self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) + self.check_batchnorm_fc_convergence(use_cuda, False, True) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporarily. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index bada38894f7..531c99a8358 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -19,7 +19,7 @@ import paddle.fluid.layers.ops as ops from paddle.fluid.initializer import init_on_cpu from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter import paddle.fluid.core as core -from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType +from parallel_executor_test_base import TestParallelExecutorBase import unittest import math import os @@ -282,7 +282,7 @@ class TestResnet(TestParallelExecutorBase): use_reduce=False, iter=20, delta2=1e-6, - exec_type=ExecutorType.Default, + use_parallel_graph=False, lr_scale=1.0): if use_cuda and not core.is_compiled_with_cuda(): return @@ -303,7 +303,7 @@ class TestResnet(TestParallelExecutorBase): use_reduce=use_reduce, optimizer=optimizer(), use_parallel_executor=False, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) parallel_first_loss, parallel_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -313,7 +313,7 @@ class TestResnet(TestParallelExecutorBase): use_cuda=use_cuda, use_reduce=use_reduce, optimizer=optimizer(lr_scale=lr_scale), - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) self.assertAlmostEquals( np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) @@ -327,7 +327,7 @@ class TestResnet(TestParallelExecutorBase): self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=True, - exec_type=ExecutorType.ParallelGraph, + use_parallel_graph=True, lr_scale=core.get_cuda_device_count()) self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 8a1a3ab3cae..c3ac9d92b45 100644 ---
a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -17,7 +17,7 @@ from __future__ import print_function import paddle.fluid as fluid import transformer_model import numpy as np -from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType +from parallel_executor_test_base import TestParallelExecutorBase import unittest import paddle import paddle.fluid.core as core @@ -175,6 +175,8 @@ class TestTransformer(TestParallelExecutorBase): self.check_network_convergence(transformer, use_cuda=True) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) + self.check_network_convergence( + transformer, use_cuda=True, use_parallel_graph=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) -- GitLab From 728e7e88fb2c3467f6e28ef968b4e720d290b26c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 13:37:57 +0800 Subject: [PATCH 0276/2367] Use xxHash as scope's hash algorithm test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/scope.h | 26 ++++++++++++++++++++------ python/paddle/fluid/profiler.py | 2 +- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cea4a448574..5dca5ac5988 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -82,7 +82,7 @@ cc_test(variable_test SRCS variable_test.cc) cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash) cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index b1abe75d765..4f79d982609 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -201,7 +201,7 @@ void Scope::RenameInternal(const std::string& origin_name, auto new_it = vars_.find(new_name); PADDLE_ENFORCE(new_it == vars_.end(), "The variable with name %s is already in the scope", new_name); - vars_[new_name].reset(origin_it.value().release()); + vars_[new_name].reset(origin_it->second.release()); vars_.erase(origin_it); } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index b232d267dbc..77ef18414d0 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,15 +14,18 @@ limitations under the License. */ #pragma once +extern "C" { +#include +} + #include #include #include #include +#include #include #include -#include // NOLINT - #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -35,6 +38,14 @@ bool IsFastEagerDeletionModeEnabled(); class Scope; +namespace inner { +struct KeyHasher { + std::size_t operator()(const std::string& key) const { + return XXH32(key.c_str(), key.size(), 1); + } +}; +} // namespace inner + /** * @brief Scope that manage all variables. 
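 *
 * A note on inner::KeyHasher above: variable names are hashed with xxHash's
 * XXH32, called as XXH32(key.c_str(), key.size(), 1) with a fixed seed of 1.
 * XXH32 is a fast, non-cryptographic 32-bit hash; collisions are still
 * resolved by the unordered_map buckets, so only lookup speed, not
 * correctness, depends on hash quality.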
* @@ -99,11 +110,14 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable tsl::robin_map< - std::string, std::unique_ptr, std::hash, - std::equal_to, - std::allocator>>, true> + mutable std::unordered_map, + inner::KeyHasher> vars_; + // mutable tsl::robin_map< + // std::string, std::unique_ptr, std::hash, + // std::equal_to, + // std::allocator>>, true> + // vars_; private: // Call Scope::NewScope for a sub-scope. diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 8df2e01b037..78f7a6ac085 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -93,7 +93,7 @@ def cuda_profiler(output_file, output_mode=None, config=None): with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) #Comment this for nvprof - #core.nvprof_init(output_file, output_mode, config_file) + core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() yield -- GitLab From 4de1a8bd9d55469f0612cf8f60b749681a5d657c Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Dec 2018 14:15:27 +0800 Subject: [PATCH 0277/2367] Remove unused cmake log test=develop --- cmake/external/python.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 52ad02a3551..623c53f4f75 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -79,6 +79,5 @@ IF(PYTHONINTERP_FOUND) "please use pip to upgrade protobuf. pip install -U protobuf") ENDIF() ENDIF(PYTHONINTERP_FOUND) -message(STATUS ${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) -- GitLab From a7d6b1f92141f398cb442e3f5eee99d3ac156265 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 14:17:26 +0800 Subject: [PATCH 0278/2367] code cleanup test=develop --- .../framework/details/computation_op_handle.h | 1 - .../details/multi_devices_graph_pass.cc | 1 + .../scope_buffered_ssa_graph_executor.cc | 2 ++ .../details/threaded_ssa_graph_executor.h | 1 - paddle/fluid/framework/details/var_handle.cc | 2 +- paddle/fluid/framework/ir/node.h | 1 + .../unittests/test_parallel_executor_dry_run.py | 10 ++++------ .../unittests/test_parallel_executor_mnist.py | 17 +++++++++-------- 8 files changed, 18 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 5b8b70c5641..601ae4f8c6d 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -17,7 +17,6 @@ #include #include -#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 5b82805ad93..2ab7da2d57c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -134,6 +134,7 @@ static const char kParams[] = "params"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; static const char kNumTrainers[] = "num_trainers"; +static const char kNumLossScaled[] = "num_loss_scaled"; void 
MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index edb7b5e70ac..f4320790876 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -41,10 +41,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; + for (auto &info : var_infos_) { if (scope->FindVar(info.name_) != nullptr) { continue; } + if (info.persistable_) { // Persistable InitializeVariable(scope->Var(info.name_), info.type_); } else { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index b45afbc0461..24da56c09e3 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,7 +24,6 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 7de6025a28a..30da029ca2a 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -20,7 +20,7 @@ namespace details { VarHandleBase::~VarHandleBase() {} -VarHandle::~VarHandle() { VLOG(5) << "deleting var handle " << DebugString(); } +VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); } std::string VarHandle::DebugString() const { std::stringstream ss; diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 10ae3a1c748..d2a393b3f19 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -49,6 +49,7 @@ class Node { public: virtual ~Node() { if (!wrapper_.empty()) { + VLOG(4) << "ir::Node deleting a wrapper node " << Name(); wrapper_deleter_(); } } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index eff76ce0d49..18d95c94ad3 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -17,8 +17,6 @@ import unittest import logging import six -ExecutorType = fluid.ExecutionStrategy().ExecutorType - class TestBase(unittest.TestCase): def main(self, @@ -26,7 +24,7 @@ class TestBase(unittest.TestCase): iter=10, iter_per_pe=10, use_gpu=True, - exec_type=ExecutorType.Default): + use_experimental_executor=False): if use_gpu and not fluid.core.is_compiled_with_cuda(): logging.warning( "Paddle is not compiled with CUDA, skip GPU unittests") @@ -45,7 +43,7 @@ class TestBase(unittest.TestCase): for _ in six.moves.xrange(iter): exe_strategy = fluid.ExecutionStrategy() exe_strategy._dry_run = True - exe_strategy.executor_type = exec_type + exe_strategy.use_experimental_executor = use_experimental_executor pe = fluid.ParallelExecutor( use_cuda=use_gpu, loss_name=loss.name, @@ -58,11 +56,11 @@ class 
TestBase(unittest.TestCase): class TestMNISTDryRun(TestBase): def test_mnist_dry_run(self): for use_gpu in (False, True): - for exec_type in (ExecutorType.Default, ExecutorType.Experimental): + for use_experimental_executor in (False, True): self.main( network_func=TestMNISTDryRun.network_func, use_gpu=use_gpu, - exec_type=exec_type) + use_experimental_executor=use_experimental_executor) @staticmethod def network_func(): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index c8ac6a90c1b..7d2349fad4c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -79,26 +79,25 @@ class TestMNIST(TestParallelExecutorBase): return img, label = self._init_data() - """ + all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=False) - """ + reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=True) - """ + for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) for loss in zip(all_reduce_last_loss, reduce_last_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) - """ # simple_fc def check_simple_fc_convergence(self, @@ -118,7 +117,7 @@ class TestMNIST(TestParallelExecutorBase): use_reduce=use_reduce, use_parallel_graph=use_parallel_graph) - def notest_simple_fc(self): + def test_simple_fc(self): # use_cuda if core.is_compiled_with_cuda(): self.check_simple_fc_convergence(True) @@ -126,7 +125,7 @@ class TestMNIST(TestParallelExecutorBase): True, use_reduce=False, use_parallel_graph=True) self.check_simple_fc_convergence(False) - def notest_simple_fc_with_new_strategy(self): + def test_simple_fc_with_new_strategy(self): # use_cuda, use_reduce self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) @@ -163,7 +162,7 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals( np.mean(parallel_last_loss), single_last_loss, delta=1e-6) - def notest_simple_fc_parallel_accuracy(self): + def test_simple_fc_parallel_accuracy(self): if core.is_compiled_with_cuda(): self.check_simple_fc_parallel_accuracy(True) self.check_simple_fc_parallel_accuracy( @@ -192,7 +191,9 @@ class TestMNIST(TestParallelExecutorBase): for use_cuda in (False, True): for use_fast_executor in (False, True): self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) - self.check_batchnorm_fc_convergence(use_cuda, False, True) + + self.check_batchnorm_fc_convergence( + use_cuda=True, use_fast_executor=False, use_parallel_graph=True) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporarily.
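A minimal usage sketch of the two knobs these tests now exercise, assuming the fluid Python API as it stands after this series (ExecutionStrategy.use_experimental_executor replaces ExecutorType.Experimental, BuildStrategy.enable_parallel_graph replaces ExecutorType.ParallelGraph; `loss` and `feed_dict` stand for a loss variable and feed data built by the caller's network, not objects defined in these patches):

    import paddle.fluid as fluid

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = True  # the fast executor

    build_strategy = fluid.BuildStrategy()
    # One graph per device; per the BuildStrategy doc above, this works
    # only with the all-reduce strategy on GPU devices.
    build_strategy.enable_parallel_graph = True
    build_strategy.reduce_strategy = \
        fluid.BuildStrategy.ReduceStrategy.AllReduce

    pe = fluid.ParallelExecutor(use_cuda=True,
                                loss_name=loss.name,
                                exec_strategy=exec_strategy,
                                build_strategy=build_strategy)
    first_loss, = pe.run(feed=feed_dict, fetch_list=[loss.name])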
-- GitLab From 24eb8f038c9e7c5fddac7ebd5ececd0459e2250d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 15:03:17 +0800 Subject: [PATCH 0279/2367] Fix bug test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 46 ++++++++++++++------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index c9e27b75472..fa51c66eeb7 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -279,26 +279,42 @@ struct SparseAdamFunctor { T beta1_pow = *beta1_pow_; T beta2_pow = *beta2_pow_; lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - for (size_t i = 0U, j = 0U; i != numel; ++i) { - T mom1 = moment1_[i]; - T mom2 = moment2_[i]; - T p = param_[i]; + size_t row_count = numel / row_numel_; - // Calculation + for (size_t i = 0U, j = 0U; i != row_count; ++i) { if (i == *(rows_ + j)) { - T g = grad_[j * row_numel_]; - mom1 = beta1_ * mom1 + (1 - beta1_) * g; - mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + for (size_t k = 0U; k != row_numel_; ++k) { + T mom1 = moment1_[i * row_numel_ + k]; + T mom2 = moment2_[i * row_numel_ + k]; + T p = param_[i * row_numel_ + k]; + + T g = grad_[j * row_numel_ + k]; + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // Write back to global memory + moment1_out_[i * row_numel_ + k] = mom1; + moment2_out_[i * row_numel_ + k] = mom2; + param_out_[i * row_numel_ + k] = p; + } ++j; } else { - mom1 = beta1_ * mom1; - mom2 = beta2_ * mom2; + for (size_t k = 0U; k != row_numel_; ++k) { + T mom1 = moment1_[i * row_numel_ + k]; + T mom2 = moment2_[i * row_numel_ + k]; + T p = param_[i * row_numel_ + k]; + + mom1 = beta1_ * mom1; + mom2 = beta2_ * mom2; + + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // Write back to global memory + moment1_out_[i * row_numel_ + k] = mom1; + moment2_out_[i * row_numel_ + k] = mom2; + param_out_[i * row_numel_ + k] = p; + } } - p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); - // Write back to global memory - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - param_out_[i] = p; } } }; -- GitLab From 41456e172384667de0b8b272fa5714c96a73a297 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 17 Dec 2018 15:19:17 +0800 Subject: [PATCH 0280/2367] Remove the useless definition test=develop --- paddle/fluid/platform/cpu_info.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 70224f94671..c70e3be858f 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -36,12 +36,6 @@ limitations under the License. 
*/ #define ALIGN32_END __attribute__((aligned(32))) #endif // _WIN32 -#if defined(_WIN32) -#if defined(__AVX2__) || defined(__AVX__) -inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } -#endif -#endif - namespace paddle { namespace platform { -- GitLab From bc4f16ca6f4bcb938683086c0eb325366729cd25 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 10:24:46 +0800 Subject: [PATCH 0281/2367] remove some comments --- paddle/fluid/API.spec | 4 ++-- python/paddle/fluid/distributed/helper.py | 6 +----- python/paddle/fluid/distributed/node.py | 4 ---- python/paddle/fluid/distributed/ps_instance.py | 3 --- 4 files changed, 3 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e156945147f..fe2ee3f98d4 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -37,14 +37,14 @@ paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=Non paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')) paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index cdde5403cda..06d3d0315cf 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -28,7 +28,7 @@ class FileSystem(object): def __init__(self, fs_type="afs", - uri="afs://tianqi.afs.baidu.com:9902", + uri="afs://xx", user=None, passwd=None, hadoop_bin=""): @@ -37,10 +37,6 @@ 
class FileSystem(object): assert hadoop_bin != None import ps_pb2 as pslib self.fs_client = pslib.FsClientParameter() - #if fs_type == "afs": - # fs_client.fs_type = pslib.FsApiType.AFS - #else: - # fs_client.fs_type = pslib.FsApiType.HDFS self.fs_client.uri = uri self.fs_client.user = user self.fs_client.passwd = passwd diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 117da9cff82..41e0d64e0b7 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -75,8 +75,6 @@ class DownpourServer(Server): table.accessor.embedx_dim = 8 table.accessor.embedx_threshold = 5 table.accessor.fea_dim = 11 - #table.accessor.fea_dim = abs(reduce(lambda x, y: x * y, - # slot_value_var[0].shape, 1)) table.accessor.downpour_accessor_param.nonclk_coeff = 0.1 table.accessor.downpour_accessor_param.click_coeff = 2 table.accessor.downpour_accessor_param.base_threshold = 0.2 @@ -134,8 +132,6 @@ class DownpourWorker(Worker): def __init__(self, window): self.window = window self.worker_ = pslib.DownpourTrainerParameter() - #self.worker_.pull_dense_per_batch = window - #self.worker_.push_dense_per_batch = window def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_vars): diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index 91f53102b6c..d3ce3ce6934 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -59,9 +59,6 @@ class PaddlePSInstance(object): else: self._node_type = -1 - #if self._rankid == 0: - #print "node type: ", self._node_type - def _split_comm(self): if self.is_server(): self._comm = self.dh.comm.Split(self._node_type) -- GitLab From 5f0358add9767390b8bc329c97236f8d72ce758e Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 17 Dec 2018 16:18:10 +0800 Subject: [PATCH 0282/2367] async_executor stop add barrier_all & finalize --- python/paddle/fluid/async_executor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index bd32138651f..3181654feb8 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -237,6 +237,8 @@ class AsyncExecutor(object): if self.instance.is_first_worker(): self.executor.stop_server() self.instance.barrier_worker() #sync + self.instance.barrier_all() + self.instance.finalize() def init_server(self, dist_desc): """ -- GitLab From 4e3e68dfae0f72c848aea60771258b8600202368 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Mon, 17 Dec 2018 08:19:54 +0000 Subject: [PATCH 0283/2367] copy trt lib to inference lib test=develop --- cmake/inference_lib.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c679d8507d8..b2fb042f779 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -191,6 +191,13 @@ if (WITH_ANAKIN AND WITH_MKL) list(APPEND inference_deps anakin_inference_lib) endif () +if (TENSORRT_FOUND) + copy(tensorrt_lib DEPS ${inference_deps} + SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer* + DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib) +endif () + + set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* -- GitLab From 
96604fda1016bd91c25ace7e7510f0a746ed3797 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 16:59:20 +0800 Subject: [PATCH 0284/2367] fix gpu data test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 3 ++- paddle/fluid/platform/for_range.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index e8b977e2d96..01d3d600540 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -361,9 +361,10 @@ class AdamOpKernel : public framework::OpKernel { if (lazy_mode) { std::vector id_vector; size_t row_count = grad_merge.rows().size(); + std::vector cpu_rows(grad_merge.rows()); for (size_t row_index = 0; row_index < row_count; ++row_index) { for (size_t offset = 0; offset < row_numel; ++offset) { - size_t i = rows[row_index] * row_numel + offset; + size_t i = cpu_rows[row_index] * row_numel + offset; id_vector.push_back(i); } } diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index a767bf92993..ab00d8b8f57 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -128,7 +128,7 @@ struct ForRangeIn { int grid_size = (range_.size() + num_threads - 1) / num_threads; ForRangeInElemwiseOp<<>>( - func, range_.data(), range_size); + func, range_.CUDAData(dev_ctx_.GetPlace()), range_size); } const CUDADeviceContext& dev_ctx_; -- GitLab From aa41ee75a16509cb16793d7fdbbbfa3ce2dab69f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 17:13:26 +0800 Subject: [PATCH 0285/2367] Accelerate PADDLE_ENFORCE --- paddle/fluid/framework/operator.h | 12 ++++-- paddle/fluid/platform/enforce.h | 68 +++++++++++++++++++------------ 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bce..63a8bc574f3 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@"; /// e.g. Variable "x@GRAD" is the gradient of varibale "x". constexpr char kGradVarSuffix[] = "@GRAD"; +constexpr size_t kGradVarSuffixSize = 5U; + /// Variables with this suffix are supposed to be filled up with zeros. 
constexpr char kZeroVarSuffix[] = "@ZERO"; @@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; extern std::vector> kKernelPriority; inline std::string GradVarName(const std::string& var_name) { - return var_name + kGradVarSuffix; + std::string result; + result.reserve(var_name.size() + kGradVarSuffixSize); + result += var_name; + result += kGradVarSuffix; + return result; } proto::VarType::Type GetDataTypeOfVar(const Variable* var); @@ -101,8 +107,8 @@ class OperatorBase { bool HasAttr(const std::string& name) const { return attrs_.count(name); } template inline const T& Attr(const std::string& name) const { - PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); + PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(), + "%s should be in AttributeMap", name); return boost::get(attrs_.at(name)); } const AttributeMap& Attrs() const { return attrs_; } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 01ee67fd07f..3c03a902796 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -140,68 +140,72 @@ struct EOFException : public std::exception { #define LIKELY(condition) (condition) #endif +inline bool is_error(bool stat) { return !stat; } + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { - if (UNLIKELY(!(stat))) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } } #ifdef PADDLE_WITH_CUDA +inline bool is_error(cudaError_t e) { return UNLIKELY(e); } + template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { - if (UNLIKELY(e)) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { - if (stat != CURAND_STATUS_SUCCESS) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cudnnStatus_t stat) { + return stat != CUDNN_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cudnnStatus_t stat, const Args&... args) { - if (stat == CUDNN_STATUS_SUCCESS) { - return; - } else { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cublasStatus_t stat) { + return stat != CUBLAS_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cublasStatus_t stat, const Args&... 
args) { std::string err; - if (stat == CUBLAS_STATUS_SUCCESS) { - return; - } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { err = "CUBLAS: alloc failed, "; @@ -254,11 +258,21 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) +#define PADDLE_JUDGE + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(cond))) { \ + ::paddle::platform::throw_on_error(cond, ##__VA_ARGS__); \ + } \ + } while (0) + #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(...) \ +#define PADDLE_ENFORCE(COND, ...) \ do { \ try { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ @@ -266,7 +280,7 @@ inline void throw_on_error(T e) { } while (false) #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); +#define PADDLE_ENFORCE(COND, ...) __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG #define PADDLE_THROW_EOF() \ -- GitLab From 27a0d6c2dc7a1fb26ec3bfc0b44840300685b993 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 17:17:13 +0800 Subject: [PATCH 0286/2367] Polish code test=develop --- CMakeLists.txt | 1 - cmake/external/robin_map.cmake | 31 ------------------------------- paddle/fluid/framework/scope.h | 5 ----- python/paddle/fluid/profiler.py | 1 - 4 files changed, 38 deletions(-) delete mode 100644 cmake/external/robin_map.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b2e0ecf6c5..1594e798a2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,7 +209,6 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream -include(external/robin_map) # download tsl::robin_map if (NOT WIN32) # there is no official support of warpctc, nccl, cupti in windows diff --git a/cmake/external/robin_map.cmake b/cmake/external/robin_map.cmake deleted file mode 100644 index ddaf59536cb..00000000000 --- a/cmake/external/robin_map.cmake +++ /dev/null @@ -1,31 +0,0 @@ -include(ExternalProject) - -set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map) -set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include) - -include_directories(${ROBIN_MAP_INCLUDE_DIR}) - -ExternalProject_Add( - extern_robin_map - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Tessil/robin-map.git" - GIT_TAG "v0.5.0" - PREFIX ${ROBIN_MAP_SOURCE_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) - -if(${CMAKE_VERSION} VERSION_LESS "3.3.0") - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c) - file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") - add_library(robin_map STATIC ${dummyfile}) -else() - add_library(robin_map INTERFACE) -endif() - -add_dependencies(robin_map extern_robin_map) - -LIST(APPEND externl_project_dependencies robin_map) diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 77ef18414d0..9a715ac9b95 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -113,11 +113,6 @@ class Scope { mutable std::unordered_map, inner::KeyHasher> vars_; - // 
mutable tsl::robin_map< - // std::string, std::unique_ptr, std::hash, - // std::equal_to, - // std::allocator>>, true> - // vars_; private: // Call Scope::NewScope for a sub-scope. diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 78f7a6ac085..e05885f5f5b 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -92,7 +92,6 @@ def cuda_profiler(output_file, output_mode=None, config=None): config_file = 'nvprof_config_file' with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) - #Comment this for nvprof core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() -- GitLab From 49870f507de0d68fe23cd60479dab9da65d2d916 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 18:52:21 +0800 Subject: [PATCH 0287/2367] delete unused code test=develop --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 1 + paddle/fluid/framework/details/multi_devices_graph_pass.cc | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 5a4f218077d..59a0aef480b 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -51,6 +51,7 @@ void AllReduceOpHandle::RunImpl() { // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. #ifdef PADDLE_WITH_CUDA + // Find NCCL ID from the global scope. if (NoDummyInputSize() == 1 && local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) { #else diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 2ab7da2d57c..5b82805ad93 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -134,7 +134,6 @@ static const char kParams[] = "params"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; static const char kNumTrainers[] = "num_trainers"; -static const char kNumLossScaled[] = "num_loss_scaled"; void MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); -- GitLab From d3a4da5cf663a37d77af4670bdad85e06b32fae3 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 18:53:44 +0800 Subject: [PATCH 0288/2367] fix comment test=develop --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 59a0aef480b..6bca299813f 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -109,7 +109,7 @@ void AllReduceOpHandle::RunImpl() { buffer, buffer, numel, static_cast(dtype), ncclSum, comm, stream)); // TODO(Yancey1989): synchronize here can get better performance - // if don't use NCCL group call, but need more profileing. + // if don't use NCCL group call, but need more profiling. 
if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream); }); } -- GitLab From 1a6d2cfe395fdc99b919513d45a042e119b92e11 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 17 Dec 2018 19:04:09 +0800 Subject: [PATCH 0289/2367] add test_analyzer_mobilenet test=develop --- paddle/fluid/inference/tests/api/CMakeLists.txt | 9 ++++++++- .../inference/tests/api/analyzer_vis_tester.cc | 14 -------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 8a4bc04b678..5862fedb9aa 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -85,12 +85,19 @@ if (NOT EXISTS ${OCR_INSTALL_DIR}) endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) +# mobilenet with transpose +set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") +if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) + inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") +endif() +inference_analysis_api_test(test_analyzer_mobilenet ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) + # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") # mobilenet with depthwise_conv op -inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet +inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") # anakin diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index adaa338e289..4700afdc86c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -93,20 +93,6 @@ void profile(bool use_mkldnn = false) { SetInput(&input_slots_all); TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); - - if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { - const float ocr_result_data[] = { - 5.273636460856323538e-08, 3.296741795111302054e-07, - 1.873261190610264748e-08, 3.403730275408634043e-08, - 3.383312474625199684e-08}; - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); - PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); - for (size_t i = 0; i < std::min(5UL, size); i++) { - EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3); - } - } } TEST(Analyzer_vis, profile) { profile(); } -- GitLab From addded48e1c94e0299774012399bf4f4ab773544 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 17 Dec 2018 19:31:25 +0800 Subject: [PATCH 0290/2367] test=develop (#14898) --- paddle/fluid/operators/distributed/grpc_server.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index c3974138f4d..cda102e78d2 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -488,7 +488,7 @@ void AsyncGRPCServer::HandleRequest( while (true) { VLOG(4) << "HandleRequest " << rpc_name << " wait next"; if (!cq->Next(&tag, &ok)) { - VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!"; + LOG(WARNING) << 
"CompletionQueue " << rpc_name << " shutdown!"; break; } @@ -511,9 +511,8 @@ void AsyncGRPCServer::HandleRequest( // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I if (!ok) { - LOG(WARNING) << "completion queue:" << rpc_name - << " recv no regular event" - << " context:" << base->Status2String(rpc_name); + VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" + << " context:" << base->Status2String(rpc_name); TryToRegisterNewOne(rpc_name, req_id); delete base; continue; -- GitLab From 1141db811455eadc6b44bbb3785b0510f1f51870 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 19:32:32 +0800 Subject: [PATCH 0291/2367] update test_adam_op test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 1 + .../fluid/tests/unittests/test_adam_op.py | 30 ++++++++++++------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 01d3d600540..8fc6689ff1a 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -358,6 +358,7 @@ class AdamOpKernel : public framework::OpKernel { lr.template data(), grad_data, param.template data(), param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, grad_merge.rows().size(), lazy_mode); + VLOG(3) << "lazy_mode :" << lazy_mode; if (lazy_mode) { std::vector id_vector; size_t row_count = grad_merge.rows().size(); diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 461196689c9..ff7fc5100eb 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -219,14 +219,25 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, moment2_out = np.zeros(shape=[height, row_numel]) param_out = np.zeros(shape=[height, row_numel]) - for idx, row_id in enumerate(rows): + def update_row(row_id, update_value): moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1 - ) * np_grad[idx] + ) * update_value moment2_out[row_id] = beta2 * moment2[row_id] + ( - 1 - beta2) * np.square(np_grad[idx]) + 1 - beta2) * np.square(update_value) lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / ( np.sqrt(moment2_out[row_id]) + epsilon)) + + if lazy_mode: + for idx, row_id in enumerate(rows): + update_row(row_id, np_grad[idx]) + else: + for row_id in range(param_out.shape[0]): + update_value = np.zeros(np_grad[0].shape).astype("float32") + if row_id in rows: + update_value = np_grad[rows.index(row_id)] + update_row(row_id, update_value) + return param_out, moment1_out, moment2_out @@ -249,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase): 'Beta2Pow': np.array([beta2**10]).astype("float32"), "LearningRate": np.full((1), 2.0).astype("float32") } + self.init_output = np.full((height, row_numel), 0.0).astype("float32") self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} grad_selected_rows = scope.var('Grad').get_selected_rows() @@ -286,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase): op_args[s] = s for s in self.outputs: var = scope.var(s).get_tensor() - var.set(self.outputs[s], place) + var.set(self.init_output, place) op_args[s] = s for k in self.attrs: op_args[k] = self.attrs[k] @@ -300,13 +312,9 @@ class TestSparseAdamOp(unittest.TestCase): actual = np.array(out_var) actual = 
actual.reshape([actual.size]) np_array = np_array.reshape([np_array.size]) - for idx, row_id in enumerate(self.rows): - j = 0 - while j < self.row_numel: - pos = row_id * self.row_numel + j - self.assertLess((actual[pos] - np_array[pos]) / actual[pos], - 0.00001) - j += 1 + + for i in range(np_array.size): + self.assertLess((actual[i] - np_array[i]), 0.00001) def test_sparse_adam(self): places = [core.CPUPlace()] -- GitLab From 64a90b2f1c762dc4a093da413b9c945c99b82e73 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 17 Dec 2018 12:29:22 +0000 Subject: [PATCH 0292/2367] use vadd, vaddrelu, lstm and gru jitkernel --- paddle/fluid/operators/fused/fusion_gru_op.cc | 58 ++++++++--------- .../fluid/operators/fused/fusion_lstm_op.cc | 62 ++++++++++--------- paddle/fluid/operators/math/CMakeLists.txt | 9 --- paddle/fluid/operators/math/fc_compute.h | 14 ++--- 4 files changed, 68 insertions(+), 75 deletions(-) diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 25b7ae7c282..d44a7ad83e8 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_gru_op.h" #include // for memcpy #include +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/fc_compute.h" -#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/sequence2batch.h" namespace paddle { @@ -183,27 +183,29 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const math::jitkernel::gru_attr_t attr( \ - D, ctx.Attr("gate_activation"), \ - ctx.Attr("activation")); \ - math::jitkernel::gru_t one_step; \ - const auto& ker = \ - math::jitkernel::KernelPool::Instance() \ - .template Get, \ - const math::jitkernel::gru_attr_t&>(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = \ + jit::Get(attr); \ + auto ComputeHtPart1 = \ + jit::Get(attr); \ + auto ComputeHtPart2 = \ + jit::Get(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { @@ -242,7 +244,7 @@ class FusionGRUKernel : public framework::OpKernel { } else { one_step.gates = xx_data; one_step.ht = hidden_out_data; - ker->ComputeH1(&one_step, &attr); + ComputeH1(&one_step, &attr); prev_hidden_data = hidden_out_data; tstart = 1; move_step(); 
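The jit::Get calls above resolve a kernel function pointer once per operator and then invoke it like a plain function. A minimal sketch of that dispatch, assuming the jit::Get<KernelType, KernelTuples, PlaceType> helper plus the gru_t/gru_attr_t structs and vsigmoid/vtanh enum values this series introduces (RunGRUH1 itself is a hypothetical wrapper, not a function from the patch):

    #include "paddle/fluid/operators/jit/kernels.h"

    namespace jit = paddle::operators::jit;
    namespace platform = paddle::platform;

    // Run one GRU "H1" step: gates holds 3 * d gate pre-activations,
    // ht receives the d-wide first hidden state.
    void RunGRUH1(float* gates, float* ht, int d) {
      const jit::gru_attr_t attr(d, jit::vsigmoid, jit::vtanh);
      auto h1 = jit::Get<jit::gruh1, jit::GRUTuples<float>,
                         platform::CPUPlace>(attr);  // resolve once
      jit::gru_t step;
      step.gates = gates;
      step.ht = ht;
      h1(&step, &attr);  // picks the best registered impl for this attr
    }

The lstm kernels and the vadd/vaddrelu calls in fc_compute.h below follow the same resolve-once pattern.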
@@ -255,12 +257,12 @@ class FusionGRUKernel : public framework::OpKernel { one_step.gates = xx_data; one_step.ht_1 = prev_hidden_data; one_step.ht = hidden_out_data; - ker->ComputeHtPart1(&one_step, &attr); + ComputeHtPart1(&one_step, &attr); // gemm rt * Ws blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast(1), hidden_out_data, D, wh_state_data, D, static_cast(1), xx_data + D2, D3); - ker->ComputeHtPart2(&one_step, &attr); + ComputeHtPart2(&one_step, &attr); // save prev prev_hidden_data = hidden_out_data; move_step(); @@ -324,7 +326,7 @@ class FusionGRUKernel : public framework::OpKernel { for (int i = 0; i < max_bs; ++i) { one_step.gates = cur_in_data; one_step.ht = cur_out_data; - ker->ComputeH1(&one_step, &attr); + ComputeH1(&one_step, &attr); // add offset cur_in_data += D3; cur_out_data += D; @@ -352,7 +354,7 @@ class FusionGRUKernel : public framework::OpKernel { one_step.gates = cur_batched_data; one_step.ht_1 = cur_prev_hidden_data; one_step.ht = cur_out_data; - ker->ComputeHtPart1(&one_step, &attr); + ComputeHtPart1(&one_step, &attr); cur_batched_data += D3; cur_prev_hidden_data += D; @@ -370,7 +372,7 @@ class FusionGRUKernel : public framework::OpKernel { one_step.gates = cur_batched_data; one_step.ht_1 = cur_prev_hidden_data; one_step.ht = cur_out_data; - ker->ComputeHtPart2(&one_step, &attr); + ComputeHtPart2(&one_step, &attr); cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 8021a896cea..a62f4d18c2b 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_lstm_op.h" #include +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/fc_compute.h" -#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/sequence2batch.h" namespace paddle { @@ -236,31 +236,33 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const math::jitkernel::lstm_attr_t attr( \ - D, ctx.Attr("gate_activation"), \ - ctx.Attr("candidate_activation"), \ - ctx.Attr("cell_activation"), use_peepholes); \ - math::jitkernel::lstm_t one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - const auto& ker = \ - math::jitkernel::KernelPool::Instance() \ - .template Get, \ - const math::jitkernel::lstm_attr_t&>(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + 
checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit::lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + jit::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = \ + jit::Get(attr); \ + auto ComputeCtHt = \ + jit::Get(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ @@ -306,7 +308,7 @@ class FuisonLSTMKernel : public framework::OpKernel { one_step.gates = xx_data; one_step.ct = c_out_data; one_step.ht = h_out_data; - ker->ComputeC1H1(&one_step, &attr); + ComputeC1H1(&one_step, &attr); tstart = 1; // move one step prev_h_data = h_out_data; @@ -322,7 +324,7 @@ class FuisonLSTMKernel : public framework::OpKernel { one_step.ct_1 = prev_c_data; one_step.ct = c_out_data; one_step.ht = h_out_data; - ker->ComputeCtHt(&one_step, &attr); + ComputeCtHt(&one_step, &attr); // move one step prev_h_data = h_out_data; prev_c_data = c_out_data; @@ -402,7 +404,7 @@ class FuisonLSTMKernel : public framework::OpKernel { one_step.gates = cur_in_data; one_step.ct = cur_c_out_data; one_step.ht = cur_h_out_data; - ker->ComputeC1H1(&one_step, &attr); + ComputeC1H1(&one_step, &attr); cur_in_data += D4; cur_c_out_data += D; @@ -432,7 +434,7 @@ class FuisonLSTMKernel : public framework::OpKernel { one_step.ct_1 = cur_prev_c_data; one_step.ct = cur_c_out_data; one_step.ht = cur_h_out_data; - ker->ComputeCtHt(&one_step, &attr); + ComputeCtHt(&one_step, &attr); // move one batch cur_in_data += D4; diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 8e8f83a6353..ea6aebd291e 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -73,12 +73,3 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) - -# set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc) -# set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) -# if(WITH_XBYAK) -# list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) -# list(APPEND JIT_KERNEL_DEPS xbyak) -# endif() -# cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) -# cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 5b9953a5aa9..5e3093c69d3 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -14,8 +14,8 @@ limitations under the License.
*/ #pragma once +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/jit_kernel.h" namespace paddle { namespace operators { @@ -30,22 +30,20 @@ inline void FCCompute(const BlasT& blas, const int M, return; } if (relu) { - const auto& vaddrelu = jitkernel::KernelPool::Instance() - .template Get>(N); + auto compute = + jit::Get(N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; - vaddrelu->Compute(B, dst, dst, N); + compute(B, dst, dst, N); } } else { - const auto& vadd = jitkernel::KernelPool::Instance() - .template Get>(N); - + auto compute = jit::Get(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif for (int i = 0; i < M; i++) { T* dst = Y + i * N; - vadd->Compute(B, dst, dst, N); + compute(B, dst, dst, N); } } } -- GitLab From 720b55cbcff16832671873e409b5aa41b1cec1ef Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 17 Dec 2018 12:30:18 +0000 Subject: [PATCH 0293/2367] enable crf decoding and layer norm refer code --- paddle/fluid/operators/crf_decoding_op.h | 9 +-- paddle/fluid/operators/jit/helper.cc | 4 + paddle/fluid/operators/jit/kernel_base.h | 19 ++++- .../fluid/operators/jit/refer/CMakeLists.txt | 2 + paddle/fluid/operators/jit/refer/refer.cc | 3 + paddle/fluid/operators/jit/refer/refer.h | 80 +++++++++++++++++++ paddle/fluid/operators/jit/test.cc | 2 +- paddle/fluid/operators/layer_norm_op.h | 14 ++-- 8 files changed, 119 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index e9d2e84a434..860d71e1fe6 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/jit_kernel.h" +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/math_function.h" namespace paddle { @@ -82,10 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - const auto& ker = math::jitkernel::KernelPool::Instance() - .template Get>( - static_cast(tag_num)); - ker->Compute(static_cast(seq_len), x, w, alpha_value, track_value); + auto ker = jit::Get( + tag_num); + ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; for (size_t i = 0; i < tag_num; ++i) { diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 0543b0743c0..a0ff82043fc 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -42,6 +42,8 @@ const char* to_string(KernelType kt) { ONE_CASE(gruh1); ONE_CASE(gruhtpart1); ONE_CASE(gruhtpart2); + ONE_CASE(crfdecoding); + ONE_CASE(layernorm); default: PADDLE_THROW("Not support type: %d", kt); return "NOT JITKernel"; @@ -64,6 +66,8 @@ KernelType to_kerneltype(const std::string& act) { } else if (lower == "tanh" || lower == "vtanh") { return vtanh; } + PADDLE_THROW("Not support type: %s, or forget to add this case", act); + return non_kernel; } diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index f10d9f3fdd6..59531c2f17c 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -37,7 +37,9 @@ typedef enum { lstmc1h1, gruh1, 
gruhtpart1, - gruhtpart2 + gruhtpart2, + crfdecoding, + layernorm } KernelType; template @@ -109,6 +111,21 @@ struct GRUTuples { typedef void (*func_type)(gru_t*, const gru_attr_t*); }; +template +struct CRFDecodingTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const int, const T*, const T*, T*, int*, int); +}; + +template +struct LayerNormTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int, + const float, int); +}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 78d1cb8f9a7..f3a0e9b11f6 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -23,3 +23,5 @@ USE_JITKERNEL_REFER(lstmc1h1) USE_JITKERNEL_REFER(gruh1) USE_JITKERNEL_REFER(gruhtpart1) USE_JITKERNEL_REFER(gruhtpart2) +USE_JITKERNEL_REFER(crfdecoding) +USE_JITKERNEL_REFER(layernorm) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index c99174a66f3..00daa0d4786 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -42,4 +42,7 @@ REGISTER_REFER_KERNEL(gruh1, GRUH1); REGISTER_REFER_KERNEL(gruhtpart1, GRUHtPart1); REGISTER_REFER_KERNEL(gruhtpart2, GRUHtPart2); +REGISTER_REFER_KERNEL(crfdecoding, CRFDecoding); +REGISTER_REFER_KERNEL(layernorm, LayerNorm); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index a9a6ffbccd8..5780ea05bdf 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -13,6 +13,9 @@ * limitations under the License. 
*/ #pragma once + +#include +#include #include "paddle/fluid/operators/jit/helper.h" #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/enforce.h" @@ -242,6 +245,80 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { } } +template +void CRFDecoding(const int seq_len, const T* x, const T* w, T* alpha, + int* track, int right) { + constexpr int state_trans_base_idx = 2; + for (int i = 0; i < right; ++i) { + alpha[i] = w[i] + x[i]; + } + for (int k = 1; k < seq_len; ++k) { + for (int i = 0; i < right; ++i) { + T max_score = -std::numeric_limits::max(); + int max_j = 0; + for (int j = 0; j < right; ++j) { + T score = alpha[(k - 1) * right + j] + + w[(j + state_trans_base_idx) * right + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + alpha[k * right + i] = max_score + x[k * right + i]; + track[k * right + i] = max_j; + } + } +} + +template +void LayerNorm(T* x, T* out, T* mean, T* var, const T* scale, const T* bias, + int height, const float epsilon, int right) { + // get mean + for (int i = 0; i < height; i++) { + T sum = 0.0; + int offset = i * right; + for (int j = 0; j < right; j++) { + sum += x[offset + j]; + } + mean[i] = sum / right; + } + + // get variance + for (int i = 0; i < height; i++) { + T sum = 0.0; + int offset = i * right; + for (int j = 0; j < right; j++) { + sum += (x[offset + j] - mean[i]) * (x[offset + j] - mean[i]); + } + var[i] = sum / right; + } + + for (int i = 0; i < height; i++) { + int offset = i * right; + T sqrt_var = std::sqrt(var[i] + (T)epsilon); + for (int j = 0; j < right; j++) { + out[offset + j] = (x[offset + j] - mean[i]) / sqrt_var; + } + } + if (scale) { + for (int i = 0; i < height; i++) { + int offset = i * right; + for (int j = 0; j < right; j++) { + out[offset + j] *= scale[j]; + } + } + } + + if (bias) { + for (int i = 0; i < height; i++) { + int offset = i * right; + for (int j = 0; j < right; j++) { + out[offset + j] += bias[j]; + } + } + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -275,6 +352,9 @@ DECLARE_REFER_KERNEL(GRUH1, GRUTuples); DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples); DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples); +DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples); +DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 36f8eb6e7b6..85eadea7516 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -515,7 +515,7 @@ TEST(JITKernel, gruhtpart2) { TestGRUKernel(); } -// TODO(TJ): refine the tests template +// TODO(yihua/TJ): add crf decoding and layer norm unit tests TEST(JITKernel, pool) { // TODO(TJ): add some test diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index 78d20ddf5fd..bb00ed47293 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/blas.h" #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) -#include "paddle/fluid/operators/math/jit_kernel.h" +#include "paddle/fluid/operators/jit/kernels.h" #endif #include "paddle/fluid/operators/math/math_function.h" @@ -229,12 +229,12 @@ class LayerNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(scale->numel(), right); PADDLE_ENFORCE_EQ(bias->numel(), right); - const auto& ker = math::jitkernel::KernelPool::Instance() - .template Get>( - static_cast(right)); - ker->Compute(x.data(), out.data(), mean->data(), var->data(), - scale->data(), bias->data(), static_cast(left), - static_cast(epsilon)); + auto ker = + jit::Get( + right); + ker(x.data(), out.data(), mean->data(), var->data(), + scale->data(), bias->data(), static_cast(left), + static_cast(epsilon), right); #endif } }; -- GitLab From 2dd55b873fcad8fb7e06963d6ea08ba17e7ce1b7 Mon Sep 17 00:00:00 2001 From: shippingwang Date: Mon, 17 Dec 2018 13:08:02 +0000 Subject: [PATCH 0294/2367] Add shuffle_channel_op --- paddle/fluid/operators/shuffle_channel_op.cc | 126 +++++++++++ paddle/fluid/operators/shuffle_channel_op.cu | 24 ++ paddle/fluid/operators/shuffle_channel_op.h | 101 +++++++++ python/paddle/fluid/layers/nn.py | 213 ++++++------------ .../fluid/tests/unittests/test_layers.py | 9 + .../unittests/test_shuffle_channel_op.py | 54 +++++ 6 files changed, 385 insertions(+), 142 deletions(-) create mode 100644 paddle/fluid/operators/shuffle_channel_op.cc create mode 100644 paddle/fluid/operators/shuffle_channel_op.cu create mode 100644 paddle/fluid/operators/shuffle_channel_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc new file mode 100644 index 00000000000..ec1255af168 --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -0,0 +1,126 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/shuffle_channel_op.h" + +namespace paddle { +namespace operators { + +class ShuffleChannelOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx - > HasInput("X"), + "Input(X) of ShuffleChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Out"), + "Output(Out) of ShuffleChannelOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + // ENFORCE group + auto group = ctx->Attrs().Get>("group"); + ctx->SetOutputDim("Out", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.GetPlace()); + } +}; + +class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), " + "the input feature data of ShuffleChannelOp, the layout is NCHW."); + AddOutput("Out", + "(Tensor, default Tensor), the output of " + "ShuffleChannelOp. The layout is NCHW."); + AddAttr("group", "the number of groups.") + .SetDefault(1) + .AddCustomChecker([](const int& group) { + PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0."); + }); + + AddComment(R"DOC( + Shuffle Channel operator + This operator obtains the group convolutional layer with channels shuffled. + First, divide the input channels in each group into several subgroups, + then, feed each group in the next layer with different subgroups. + + According to the paper, "Suppose a convolution layer with g groups + whose output has g x n channels, first reshape the output channel dimension into(g,n), + transposing and then flattening it back as the input of next layer. " + + Shuffle channel operation makes it possible to build more powerful structures + with multiple group convolutional layers. 
+
+    please get more information from the following paper:
+    https://arxiv.org/pdf/1707.01083.pdf
+    )DOC");
+  }
+};
+
+// Grad
+
+class ShuffleChannelOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@Grad) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@Grad) should not be null");
+
+    auto input_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::Tensor>(framework::GradVarName("Out"))
+                ->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// how to write the gpu kernel
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp,
+                  ops::ShuffleChannelOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+// paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    shuffle_channel,
+    ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    shuffle_channel_grad,
+    ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu
new file mode 100644
index 00000000000..b1eacd0cbe4
--- /dev/null
+++ b/paddle/fluid/operators/shuffle_channel_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/shuffle_channel_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    shuffle_channel,
+    ops::ShuffleChannelOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ShuffleChannelOpKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    shuffle_channel_grad,
+    ops::ShuffleChannelGradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ShuffleChannelGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h
new file mode 100644
index 00000000000..f923babf5b8
--- /dev/null
+++ b/paddle/fluid/operators/shuffle_channel_op.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class ShuffleChannelOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<framework::Tensor>("X");
+    auto* output = ctx.Output<framework::Tensor>("Out");
+    int group = ctx.Attr<int>("group");
+
+    auto input_dims = input->dims();
+    auto num = input_dims[0];
+    auto channel = input_dims[1];
+    auto height = input_dims[2];
+    auto weight = input_dims[3];
+
+    auto feature_map_size = channel * height * weight;
+    auto sp_sz = height * weight;
+
+    int group_row = group;
+    int group_column = channel / group_row;
+
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+    for (int n = 0; n < num; ++n) {
+      const T* input_data_temp = input_data + n * feature_map_size;
+      T* output_data_temp = output_data + n * feature_map_size;
+      // move subgroup block (i, j) of the (group_row x group_column) grid
+      // to position (j, i), i.e. a transpose of the channel grid
+      for (int i = 0; i < group_row; ++i) {
+        for (int j = 0; j < group_column; ++j) {
+          const T* p_i = input_data_temp + (i * group_column + j) * sp_sz;
+          T* p_o = output_data_temp + (j * group_row + i) * sp_sz;
+          memcpy(p_o, p_i, sizeof(T) * sp_sz);
+        }
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ShuffleChannelGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<framework::Tensor>("X");
+    int group = ctx.Attr<int>("group");
+
+    auto input_dims = input->dims();
+    auto num = input_dims[0];
+    auto channel = input_dims[1];
+    auto height = input_dims[2];
+    auto weight = input_dims[3];
+    auto feature_map_size = channel * height * weight;
+    auto sp_sz = height * weight;
+
+    int group_row = group;
+    int group_column = channel / group_row;
+
+    auto* output_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* input_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+    const T* output_grad_data = output_grad->data<T>();
+
+    for (int n = 0; n < num; ++n) {
+      const T* output_grad_temp = output_grad_data + n * feature_map_size;
+      T* input_grad_temp = input_grad_data + n * feature_map_size;
+      // the backward pass applies the inverse permutation: the gradient of
+      // input block (i, j) is read from output block (j, i)
+      for (int i = 0; i < group_row; ++i) {
+        for (int j = 0; j < group_column; ++j) {
+          const T* p_i = output_grad_temp + (j * group_row + i) * sp_sz;
+          T* p_o = input_grad_temp + (i * group_column + j) * sp_sz;
+          memcpy(p_o, p_i, sizeof(T) * sp_sz);
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index e25eaaa9fda..5e1b6c999bc 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -31,148 +31,37 @@
 from functools import reduce
 from .. 
import core __all__ = [ - 'fc', - 'embedding', - 'dynamic_lstm', - 'dynamic_lstmp', - 'dynamic_gru', - 'gru_unit', - 'linear_chain_crf', - 'crf_decoding', - 'cos_sim', - 'cross_entropy', - 'bpr_loss', - 'square_error_cost', - 'chunk_eval', - 'sequence_conv', - 'conv2d', - 'conv3d', - 'sequence_pool', - 'sequence_softmax', - 'softmax', - 'pool2d', - 'pool3d', - 'batch_norm', - 'beam_search_decode', - 'conv2d_transpose', - 'conv3d_transpose', - 'sequence_expand', - 'sequence_expand_as', - 'sequence_pad', - 'sequence_unpad', - 'lstm_unit', - 'reduce_sum', - 'reduce_mean', - 'reduce_max', - 'reduce_min', - 'reduce_prod', - 'sequence_first_step', - 'sequence_last_step', - 'sequence_slice', - 'dropout', - 'split', - 'ctc_greedy_decoder', - 'edit_distance', - 'l2_normalize', - 'matmul', - 'topk', - 'warpctc', - 'sequence_reshape', - 'transpose', - 'im2sequence', - 'nce', - 'hsigmoid', - 'beam_search', - 'row_conv', - 'multiplex', - 'layer_norm', - 'group_norm', - 'softmax_with_cross_entropy', - 'smooth_l1', - 'one_hot', - 'autoincreased_step_counter', - 'reshape', - 'squeeze', - 'unsqueeze', - 'lod_reset', - 'lrn', - 'pad', - 'pad_constant_like', - 'label_smooth', - 'roi_pool', - 'roi_align', - 'dice_loss', - 'image_resize', - 'image_resize_short', - 'resize_bilinear', - 'resize_nearest', - 'gather', - 'scatter', - 'sequence_scatter', - 'random_crop', - 'mean_iou', - 'relu', - 'selu', - 'log', - 'crop', - 'rank_loss', - 'margin_rank_loss', - 'elu', - 'relu6', - 'pow', - 'stanh', - 'hard_sigmoid', - 'swish', - 'prelu', - 'brelu', - 'leaky_relu', - 'soft_relu', - 'flatten', - 'sequence_mask', - 'stack', - 'pad2d', - 'unstack', - 'sequence_enumerate', - 'expand', - 'sequence_concat', - 'scale', - 'elementwise_add', - 'elementwise_div', - 'elementwise_sub', - 'elementwise_mul', - 'elementwise_max', - 'elementwise_min', - 'elementwise_pow', - 'uniform_random_batch_size_like', - 'gaussian_random', - 'sampling_id', - 'gaussian_random_batch_size_like', - 'sum', - 'slice', - 'shape', - 'logical_and', - 'logical_or', - 'logical_xor', - 'logical_not', - 'clip', - 'clip_by_norm', - 'mean', - 'mul', - 'sigmoid_cross_entropy_with_logits', - 'maxout', - 'space_to_depth', - 'affine_grid', - 'sequence_reverse', - 'affine_channel', - 'similarity_focus', - 'hash', - 'grid_sampler', - 'log_loss', - 'add_position_encoding', - 'bilinear_tensor_product', - 'merge_selected_rows', - 'get_tensor_from_selected_rows', - 'lstm', + 'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', + 'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy', + 'bpr_loss', 'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', + 'conv3d', 'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', + 'pool3d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', + 'conv3d_transpose', 'sequence_expand', 'sequence_expand_as', 'sequence_pad', + 'sequence_unpad', 'lstm_unit', 'reduce_sum', 'reduce_mean', 'reduce_max', + 'reduce_min', 'reduce_prod', 'sequence_first_step', 'sequence_last_step', + 'sequence_slice', 'dropout', 'split', 'ctc_greedy_decoder', 'edit_distance', + 'l2_normalize', 'matmul', 'topk', 'warpctc', 'sequence_reshape', + 'transpose', 'im2sequence', 'nce', 'hsigmoid', 'beam_search', 'row_conv', + 'multiplex', 'layer_norm', 'group_norm', 'softmax_with_cross_entropy', + 'smooth_l1', 'one_hot', 'autoincreased_step_counter', 'reshape', 'squeeze', + 'unsqueeze', 'lod_reset', 'lrn', 'pad', 'pad_constant_like', 'label_smooth', + 'roi_pool', 'roi_align', 'dice_loss', 'image_resize', 
'image_resize_short',
+    'resize_bilinear', 'resize_nearest', 'gather', 'scatter',
+    'sequence_scatter', 'random_crop', 'mean_iou', 'relu', 'selu', 'log',
+    'crop', 'rank_loss', 'margin_rank_loss', 'elu', 'relu6', 'pow', 'stanh',
+    'hard_sigmoid', 'swish', 'prelu', 'brelu', 'leaky_relu', 'soft_relu',
+    'flatten', 'sequence_mask', 'stack', 'pad2d', 'unstack',
+    'sequence_enumerate', 'expand', 'sequence_concat', 'scale',
+    'elementwise_add', 'elementwise_div', 'elementwise_sub', 'elementwise_mul',
+    'elementwise_max', 'elementwise_min', 'elementwise_pow',
+    'uniform_random_batch_size_like', 'gaussian_random', 'sampling_id',
+    'gaussian_random_batch_size_like', 'sum', 'slice', 'shape', 'logical_and',
+    'logical_or', 'logical_xor', 'logical_not', 'clip', 'clip_by_norm', 'mean',
+    'mul', 'sigmoid_cross_entropy_with_logits', 'maxout', 'space_to_depth',
+    'affine_grid', 'sequence_reverse', 'affine_channel', 'similarity_focus',
+    'hash', 'grid_sampler', 'log_loss', 'add_position_encoding',
+    'bilinear_tensor_product', 'merge_selected_rows',
+    'get_tensor_from_selected_rows', 'lstm', 'shuffle_channel'
 ]
 
 kIgnoreIndex = -100
@@ -9122,3 +9011,43 @@ def get_tensor_from_selected_rows(x, name=None):
         outputs={'Out': out},
         attrs={})
     return out
+
+
+def shuffle_channel(x, group=1, name=None):
+    """
+    **Shuffle Channel Operator**
+    This operator obtains the group convolutional layer with channels shuffled.
+    First, divide the input channels in each group into several subgroups,
+    then, feed each group in the next layer with different subgroups.
+    Shuffle channel operation makes it possible to build more powerful structures
+    with multiple group convolutional layers.
+
+    Args:
+        x: The input tensor variable with layout NCHW.
+        group (int): The number of subgroups to divide the channels into.
+
+    Returns:
+        Variable: channel shuffled tensor variable.
+
+    Raises:
+        TypeError: If group is not an int.
+
+    Examples:
+        .. code-block:: python
+
+            input = fluid.layers.data(name='input', shape=[32, 16, 16], dtype='float32')
+            out = fluid.layers.shuffle_channel(x=input, group=4)
+    """
+    helper = LayerHelper("shuffle_channel", **locals())
+
+    if not isinstance(group, int):
+        raise TypeError("group must be int type")
+
+    out = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype('x'))
+
+    helper.append_op(
+        type="shuffle_channel",
+        inputs={"X": x},
+        outputs={"Out": out},
+        attrs={"group": group})
+    return out
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 10e8bb5a866..155f59f6fea 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -982,6 +982,15 @@ class TestBook(unittest.TestCase):
 
         print(str(program))
 
+    def test_shuffle_channel(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[10, 32, 16, 16], dtype="float32")
+            out = layers.shuffle_channel(x, group=4)
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
new file mode 100644
index 00000000000..25df22193ca
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+class TestShuffleChannelOp(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def setUp(self):
+        self.op_type = "shuffle_channel"
+        self.batch_size = 10
+        self.input_channels = 16
+        self.layer_h = 32
+        self.layer_w = 32
+        self.group = 4
+
+        self.x = np.random.random(
+            (self.batch_size, self.input_channels, self.layer_h,
+             self.layer_w)).astype('float32')
+        self.inputs = {'X': self.x}
+        self.attrs = {'group': self.group}
+
+        n, c, h, w = self.x.shape
+        input_reshaped = np.reshape(self.x,
+                                    (-1, self.group, c // self.group, h, w))
+        input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4))
+        self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))}
+
+
+if __name__ == '__main__':
+    unittest.main()
--
GitLab


From 050a68dde38611b226207ae5840e8009ff44bc2a Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Mon, 17 Dec 2018 13:19:59 +0000
Subject: [PATCH 0295/2367] fix comments
 test=develop

---
 .../inference/analysis/ir_passes/tensorrt_subgraph_pass.cc | 3 ---
 paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc      | 1 -
 paddle/fluid/operators/tensorrt/tensorrt_engine_op.h       | 2 --
 3 files changed, 6 deletions(-)

diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 4ffe5f575c2..9c42b83e7ad 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -63,7 +63,6 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl(
 void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                                             Graph *graph) const {
   auto *op_desc = node->Op();
-  static int counter{0};
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());
@@ -192,8 +191,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
            block_desc.Proto()->SerializeAsString());
   SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size"));
-  SetAttr(op_desc->Proto(), "engine_uniq_key",
-          "trt-" + std::to_string(counter++));
   SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
 }
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index f1ab59e3972..b993c55fad1 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -29,7 +29,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Xs", "A list of inputs.").AsDuplicable();
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr("subgraph", "the subgraph.");
-    AddAttr("engine_uniq_key", "unique key for the TRT engine.");
     AddAttr("max_batch_size", "the maximum batch 
size."); AddAttr("workspace_size", "the workspace size."); AddComment("TensorRT engine operator."); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index c19c315f798..88c4f508474 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -65,7 +65,6 @@ using inference::tensorrt::TensorRTEngine; class TensorRTEngineOp : public framework::OperatorBase { private: - std::string engine_name_; std::vector input_names_; std::unordered_set param_names_; mutable std::unique_ptr trt_engine_; @@ -78,7 +77,6 @@ class TensorRTEngineOp : public framework::OperatorBase { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : framework::OperatorBase(type, inputs, outputs, attrs) { - engine_name_ = Attr("engine_uniq_key"); input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); workspace_size_ = Attr("workspace_size"); -- GitLab From 2373aeb5e845823ef30b0bfd303e93aa00798295 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 17 Dec 2018 21:41:08 +0800 Subject: [PATCH 0296/2367] fix bug test=develop --- paddle/fluid/platform/stream_callback_manager.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 466c77469ef..5a9e24374f6 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -19,7 +19,7 @@ namespace paddle { namespace platform { #if CUDA_VERSION >= 10000 -static void CUDART_CB StreamCallbackFunc(void *user_data); +static void CUDART_CB StreamCallbackFunc(void *user_data) #else static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data) -- GitLab From fd152289fa694b99704e4821a71b0c1f160896aa Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 22:14:11 +0800 Subject: [PATCH 0297/2367] clean for range in test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 14 +++--- paddle/fluid/platform/for_range.h | 52 --------------------- 2 files changed, 6 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 8fc6689ff1a..4f212bb69a1 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -227,8 +227,10 @@ struct SparseAdamFunctor { inline HOSTDEVICE void operator()(size_t i) const { auto row_idx = math::BinarySearch(rows_, row_count_, i / row_numel_); - T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; - adam_update(i, g); + if (!(lazy_mode_ && row_idx < 0)) { + T g = row_idx >= 0 ? 
grad_[row_idx * row_numel_ + i % row_numel_] : 0; + adam_update(i, g); + } } }; @@ -359,19 +361,15 @@ class AdamOpKernel : public framework::OpKernel { param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, grad_merge.rows().size(), lazy_mode); VLOG(3) << "lazy_mode :" << lazy_mode; - if (lazy_mode) { - std::vector id_vector; + if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) { size_t row_count = grad_merge.rows().size(); std::vector cpu_rows(grad_merge.rows()); for (size_t row_index = 0; row_index < row_count; ++row_index) { for (size_t offset = 0; offset < row_numel; ++offset) { size_t i = cpu_rows[row_index] * row_numel + offset; - id_vector.push_back(i); + functor.adam_update(i, grad_data[row_index * row_numel + offset]); } } - platform::ForRangeIn for_range_in( - static_cast(ctx.device_context()), id_vector); - for_range_in(functor); } else { platform::ForRange for_range( static_cast(ctx.device_context()), diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index ab00d8b8f57..910d1669f23 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -22,29 +22,6 @@ limitations under the License. */ namespace paddle { namespace platform { -template -struct ForRangeIn { - ForRangeIn(const DeviceContext& dev_ctx, std::vector range); - - template - void operator()(Function func) const; -}; - -template <> -struct ForRangeIn { - ForRangeIn(const CPUDeviceContext& dev_ctx, std::vector range) - : range_(range) {} - - template - void operator()(Function func) const { - for (auto i : range_) { - func(i); - } - } - - std::vector range_; -}; - template struct ForRange { ForRange(const DeviceContext& dev_ctx, size_t limit); @@ -106,35 +83,6 @@ struct ForRange { int limit_; }; -template -__global__ static void ForRangeInElemwiseOp(Function func, T* vector, - int vector_size) { - size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (idx < vector_size) { - func(vector[idx]); - } -} - -template <> -struct ForRangeIn { - ForRangeIn(const CUDADeviceContext& dev_ctx, std::vector range) - : dev_ctx_(dev_ctx), range_(range) {} - - template - inline void operator()(Function func) const { - constexpr int num_threads = 1024; - int range_size = range_.size(); - int block_size = range_size <= num_threads ? range_size : num_threads; - int grid_size = (range_.size() + num_threads - 1) / num_threads; - - ForRangeInElemwiseOp<<>>( - func, range_.CUDAData(dev_ctx_.GetPlace()), range_size); - } - - const CUDADeviceContext& dev_ctx_; - framework::Vector range_; -}; - #endif } // namespace platform -- GitLab From 56686d0f34db008238095331b6f981d8f4e5d3d4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 22:16:52 +0800 Subject: [PATCH 0298/2367] clean code test=develop --- paddle/fluid/platform/for_range.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 910d1669f23..c153e80fe42 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -13,10 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once - -#include - -#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { -- GitLab From 74292f414c033bf7cb53b4f87a82f7ff6c18a4b2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 17 Dec 2018 14:51:52 +0000 Subject: [PATCH 0299/2367] enable eltwise nchw16c mul nc --- .../elementwise/elementwise_mul_mkldnn_op.cc | 10 ++- paddle/fluid/operators/jit/gen/CMakeLists.txt | 1 + paddle/fluid/operators/jit/gen/blas.cc | 43 +++++++++++++ paddle/fluid/operators/jit/gen/blas.h | 12 ++++ paddle/fluid/operators/jit/helper.cc | 3 +- paddle/fluid/operators/jit/helper.h | 1 + paddle/fluid/operators/jit/kernel_base.h | 11 +++- .../fluid/operators/jit/refer/CMakeLists.txt | 1 + paddle/fluid/operators/jit/refer/refer.cc | 2 + paddle/fluid/operators/jit/refer/refer.h | 15 +++++ paddle/fluid/operators/jit/test.cc | 62 +++++++++++++++++++ 11 files changed, 153 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index c600d1e3d76..71f4b71330a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/operators/math/jit_kernel.h" +#include "paddle/fluid/operators/jit/kernels.h" #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" @@ -108,10 +108,8 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - const auto& multiply = - math::jitkernel::KernelPool::Instance() - .template Get>(n); - + auto multiply = jit::Get(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { @@ -122,7 +120,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { auto ptr_z = z_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - multiply->Compute(ptr_x, ptr_y, ptr_z, h, w); + multiply(ptr_x, ptr_y, ptr_z, h, w); } } } diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 8ad9587b5ef..a7f9e18318d 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -25,3 +25,4 @@ USE_JITKERNEL_GEN(lstmc1h1) USE_JITKERNEL_GEN(gruh1) USE_JITKERNEL_GEN(gruhtpart1) USE_JITKERNEL_GEN(gruhtpart2) +USE_JITKERNEL_GEN(nchw16cmulnc) diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index b24f44c9f3b..65b9a52ff2d 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -104,6 +104,48 @@ void VXXJitCode::genCode() { ret(); } +void NCHW16CMulNCJitCode::genCode() { + // RDI is ptr x_input + // RSI is ptr y_input + // RDX is ptr output + // RCX is height + // r8 is width + + push(rbx); + + xor_(rax, rax); + xor_(r10, r10); + vmovups(zmm3, ptr[rsi]); + + L("h_loop"); + xor_(rbx, rbx); + L("w_loop"); + vmovups(zmm2, ptr[rdi + rax]); + vmulps(zmm1, zmm2, zmm3); + vmovups(ptr[rdx + rax], zmm1); + add(rax, 64); + inc(rbx); + cmp(r8, rbx); + jnz("w_loop"); + inc(r10); + cmp(r10, rcx); + jnz("h_loop"); + + pop(rbx); + ret(); +} + +class NCHW16CMulNCCreator : public JitCodeCreator { + public: + bool UseMe(const int& attr) const override { + return platform::MayIUse(platform::avx512f); + } + size_t CodeSize(const int& d) const 
override { return 256 * 1024; } + std::unique_ptr CreateJitCode(const int& attr) const override { + return make_unique(attr, CodeSize(attr)); + } +}; + #define DECLARE_BLAS_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ @@ -141,3 +183,4 @@ REGISTER_JITKERNEL_GEN(vadd, gen::VAddCreator); REGISTER_JITKERNEL_GEN(vaddrelu, gen::VAddReluCreator); REGISTER_JITKERNEL_GEN(vscal, gen::VScalCreator); REGISTER_JITKERNEL_GEN(vaddbias, gen::VAddBiasCreator); +REGISTER_JITKERNEL_GEN(nchw16cmulnc, gen::NCHW16CMulNCCreator); diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index 5a2192052f8..29be4e73589 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -99,6 +99,18 @@ DECLARE_BLAS_JITCODE(VAddBias, operand_type::add, 1, false); #undef DECLARE_BLAS_JITCODE +// nChw16c = nChw16c .* NC +class NCHW16CMulNCJitCode : public JitCode { + public: + DECLARE_JIT_CODE(NCHW16CMulNCJitCode); + explicit NCHW16CMulNCJitCode(int d /*unused*/, size_t code_size, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr) { + this->genCode(); + } + void genCode() override; +}; + } // namespace gen } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index a0ff82043fc..a1bb51fa666 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -44,8 +44,9 @@ const char* to_string(KernelType kt) { ONE_CASE(gruhtpart2); ONE_CASE(crfdecoding); ONE_CASE(layernorm); + ONE_CASE(nchw16cmulnc); default: - PADDLE_THROW("Not support type: %d", kt); + PADDLE_THROW("Not support type: %d, or forget to add it.", kt); return "NOT JITKernel"; } return nullptr; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 44952fb9079..275170ca2b5 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -93,6 +93,7 @@ inline typename KernelTuples::func_type GetRefer() { template +// TODO(TJ): const & attr typename KernelTuples::func_type Get(typename KernelTuples::attr_type attr) { auto jitfunc = GetJitCode(attr); if (jitfunc) { diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 59531c2f17c..9ba0a958313 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -39,7 +39,8 @@ typedef enum { gruhtpart1, gruhtpart2, crfdecoding, - layernorm + layernorm, + nchw16cmulnc, } KernelType; template @@ -126,6 +127,14 @@ struct LayerNormTuples { const float, int); }; +// nChw16c = nChw16c .* NC +template +struct NCHW16CMulNCTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, const T*, T*, int, int); +}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index f3a0e9b11f6..86432bfffe7 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -25,3 +25,4 @@ USE_JITKERNEL_REFER(gruhtpart1) USE_JITKERNEL_REFER(gruhtpart2) USE_JITKERNEL_REFER(crfdecoding) USE_JITKERNEL_REFER(layernorm) +USE_JITKERNEL_REFER(nchw16cmulnc) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 00daa0d4786..1aee6ff9500 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ 
b/paddle/fluid/operators/jit/refer/refer.cc @@ -45,4 +45,6 @@ REGISTER_REFER_KERNEL(gruhtpart2, GRUHtPart2); REGISTER_REFER_KERNEL(crfdecoding, CRFDecoding); REGISTER_REFER_KERNEL(layernorm, LayerNorm); +REGISTER_REFER_KERNEL(nchw16cmulnc, NCHW16CMulNC); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 5780ea05bdf..6f72c2b724b 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -319,6 +319,19 @@ void LayerNorm(T* x, T* out, T* mean, T* var, const T* scale, const T* bias, } } +template +void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) { + int offset = 0; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int i = 0; i < 16; ++i) { + z[i + offset] = y[i] * x[i + offset]; + } + offset += ZMM_FLOAT_BLOCK; + } + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -355,6 +368,8 @@ DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples); DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples); DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); +DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 85eadea7516..32937d9c005 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -19,6 +19,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" template @@ -414,6 +415,59 @@ void TestGRUKernel() { } } +template +void TestNCHW16CMulNCKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + const int n = 3, c = 16 * 4, h = 10, w = 10; + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + int sz = n * c * h * w; + std::vector x(sz), y(n * c), zref(sz); + std::vector ztgt(sz), zjit(sz); + RandomVec(sz, x.data(), -2.f, 2.f); + RandomVec(n * c, y.data(), -2.f, 2.f); + + const T* x_data = x.data(); + const T* y_data = y.data(); + T* zref_data = zref.data(); + T* ztgt_data = ztgt.data(); + T* zjit_data = zjit.data(); + constexpr int simd_width = ZMM_FLOAT_BLOCK; + int C = c / simd_width; + auto tgt = jit::Get, PlaceType>(0); + auto jitcode = jit::GetJitCode, PlaceType>(0); + EXPECT_TRUE(tgt != nullptr); + + if (std::is_same::value && + paddle::platform::MayIUse(paddle::platform::avx512f)) { + EXPECT_TRUE(jitcode != nullptr); + } + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < C; ci++) { + auto ptr_x = + x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + auto ptr_zref = + zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_ztgt = + ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + + ref(ptr_x, ptr_y, ptr_zref, h, w); + tgt(ptr_x, ptr_y, ptr_ztgt, h, w); + + if (jitcode) { + auto ptr_zjit = + zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + jitcode(ptr_x, ptr_y, ptr_zjit, h, w); + } + } + } + ExpectEQ(ztgt_data, zref_data, sz); + if (jitcode) { + ExpectEQ(zjit_data, zref_data, sz); + } +} + // XYZNTuple TEST(JITKernel, vmul) { namespace jit = paddle::operators::jit; @@ -515,6 +569,14 @@ TEST(JITKernel, gruhtpart2) { TestGRUKernel(); } +TEST(JITKernel, nchw16cmulnc) { + namespace jit = 
paddle::operators::jit; + TestNCHW16CMulNCKernel(); + TestNCHW16CMulNCKernel(); +} + // TODO(yihua/TJ): add crf decoding and layer norm unit tests TEST(JITKernel, pool) { -- GitLab From fe3995d33527e8503739b6de3dd555fa3ad35073 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 18 Dec 2018 07:15:42 +0800 Subject: [PATCH 0300/2367] refine code test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 4f212bb69a1..f214d8272f5 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -227,7 +227,9 @@ struct SparseAdamFunctor { inline HOSTDEVICE void operator()(size_t i) const { auto row_idx = math::BinarySearch(rows_, row_count_, i / row_numel_); - if (!(lazy_mode_ && row_idx < 0)) { + if (lazy_mode_ && row_idx < 0) { + return; + } else { T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; adam_update(i, g); } -- GitLab From 59cf96ec18ed73ae97b91ab233d2270cbb42a905 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 18 Dec 2018 09:33:10 +0800 Subject: [PATCH 0301/2367] add log --- paddle/fluid/operators/optimizers/adam_op.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index aabb71c556a..7dd5a8783a5 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -357,6 +357,9 @@ class AdamOpKernel : public framework::OpKernel { if (inner_op_parallelism > 1 && FLAGS_min_param_size_to_use_multithread > 0 && param.numel() > FLAGS_min_param_size_to_use_multithread) { + VLOG(3) << "use multi thread, inner_op_parallelism=" + << inner_op_parallelism << " min_param_size_to_use_multithread" + << FLAGS_min_param_size_to_use_multithread; std::vector> fs; int64_t block_size = param.numel() / inner_op_parallelism; for (int i = 0; i < inner_op_parallelism; ++i) { -- GitLab From 52bc4ee75adf64e449dfdbbdbbe3e41cdc593bdc Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 16 Dec 2018 20:27:17 +0800 Subject: [PATCH 0302/2367] delay infer scope test=develop --- paddle/fluid/framework/operator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a62afe248ba..86e1713b021 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -703,8 +703,6 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -758,6 +756,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope); + this->InferShape(&infer_shape_ctx); kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); if (!transfered_inplace_vars.empty()) { -- GitLab From bbff0df320f0f68634a5ae3c4d9507b52a1134f7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 16 Dec 2018 21:49:25 +0800 Subject: [PATCH 0303/2367] try cache variables test=develop --- paddle/fluid/framework/ngraph_operator.cc | 15 +++++++- paddle/fluid/framework/operator.cc 
| 47 ++++++++++++++++------- paddle/fluid/framework/operator.h | 22 ++++++++--- paddle/fluid/framework/type_defs.h | 3 ++ 4 files changed, 66 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index e2cdfc845fe..e37f0915c5d 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -278,7 +278,20 @@ std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - op->RuntimeInferShape(scope_, place_); + RuntimeContext ctx; + for (auto& var_name_item : op->Inputs()) { + std::vector input_vars = ctx.inputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + input_vars.push_back(scope_.FindVar(var_name)); + } + } + for (auto& var_name_item : op->Outputs()) { + std::vector output_vars = ctx.outputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + output_vars.push_back(scope_.FindVar(var_name)); + } + } + op->RuntimeInferShape(scope_, place_, ctx); for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { auto* var = scope_.FindVar(var_name); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 86e1713b021..79e3d29a63b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -477,23 +477,22 @@ bool OpSupportGPU(const std::string& op_type) { class RuntimeInferShapeContext : public InferShapeContext { public: - RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) - : op_(op), scope_(scope) {} + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope, + const RuntimeContext& ctx) + : op_(op), scope_(scope), ctx_(ctx) {} bool HasInput(const std::string& name) const override { // has only one input - const auto& ins = op_.Inputs(); + const auto& ins = ctx_.inputs; auto it = ins.find(name); if (it == ins.end()) { return false; } const auto& in = it->second; - if (in.size() == 0 || in[0] == kEmptyVarName) { - return false; - } + if (in.size() == 0) return false; PADDLE_ENFORCE_EQ(in.size(), 1UL, "Input %s should not have more than one inputs", name); - return scope_.FindVar(in[0]) != nullptr; + return in[0] != nullptr; } bool HasOutput(const std::string& name) const override { @@ -678,6 +677,7 @@ class RuntimeInferShapeContext : public InferShapeContext { private: const OperatorBase& op_; const Scope& scope_; + const RuntimeContext& ctx_; }; static void CheckTensorNANOrInf(const std::string& name, @@ -696,8 +696,9 @@ static void CheckTensorNANOrInf(const std::string& name, } void OperatorWithKernel::RuntimeInferShape(const Scope& scope, - const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); + const platform::Place& place, + const RuntimeContext& ctx) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx); this->InferShape(&infer_shape_ctx); } @@ -743,10 +744,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } + RuntimeContext ctx; // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; auto* transfer_scope = - TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx); // exec scope is the scope that kernel actually executed on. 
const Scope& exec_scope = @@ -756,7 +758,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } - RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope); + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); this->InferShape(&infer_shape_ctx); kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); @@ -797,13 +799,20 @@ void OperatorWithKernel::TransferInplaceVarsBack( } } -Scope* OperatorWithKernel::TryTransferData( +Scope* OperatorWithKernel::PrepareData( const Scope& scope, const OpKernelType& expected_kernel_key, - std::vector* transfered_inplace_vars) const { + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const { Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { - for (auto& var_name : var_name_item.second) { + std::vector& input_vars = ctx->inputs[var_name_item.first]; + input_vars.resize(var_name_item.second.size()); + + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); + input_vars[i] = var; + // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { continue; @@ -851,12 +860,22 @@ Scope* OperatorWithKernel::TryTransferData( } auto* trans_var = new_scope->Var(var_name); + input_vars[i] = var; Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); SetTensorToVariable(*var, out, trans_var); } } + for (auto& var_name_item : Outputs()) { + std::vector& output_vars = ctx->outputs[var_name_item.first]; + output_vars.resize(var_name_item.second.size()); + + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto& var_name = var_name_item.second[i]; + output_vars[i] = scope.FindVar(var_name); + } + } return new_scope; } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bce..438ae253987 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -70,6 +70,14 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); class OperatorBase; class ExecutionContext; +class RuntimeContext { + public: + RuntimeContext() {} + + VariableValueMap inputs; + VariableValueMap outputs; +}; + /** * OperatorBase has the basic elements that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -129,7 +137,8 @@ class OperatorBase { void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } virtual void RuntimeInferShape(const Scope& scope, - const platform::Place& place) const {} + const platform::Place& place, + const RuntimeContext& ctx) const {} protected: std::string type_; @@ -350,8 +359,8 @@ class OperatorWithKernel : public OperatorBase { OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); } - void RuntimeInferShape(const Scope& scope, - const platform::Place& place) const override; + void RuntimeInferShape(const Scope& scope, const platform::Place& place, + const RuntimeContext& ctx) const override; protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; @@ -371,9 +380,10 @@ class OperatorWithKernel : public OperatorBase { * * * transfered_inplace_vars is a output vector. 
*/ - Scope* TryTransferData( - const Scope& scope, const OpKernelType& expected_kernel_key, - std::vector* transfered_inplace_vars) const; + Scope* PrepareData(const Scope& scope, + const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const; void TransferInplaceVarsBack(const Scope& scope, const std::vector& inplace_vars, diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 2de6233a9e0..938e2024c33 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -28,8 +28,11 @@ class OperatorBase; class OpDesc; class InferShapeContext; class BlockDesc; +class Variable; using VariableNameMap = std::map>; +// TODO(panyx0718): Replace vector with something like gtl::Vector. +using VariableValueMap = std::map>; // The order should be as same as framework.proto using Attribute = -- GitLab From 840e6729e224d867386bdfc9ff12af4b71ee7188 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 17 Dec 2018 21:27:56 +0800 Subject: [PATCH 0304/2367] inject context test=develop --- paddle/fluid/framework/ngraph_operator.cc | 14 +------- paddle/fluid/framework/operator.cc | 36 +++++++++++-------- paddle/fluid/framework/operator.h | 9 +++-- .../fluid/operators/beam_search_decode_op.cc | 3 +- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index e37f0915c5d..23f681ce886 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -278,19 +278,7 @@ std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - RuntimeContext ctx; - for (auto& var_name_item : op->Inputs()) { - std::vector input_vars = ctx.inputs[var_name_item.first]; - for (auto& var_name : var_name_item.second) { - input_vars.push_back(scope_.FindVar(var_name)); - } - } - for (auto& var_name_item : op->Outputs()) { - std::vector output_vars = ctx.outputs[var_name_item.first]; - for (auto& var_name : var_name_item.second) { - output_vars.push_back(scope_.FindVar(var_name)); - } - } + RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); op->RuntimeInferShape(scope_, place_, ctx); for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 79e3d29a63b..461d3575274 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -137,6 +137,23 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } } +RuntimeContext::RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, + const Scope& scope) { + for (auto& var_name_item : innames) { + std::vector& input_vars = inputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + input_vars.push_back(scope.FindVar(var_name)); + } + } + for (auto& var_name_item : outnames) { + std::vector& output_vars = outputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + output_vars.push_back(scope.FindVar(var_name)); + } + } +} + void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(4) << place << " " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { @@ -704,6 +721,7 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const 
{ + RuntimeContext ctx(Inputs(), Outputs(), scope); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -717,15 +735,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelMap& kernels = kernels_iter->second; - // TODO(dzhwinter) : kernel fallback mechanism will be added when all the - // transform functions are ready. - - // for (auto& candidate : kKernelPriority) { - // Do selection - // } - - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + auto expected_kernel_key = this->GetExpectedKernelType( + ExecutionContext(*this, scope, *dev_ctx, ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -744,7 +755,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } - RuntimeContext ctx; // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; auto* transfer_scope = @@ -760,7 +770,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); this->InferShape(&infer_shape_ctx); - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx)); if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. @@ -784,6 +794,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } + void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { @@ -806,7 +817,6 @@ Scope* OperatorWithKernel::PrepareData( Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { std::vector& input_vars = ctx->inputs[var_name_item.first]; - input_vars.resize(var_name_item.second.size()); for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; @@ -869,8 +879,6 @@ Scope* OperatorWithKernel::PrepareData( } for (auto& var_name_item : Outputs()) { std::vector& output_vars = ctx->outputs[var_name_item.first]; - output_vars.resize(var_name_item.second.size()); - for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; output_vars[i] = scope.FindVar(var_name); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 438ae253987..e359414d151 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -72,7 +72,8 @@ class ExecutionContext; class RuntimeContext { public: - RuntimeContext() {} + RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, const Scope& scope); VariableValueMap inputs; VariableValueMap outputs; @@ -165,8 +166,9 @@ class OperatorBase { class ExecutionContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, - const platform::DeviceContext& device_context) - : op_(op), scope_(scope), device_context_(device_context) {} + const platform::DeviceContext& device_context, + const RuntimeContext& ctx) + : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {} const OperatorBase& op() const { return op_; } @@ -295,6 +297,7 @@ class ExecutionContext { const OperatorBase& op_; const Scope& scope_; const platform::DeviceContext& device_context_; + const RuntimeContext& ctx_; }; template <> diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc 
index ae9765b7613..7f2bde55c98 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(dev_place); - framework::ExecutionContext ctx(*this, scope, dev_ctx); + framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); + framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); const LoDTensorArray* ids = ctx.Input("Ids"); const LoDTensorArray* scores = ctx.Input("Scores"); -- GitLab From eaf8ba35b519b780629a7108d08ffd3895ac18fe Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 09:42:57 +0800 Subject: [PATCH 0305/2367] change input test=develop --- paddle/fluid/framework/operator.cc | 50 ++++++++++++++++++++++++++++++ paddle/fluid/framework/operator.h | 33 +++++++++++++++----- paddle/fluid/operators/prelu_op.cc | 2 +- 3 files changed, 76 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 461d3575274..87f61f3afc3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -143,12 +143,14 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, for (auto& var_name_item : innames) { std::vector& input_vars = inputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { + LOG(ERROR) << "first in " << var_name_item.first << ":" << var_name; input_vars.push_back(scope.FindVar(var_name)); } } for (auto& var_name_item : outnames) { std::vector& output_vars = outputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { + LOG(ERROR) << "first out " << var_name_item.first << ":" << var_name; output_vars.push_back(scope.FindVar(var_name)); } } @@ -429,11 +431,52 @@ bool ExecutionContext::HasOutput(const std::string& name) const { return var != nullptr; } +const Variable* ExecutionContext::InputVar(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +Variable* ExecutionContext::OutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); +} + +const Variable* ExecutionContext::FastInputVar(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +Variable* ExecutionContext::FastOutputVar(const std::string& name) const { + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's output %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? 
nullptr : it->second[0]; +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { return Input(name); } +template <> +const Tensor* ExecutionContext::FastInput( + const std::string& name) const { + return FastInput(name); +} + template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const { @@ -458,6 +501,11 @@ Tensor* ExecutionContext::Output(const std::string& name) const { return Output(name); } +template <> +Tensor* ExecutionContext::FastOutput(const std::string& name) const { + return FastOutput(name); +} + template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { @@ -822,6 +870,7 @@ Scope* OperatorWithKernel::PrepareData( auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); input_vars[i] = var; + LOG(ERROR) << "second in " << var_name_item.first << ":" << var_name; // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { @@ -882,6 +931,7 @@ Scope* OperatorWithKernel::PrepareData( for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; output_vars[i] = scope.FindVar(var_name); + LOG(ERROR) << "second out " << var_name_item.first << ":" << var_name; } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e359414d151..0aad91dbeef 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -191,15 +191,9 @@ class ExecutionContext { return op_.Outputs(name).size(); } - const Variable* InputVar(const std::string& name) const { - auto ipt = op_.Input(name); - return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - } + const Variable* InputVar(const std::string& name) const; - Variable* OutputVar(const std::string& name) const { - auto opt = op_.Output(name); - return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); - } + Variable* OutputVar(const std::string& name) const; const std::vector MultiInputVar( const std::string& name) const { @@ -238,6 +232,22 @@ class ExecutionContext { return var == nullptr ? nullptr : var->GetMutable(); } + template + const T* FastInput(const std::string& name) const { + auto* var = FastInputVar(name); + return var == nullptr ? nullptr : &var->Get(); + } + + template + T* FastOutput(const std::string& name) const { + auto var = FastOutputVar(name); + return var == nullptr ? 
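FastInput and FastOutput follow the same shape as the existing Input/Output accessors, but read the pre-resolved pointers out of the cached RuntimeContext and enforce that the slot holds at most one variable. A reduced sketch of that accessor pattern (Variable and Tensor are simplified stand-ins, with the enforcement modeled as an assert):

#include <cassert>
#include <map>
#include <string>
#include <vector>

struct Tensor { float value = 0.f; };

struct Variable {
  Tensor tensor;
  template <typename T>
  const T& Get() const { return tensor; }  // simplified: always the tensor
};

using VariableValueMap = std::map<std::string, std::vector<Variable*>>;

struct ExecutionContextSketch {
  const VariableValueMap& inputs;

  const Variable* FastInputVar(const std::string& name) const {
    auto it = inputs.find(name);
    if (it == inputs.end()) return nullptr;
    assert(it->second.size() <= 1 && "input should contain only one variable");
    return it->second.empty() ? nullptr : it->second[0];
  }

  template <typename T>
  const T* FastInput(const std::string& name) const {
    auto* var = FastInputVar(name);
    return var == nullptr ? nullptr : &var->Get<T>();
  }
};

int main() {
  Variable v;
  VariableValueMap ins{{"X", {&v}}};
  ExecutionContextSketch ctx{ins};
  return ctx.FastInput<Tensor>("X") != nullptr ? 0 : 1;
}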
nullptr : var->GetMutable(); + } + + const Variable* FastInputVar(const std::string& name) const; + + Variable* FastOutputVar(const std::string& name) const; + template const std::vector MultiInput(const std::string& name) const { auto names = op_.Inputs(name); @@ -303,6 +313,10 @@ class ExecutionContext { template <> const Tensor* ExecutionContext::Input(const std::string& name) const; +template <> +const Tensor* ExecutionContext::FastInput( + const std::string& name) const; + template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const; @@ -310,6 +324,9 @@ const std::vector ExecutionContext::MultiInput( template <> Tensor* ExecutionContext::Output(const std::string& name) const; +template <> +Tensor* ExecutionContext::FastOutput(const std::string& name) const; + template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const; diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 62c55c4f557..b6155ed3dd4 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -56,7 +56,7 @@ class PReluOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), + return framework::OpKernelType(ctx.FastInput("X")->type(), ctx.device_context()); } }; -- GitLab From 8936c7913b7b25a536470ac2a20999b8744cca5f Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 18 Dec 2018 09:58:54 +0800 Subject: [PATCH 0306/2367] add log test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 7dd5a8783a5..5ba5639fd51 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -358,7 +358,7 @@ class AdamOpKernel : public framework::OpKernel { FLAGS_min_param_size_to_use_multithread > 0 && param.numel() > FLAGS_min_param_size_to_use_multithread) { VLOG(3) << "use multi thread, inner_op_parallelism=" - << inner_op_parallelism << " min_param_size_to_use_multithread" + << inner_op_parallelism << " min_param_size_to_use_multithread=" << FLAGS_min_param_size_to_use_multithread; std::vector> fs; int64_t block_size = param.numel() / inner_op_parallelism; -- GitLab From cbc7208399e980687e7bd51102d3e84907353fba Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 18 Dec 2018 10:15:07 +0800 Subject: [PATCH 0307/2367] fix doc test=develop --- python/paddle/fluid/async_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 3181654feb8..4ca6a5170eb 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -301,7 +301,7 @@ class AsyncExecutor(object): save_model command that can be invoked from one of the worker model parameters are saved in servers and upload to save_path of file system Args: - save_path(str): path to file system + save_path(str): save path to file system """ if self.instance is None: raise ValueError( -- GitLab From c631412eab5371cd72c9a7d2f9830870eb953e46 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 17 Dec 2018 22:02:15 +0800 Subject: [PATCH 0308/2367] fix gc bug test=develop --- paddle/fluid/framework/executor.cc | 2 +- .../tests/unittests/test_eager_deletion_dynamic_rnn_base.py | 1 + 
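The corrected VLOG line in adam_op.h above belongs to AdamOp's multithreaded path, which splits a parameter of numel() elements into inner_op_parallelism contiguous blocks. A sketch of that partitioning (giving the remainder to the last block is an assumption; the hunk does not show the kernel's exact boundary handling):

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t numel = 1000003;   // example parameter size
  const int64_t parallelism = 8;   // stands in for inner_op_parallelism
  const int64_t block_size = numel / parallelism;
  for (int64_t t = 0; t < parallelism; ++t) {
    const int64_t begin = t * block_size;
    const int64_t end = (t + 1 == parallelism) ? numel : begin + block_size;
    std::printf("block %lld: [%lld, %lld)\n", static_cast<long long>(t),
                static_cast<long long>(begin), static_cast<long long>(end));
  }
  return 0;
}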
2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8c3912120b5..da9556c6c1f 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -100,7 +100,7 @@ static void DeleteUnusedTensors( continue; } auto* var = scope.FindVar(name); - if (var != nullptr) { + if (var == nullptr) { continue; } diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py index e91cfe0b45a..89476ee641f 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -39,6 +39,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): label = fluid.layers.data(name="label", shape=[1], dtype="int64") cost = network(data, label, len(word_dict)) + cost.persistable = True optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) optimizer.minimize(cost) -- GitLab From 06936a2ff59ba67f6be0526bf97c26a3cf036b18 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 18 Dec 2018 11:16:14 +0800 Subject: [PATCH 0309/2367] fix 1gpu test=develop --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 3 ++- paddle/fluid/framework/details/parallel_ssa_graph_executor.cc | 2 +- paddle/fluid/framework/parallel_executor.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 6bca299813f..4a0347d07a8 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -51,7 +51,8 @@ void AllReduceOpHandle::RunImpl() { // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. #ifdef PADDLE_WITH_CUDA - // Find NCCL ID from the global scope. + // All-reduce op_handle can run on the sub-scope, find the nccl id from + // the global scope. if (NoDummyInputSize() == 1 && local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) { #else diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 845c4379e6f..2377f2c963d 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -59,7 +59,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - call(); + fetch_datas.emplace_back(std::move(call())); } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 152b9b27025..0042ccaa4f8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -231,7 +231,7 @@ ParallelExecutor::ParallelExecutor( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; - if (build_strategy.enable_parallel_graph_) { + if (build_strategy.enable_parallel_graph_ && places.size() > 1) { // parallel graph mode should initialize nccl by ncclCommInitRank since // it calls nccl operators per device per thread.
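The ParallelSSAGraphExecutor hunk above fixes a dropped result: without a thread pool the graph was executed but its fetch output was discarded. A reduced model of the two paths (names are illustrative; RunGraph stands in for executing one device's graph):

#include <future>
#include <vector>

// Stand-in for executing one device's graph and returning its fetch result.
static int RunGraph(int place) { return place * 10; }

int main() {
  const bool has_pool = false;  // mirrors the `if (pool_)` branch
  std::vector<std::future<int>> run_futures;
  std::vector<int> fetch_data;
  for (int place = 0; place < 4; ++place) {
    auto call = [place] { return RunGraph(place); };
    if (has_pool) {
      run_futures.emplace_back(std::async(std::launch::async, call));
    } else {
      fetch_data.emplace_back(call());  // the fix: keep the inline result
    }
  }
  for (auto& f : run_futures) fetch_data.emplace_back(f.get());
  return fetch_data.size() == 4 ? 0 : 1;
}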
if (nccl_id_var == nullptr) { -- GitLab From 5a6d7fe2ff6c946b2d9fe7816a9ee8a321c1b9fa Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 11:36:42 +0800 Subject: [PATCH 0310/2367] add mkl,ctc support for windows --- CMakeLists.txt | 12 +-- cmake/cuda.cmake | 3 + cmake/cudnn.cmake | 1 + cmake/external/cub.cmake | 2 +- cmake/external/dlpack.cmake | 2 +- cmake/external/mkldnn.cmake | 43 +++++++--- cmake/external/mklml.cmake | 83 +++++++++++-------- cmake/external/python.cmake | 8 +- cmake/external/warpctc.cmake | 30 +++++-- cmake/external/xbyak.cmake | 4 +- cmake/generic.cmake | 6 +- cmake/inference_lib.cmake | 16 ++-- cmake/operators.cmake | 2 +- cmake/simd.cmake | 73 ++++++++-------- paddle/fluid/framework/CMakeLists.txt | 3 +- .../framework/details/all_reduce_op_handle.cc | 2 +- paddle/fluid/framework/mixed_vector.h | 10 +-- paddle/fluid/framework/op_registry.h | 3 +- .../inference/api/demo_ci/CMakeLists.txt | 15 +++- .../fluid/memory/detail/system_allocator.cc | 1 - paddle/fluid/operators/CMakeLists.txt | 7 +- paddle/fluid/operators/cum_op.h | 2 + .../elementwise/elementwise_mul_mkldnn_op.cc | 3 + .../operators/math/detail/lstm_cpu_kernel.h | 6 ++ paddle/fluid/operators/math/jit_gen.h | 3 + paddle/fluid/platform/cpu_info.cc | 7 +- paddle/fluid/platform/dynload/CMakeLists.txt | 2 - paddle/fluid/platform/dynload/cudnn.cc | 4 + paddle/fluid/platform/dynload/cudnn.h | 2 +- .../fluid/platform/dynload/dynamic_loader.cc | 16 ++++ .../fluid/platform/dynload/dynamic_loader.h | 6 ++ paddle/fluid/platform/dynload/mklml.h | 2 +- paddle/fluid/platform/dynload/tensorrt.h | 2 +- paddle/fluid/platform/dynload/warpctc.h | 2 +- paddle/fluid/platform/port.h | 5 +- paddle/fluid/train/demo/CMakeLists.txt | 18 +++- python/CMakeLists.txt | 16 ++-- python/paddle/fluid/__init__.py | 9 +- python/paddle/fluid/framework.py | 18 ++-- python/setup.py.in | 38 +++++---- 40 files changed, 315 insertions(+), 172 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 653ae4ffe53..efdb451f659 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,16 +125,12 @@ if(ANDROID OR IOS) add_definitions(-DPADDLE_MOBILE_INFERENCE) endif() -if (APPLE OR WIN32) +if (APPLE) set(WITH_MKL OFF CACHE STRING - "Disable MKL for building on mac and windows" FORCE) + "Disable MKL for building on mac" FORCE) endif() if (WIN32) - set(WITH_DSO OFF CACHE STRING - "Disable DSO when compiling for Windows" FORCE) - set(WITH_MKL OFF CACHE STRING - "Disable MKL when compiling for Windows" FORCE) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) set(WITH_C_API OFF CACHE STRING @@ -207,10 +203,10 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream +include(external/warpctc) # download, build, install warpctc if (NOT WIN32) -# there is no official support of warpctc, nccl, cupti in windows -include(external/warpctc) # download, build, install warpctc +# there is no official support of nccl, cupti in windows include(cupti) include(external/gzstream) endif (NOT WIN32) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 414e92eb27f..5be7be64137 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -139,10 +139,12 @@ endfunction() message(STATUS "CUDA detected: " ${CUDA_VERSION}) if (${CUDA_VERSION} LESS 7.0) set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) + add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x 
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") @@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x # CUDA 8 may complain that sm_20 is no longer supported. Suppress the # warning for now. list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") + add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") endif() include_directories(${CUDA_INCLUDE_DIRS}) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 09bec347dbd..96a9917e762 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -89,6 +89,7 @@ if(CUDNN_FOUND) if(NOT CUDNN_MAJOR_VERSION) set(CUDNN_VERSION "???") else() + add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index c94849cf4b9..f06728de91e 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -32,4 +32,4 @@ endif() add_dependencies(cub extern_cub) -LIST(APPEND externl_project_dependencies cub) +LIST(APPEND external_project_dependencies cub) diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 94d8fcc6685..4587475d790 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -28,4 +28,4 @@ endif() add_dependencies(dlpack extern_dlpack) -LIST(APPEND externl_project_dependencies dlpack) +LIST(APPEND external_project_dependencies dlpack) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index b280db23b9b..c29375cd058 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -23,15 +23,14 @@ SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) -IF(WIN32 OR APPLE) +IF(APPLE) MESSAGE(WARNING - "Windows or Mac is not supported with MKLDNN in Paddle yet." + "Mac is not supported with MKLDNN in Paddle yet." "Force WITH_MKLDNN=OFF") - SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) + SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE) return() ENDIF() -SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." 
FORCE) MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") @@ -44,10 +43,14 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML") ELSE() MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") ENDIF() -SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds") -SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") -SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") -SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") + +IF(NOT WIN32) + SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds") + SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") + SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") + SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") +ENDIF(NOT WIN32) + ExternalProject_Add( ${MKLDNN_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} @@ -58,8 +61,15 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + CMAKE_ARGS -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} @@ -67,6 +77,11 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} -DMKLROOT:PATH=${MKLML_ROOT} ) +if(WIN32) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) +else(WIN32) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) +endif(WIN32) ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) @@ -85,10 +100,14 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) # copy the real so.0 lib to install dir # it can be directly contained in wheel or capi -SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) -ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} - COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} - DEPENDS mkldnn) +if(WIN32) + SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll) +else(WIN32) + SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) + ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + DEPENDS mkldnn) +endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) IF(WITH_C_API) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index dc5427acd45..3da552e3190 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,59 +16,76 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) -IF(WIN32 OR APPLE) +IF(APPLE) MESSAGE(WARNING - "Windows or Mac is not supported with MKLML in Paddle yet." + "Mac is not supported with MKLML in Paddle yet." 
"Force WITH_MKLML=OFF") SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) return() ENDIF() INCLUDE(ExternalProject) - -SET(MKLML_PROJECT "extern_mklml") -IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") -SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") -SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) -SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) -SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +if(WIN32) + SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) +else() + SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +endif() SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") -INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) +if(WIN32) + MESSAGE(WARNING + "Please download the MKLML and and put it at " ${THIRD_PARTY_PATH}/install/mklml) +else() + SET(MKLML_PROJECT "extern_mklml") + IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + ENDIF() + MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") + SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") + SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") -FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(MKLML)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n" - " DESTINATION ${MKLML_DST_DIR})\n") + FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(MKLML)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n" + " DESTINATION ${MKLML_DST_DIR})\n") -ExternalProject_Add( - ${MKLML_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${MKLML_SOURCE_DIR} - DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz - && tar zxf ${MKLML_VER}.tgz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} -) + ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_SOURCE_DIR} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz + && tar zxf ${MKLML_VER}.tgz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} + ) +endif() + + +INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) 
ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) -ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) +if(NOT WIN32) + ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) +endif() LIST(APPEND external_project_dependencies mklml) IF(WITH_C_API) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index a3599dd798c..edfb655541f 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -23,9 +23,12 @@ FIND_PACKAGE(PythonLibs ${PY_VERSION}) if(WIN32) execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" -"from distutils import sysconfig as s;import sys;import struct; +"from distutils import sysconfig as s;import sys;import struct;import sysconfig; print(sys.prefix); print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); +print(sysconfig.get_platform()); +print(sysconfig.get_config_var('py_version_nodot')); +print(sysconfig.get_config_var('SOABI')); " RESULT_VARIABLE _PYTHON_SUCCESS OUTPUT_VARIABLE _PYTHON_VALUES @@ -41,6 +44,9 @@ print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) + list(GET _PYTHON_VALUES 2 SYS_PLATFORM) + list(GET _PYTHON_VALUES 3 PYTHON_SHORT_VERSION_NODOT) + list(GET _PYTHON_VALUES 4 PYTHON_SOABI) # Make sure all directory separators are '/' string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 07e1137e16a..7b937c93feb 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" # Used in unit test test_WarpCTCLayer SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) -SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE FILEPATH "Warp-ctc Library" FORCE) -IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) +IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32) SET(USE_OMP OFF) ELSE() SET(USE_OMP ON) ENDIF() +IF(WIN32) + SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git") +ELSE() + SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git") +ENDIF() + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git" + GIT_REPOSITORY ${WARPCTC_REPOSITORY} PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DWITH_GPU=${WITH_GPU} -DWITH_OMP=${USE_OMP} @@ -59,6 +67,18 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) +IF(WIN32) + IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}") + add_custom_command(TARGET extern_warpctc POST_BUILD + COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} 
${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + ENDIF() + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else(WIN32) + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +ENDIF(WIN32) MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 384c2f93282..42e39fb8134 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -13,8 +13,8 @@ # limitations under the License. set(WITH_XBYAK ON) -if(WIN32 OR APPLE) - SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) +if(APPLE) + SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in MacOS" FORCE) return() endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index a8b9dcfcf5e..c6fe2e970d3 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -267,7 +267,11 @@ function(cc_library TARGET_NAME) list(APPEND cc_library_DEPS dynload_mklml) endif() add_dependencies(${TARGET_NAME} mklml) - target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + if(WIN32) + target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) + else(WIN32) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + endif(WIN32) endif() # remove link to python, see notes at: # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 5aa7a8a752f..a5b70b3c33f 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -115,20 +115,20 @@ if (NOT PROTOBUF_FOUND OR WIN32) ) endif () -if (NOT CBLAS_FOUND) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") - copy(openblas_lib - SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include - DSTS ${dst_dir} ${dst_dir} - DEPS extern_openblas - ) -elseif (WITH_MKLML) +if (WITH_MKLML) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") copy(mklml_lib SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} DEPS mklml ) +elseif (NOT CBLAS_FOUND OR WIN32) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") + copy(openblas_lib + SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include + DSTS ${dst_dir} ${dst_dir} + DEPS extern_openblas + ) endif () if (WITH_MKLDNN) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 2ced43f9e6c..70d159b4f35 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,7 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. 
- foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 86096d4feaa..566dc75fda0 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,46 +57,43 @@ int main() return 0; }" SSE3_FOUND) -# disable AVX by default on windows -if(NOT WIN32) - # Check AVX - set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) - set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; - }" AVX_FOUND) +# Check AVX +set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; +}" AVX_FOUND) - # Check AVX 2 - set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) - set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; - }" AVX2_FOUND) +# Check AVX 2 +set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; +}" AVX2_FOUND) - # Check AVX512F - set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) - set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; - }" AVX512F_FOUND) -endif(NOT WIN32) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; +}" AVX512F_FOUND) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 225dfb3e700..90083f690fe 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -15,8 +15,7 @@ function(windows_symbolic TARGET) file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) add_custom_command(OUTPUT ${final_path}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" COMMENT "create hidden file of ${src}.cu") add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc 
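The simd.cmake hunk above now compiles and runs the AVX probes on every platform instead of skipping them on Windows. The same information is visible at compile time through predefined macros, which makes for a quick sanity check of what a given build actually enabled (a sketch, independent of the cmake probes):

#include <cstdio>

int main() {
#if defined(__AVX512F__)
  std::puts("AVX512F enabled");
#elif defined(__AVX2__)
  std::puts("AVX2 enabled");
#elif defined(__AVX__)
  std::puts("AVX enabled");
#else
  std::puts("no AVX flags in this build");
#endif
  return 0;
}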
b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 9eaff1f5601..de7c845884d 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -50,7 +50,7 @@ void AllReduceOpHandle::RunImpl() { // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (NoDummyInputSize() == 1 && local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { #else diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 6940250c3f9..c3a044d22cf 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable memory::AllocationPtr gpu_; + mutable paddle::memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6d39bb3c524..2c1648c81fc 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,7 +23,8 @@ limitations under the License. 
*/ #include #include -#include "glog/logging.h" // For VLOG() +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 8d0d96d391e..f42ee9a697b 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -89,12 +89,21 @@ endif() if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + if(NOT WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + else(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif(WIN32) set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") if(EXISTS ${MKLDNN_PATH}) include_directories("${MKLDNN_PATH}/include") - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + if(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) + else(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif(WIN32) endif() else() set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 3e8fb83e9d5..307c3488223 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ #ifdef _WIN32 #include -#include // VirtualLock/VirtualUnlock #else #include // for mlock and munlock #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 257bfc0a3f9..95ad67e33e2 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -44,9 +44,8 @@ endif() register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) - # warpctc_op needs cudnn 7 above -if (WITH_GPU AND NOT WIN32) +if (WITH_GPU) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() @@ -64,9 +63,7 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) -if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) -endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 999fdcff907..7c0fda4169b 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index c600d1e3d76..bf9aef91350 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" +#if defined(_WIN32) && defined(_WINSOCKAPI_) +#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ +#endif #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index ccbd05c82ad..2e3779ff084 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,6 +17,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" +#if defined(_WIN32) +#if defined(__AVX2__) || defined(__AVX__) +inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } +#endif +#endif + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h index 6abf3434cc8..2bc740e5983 100644 --- a/paddle/fluid/operators/math/jit_gen.h +++ b/paddle/fluid/operators/math/jit_gen.h @@ -18,6 +18,9 @@ limitations under the License. 
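The lstm_cpu_kernel.h hunk above supplies an operator+= for __m256 on Windows: GCC and Clang expose arithmetic on vector types as a language extension, but MSVC treats __m256 as an opaque type, so the kernel's += expressions need an explicit overload there. A standalone illustration, compiled with AVX enabled (-mavx or /arch:AVX); this variant takes the left operand by reference so the update sticks, whereas the patch's version takes both operands by value and returns the sum for use in an enclosing expression:

#include <immintrin.h>

#if defined(_MSC_VER)
// MSVC has no built-in operators on __m256; define the one we need.
inline __m256& operator+=(__m256& a, __m256 b) {
  a = _mm256_add_ps(a, b);
  return a;
}
#endif

int main() {
  __m256 a = _mm256_set1_ps(1.0f);
  const __m256 b = _mm256_set1_ps(2.0f);
#if defined(_MSC_VER)
  a += b;
#else
  a = _mm256_add_ps(a, b);  // GCC/Clang would also accept a += b directly
#endif
  float out[8];
  _mm256_storeu_ps(out, a);
  return out[0] == 3.0f ? 0 : 1;
}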
*/ #include #include "paddle/fluid/platform/macros.h" +#if defined(_WIN32) && defined(_WINSOCKAPI_) +#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ +#endif #define XBYAK_USE_MMAP_ALLOCATOR #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index f9a32bfa4c1..1642c178097 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -14,6 +14,10 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" +#if defined(_WIN32) +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#endif + #ifdef PADDLE_WITH_XBYAK #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" @@ -22,9 +26,8 @@ limitations under the License. */ #ifdef __APPLE__ #include #include - #elif defined(_WIN32) -#define NOMINMAX // msvc max/min macro conflict with std::min/max +#define WIN32_LEAN_AND_MEAN #include #else #include diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 5939c500c94..07159d4a12e 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -16,9 +16,7 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) -if (NOT WIN32) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) -endif(NOT WIN32) if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index f3cd3b2bbed..91d9a1ef013 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_R6 +CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); +#endif + #ifdef CUDNN_DNN_ROUTINE_EACH_R7 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 550fe2edee1..2f4f8101e4b 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ using cudnn_func = decltype(&::__name); \ std::call_once(cudnn_dso_flag, []() { \ cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index cc5cda6106c..15d51683665 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -53,6 +53,12 @@ namespace platform { namespace dynload { static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; +#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) +static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; +static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; +static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; +#endif + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -173,6 +181,8 @@ void* GetCublasDsoHandle() { void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); #endif @@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif @@ -201,6 +213,8 @@ void* GetCurandDsoHandle() { void* GetWarpCTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so"); #endif @@ -225,6 +239,8 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 84fd2ce9987..edb4c649add 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -18,6 +18,12 @@ namespace paddle { namespace platform { namespace dynload { +#ifndef _WIN32 +#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__)) +#else +#define DECLARE_TYPE(__name, ...) 
decltype(auto) +#endif + void* GetCublasDsoHandle(); void* GetCUDNNDsoHandle(); void* GetCUPTIDsoHandle(); diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index f0a97366236..944b00bae10 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -34,7 +34,7 @@ extern void* mklml_dso_handle; #define DYNAMIC_LOAD_MKLML_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using mklmlFunc = decltype(&::__name); \ std::call_once(mklml_dso_flag, []() { \ mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index 5d67658b94a..751aa54b1ad 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle; #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using tensorrt_func = decltype(__name(args...)) (*)(Args...); \ std::call_once(tensorrt_dso_flag, []() { \ tensorrt_dso_handle = \ diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index 18ed9956f18..bc1977b05de 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -34,7 +34,7 @@ extern void* warpctc_dso_handle; #define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
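The cudnn, mklml, tensorrt, and warpctc wrappers being edited here all share one pattern: load the shared library once under std::call_once, resolve the symbol with dlsym, and forward the call. (The MSVC-only DECLARE_TYPE fallback to decltype(auto) presumably sidesteps MSVC's trouble parsing decltype(__name(args...)) in this macro context; that reading is an inference, not stated in the patch.) A freestanding POSIX sketch of the same pattern, with illustrative names (link with -ldl on older glibc):

#include <dlfcn.h>

#include <mutex>
#include <stdexcept>
#include <utility>

static std::once_flag dso_flag;
static void* dso_handle = nullptr;

// Fn is the function type of the symbol, e.g. double(double).
template <typename Fn, typename... Args>
auto CallDynamic(const char* lib, const char* sym, Args... args)
    -> decltype(std::declval<Fn&>()(args...)) {
  std::call_once(dso_flag, [lib] {
    dso_handle = dlopen(lib, RTLD_LAZY);
    if (dso_handle == nullptr) throw std::runtime_error("cannot load library");
  });
  auto* fn = reinterpret_cast<Fn*>(dlsym(dso_handle, sym));
  if (fn == nullptr) throw std::runtime_error("symbol not found");
  return fn(args...);
}

int main() {
  // e.g. call cos from libm without linking it at build time.
  double r = CallDynamic<double(double)>("libm.so.6", "cos", 0.0);
  return r == 1.0 ? 0 : 1;
}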
{ \ using warpctcFunc = decltype(&::__name); \ std::call_once(warpctc_dso_flag, []() { \ warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \ diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index ad070171df3..41388d8959e 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -37,6 +37,10 @@ #define GOOGLE_GLOG_DLL_DECL #include // _popen, _pclose #include +#ifdef _WINSOCKAPI_ +/* Prevent inclusion of winsock.h in windows.h */ +#define WIN32_LEAN_AND_MEAN +#endif #include #include // std::accumulate in msvc #ifndef S_ISDIR // windows port for sys/stat.h @@ -55,7 +59,6 @@ static void *dlsym(void *handle, const char *symbol_name) { static void *dlopen(const char *filename, int flag) { std::string file_name(filename); - file_name.replace(0, file_name.size() - 1, '/', '\\'); HMODULE hModule = LoadLibrary(file_name.c_str()); if (!hModule) { throw std::runtime_error(file_name + " not found."); diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt index eabb51d370a..af033fa7407 100644 --- a/paddle/fluid/train/demo/CMakeLists.txt +++ b/paddle/fluid/train/demo/CMakeLists.txt @@ -35,16 +35,26 @@ add_executable(demo_trainer demo_trainer.cc) if(WITH_MKLDNN) include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include") - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0) -endif() + if(WIN32) + set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib) + else(WIN32) + set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0) + endif(WIN32) +endif(WITH_MKLDNN) if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so) + if(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib) + else(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so) + endif(WIN32) else() if(APPLE) set(MATH_LIB cblas) - else(APPLE) + elseif(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib) + else() set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a) endif(APPLE) endif() diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 139176b0d6c..078d543ba2d 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -48,12 +48,18 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in IF(WIN32) # Python would use the .pyd by default under Windows series platform set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/) - get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd) - add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} - DEPENDS paddle_pybind) + if(NOT WITH_MKLDNN) + get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} + DEPENDS paddle_pybind) + else(NOT WITH_MKLDNN) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS paddle_pybind) + endif(NOT WITH_MKLDNN) ELSE() set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) add_custom_command(OUTPUT ${FLUID_CORE} diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 
2dea71d7af9..fd788d09296 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -102,6 +102,12 @@ def __bootstrap__(): import sys import os import platform + + if os.name == 'nt': + third_lib_path = os.path.abspath(os.path.dirname(__file__)) + os.sep + '..' + os.sep + 'libs' + os.environ['path'] += ';' + third_lib_path + sys.path.append(third_lib_path) + from . import core in_test = 'unittest' in sys.modules @@ -128,13 +134,12 @@ def __bootstrap__(): 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname' + 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') if os.name != 'nt': - read_env_flags.append('warpctc_dir') read_env_flags.append('cpu_deterministic') if core.is_compiled_with_dist(): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 08979205946..da74fd41fcc 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -16,6 +16,7 @@ from __future__ import print_function import collections import contextlib +import os import re import six import sys @@ -27,11 +28,18 @@ from .proto import framework_pb2 try: from . import core except ImportError as e: - raise ImportError( - """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" - if you encounters \"libmkldnn.so not found\" errors. If you have python - installed in other directory, replace \"/usr/local/lib\" with your own - directory. The original error is: \n""" + cpt.get_exception_message(e)) + if os.name == 'nt': + raise ImportError( + """NOTE: You may need to run \"set PATH=c:\python27\lib;%PATH%\" + if you encounter \"mkldnn.dll not found\" errors. If you have python + installed in another directory, replace \"c:\python27\lib\" with your own + directory. The original error is: \n""" + cpt.get_exception_message(e)) + else: + raise ImportError( + """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" + if you encounter \"libmkldnn.so not found\" errors. If you have python + installed in another directory, replace \"/usr/local/lib\" with your own + directory. The original error is: \n""" + cpt.get_exception_message(e)) except Exception as e: raise e from .
import unique_name diff --git a/python/setup.py.in b/python/setup.py.in index 65620466412..f4613dd72de 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -158,27 +158,29 @@ if '${WITH_FLUID_ONLY}'== 'OFF': # put all thirdparty libraries in paddle.libs libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' -if os.name != 'nt': - package_data['paddle.libs']= [] - package_data['paddle.libs']=['libwarpctc' + ext_name] - shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + +package_data['paddle.libs']= [] +package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name] +shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + if '${WITH_MKL}' == 'ON': - shutil.copy('${MKLML_LIB}', libs_path) - shutil.copy('${MKLML_IOMP_LIB}', libs_path) - package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] + shutil.copy('${MKLML_SHARED_LIB}', libs_path) + shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) + package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name] if '${WITH_MKLDNN}' == 'ON': if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode. - # TODO(typhoonzero): use install_name_tool to patch mkl libs once - # we can support mkl on mac. - # - # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. - # The reason is that all thirdparty libraries in the same directory, - # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. - command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" - if os.system(command) != 0: - raise Exception("patch libmkldnn.so failed, command: %s" % command) - package_data['paddle.libs']+=['libmkldnn.so.0'] + if os.name != 'nt': + # only change rpath in Release mode. + # TODO(typhoonzero): use install_name_tool to patch mkl libs once + # we can support mkl on mac. + # + # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. + # The reason is that all thirdparty libraries in the same directory, + # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. 
+ command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch libmkldnn.so failed, command: %s" % command) + package_data['paddle.libs']+=['libmkldnn.so.0' if os.name != 'nt' else ('mkldnn' + ext_name)] shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) if '${WITH_NGRAPH}' == 'ON': # only change rpath in Release mode, -- GitLab From 2f55a04ec64e32adb1ca9da53b137c25ebd51c1c Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Tue, 18 Dec 2018 11:51:35 +0800 Subject: [PATCH 0311/2367] add refer result comparasion test=develop --- paddle/fluid/inference/tests/api/CMakeLists.txt | 13 ++++++++++--- .../inference/tests/api/analyzer_vis_tester.cc | 16 ++++++++++++++++ paddle/fluid/inference/tests/api/tester_helper.h | 1 + 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 5862fedb9aa..46ce61b7361 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -30,6 +30,13 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename ARGS --infer_model=${install_dir}/model) endfunction() +function(inference_analysis_api_test_with_refer_result target install_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt + --refer_result=${install_dir}/result.txt) +endfunction() + # RNN1 if(NOT APPLE AND WITH_MKLML) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") @@ -83,14 +90,14 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") endif() -inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) +inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) -# mobilenet with transpose +# mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") endif() -inference_analysis_api_test(test_analyzer_mobilenet ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) +inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 4700afdc86c..a8f7d5c4461 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -93,6 +93,22 @@ void profile(bool use_mkldnn = false) { SetInput(&input_slots_all); TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + std::string line; + std::ifstream file(FLAGS_refer_result); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + auto &output = outputs.front(); + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + CHECK_EQ(numel, refer.data.size()); + for (size_t i = 0; i < numel; ++i) 
{ + CHECK_LT( + fabs(static_cast(output.data.data())[i] - refer.data[i]), + 1e-5); + } + } } TEST(Analyzer_vis, profile) { profile(); } diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 4c8bce4600a..b07949c196c 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -36,6 +36,7 @@ DEFINE_string(model_name, "", "model name"); DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_data, "", "data file"); +DEFINE_string(refer_result, "", "reference result for comparison"); DEFINE_int32(batch_size, 1, "batch size."); DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); -- GitLab From acd4b759233b44b7ef52f65d21fff07a56f681ab Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 18 Dec 2018 12:25:25 +0800 Subject: [PATCH 0312/2367] skip_opt_set support list (#14845) * test=develop * fix tests. test=develop --- .../fluid/transpiler/memory_optimization_transpiler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 95aafec0536..5a7d04ed194 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -484,8 +484,11 @@ def memory_optimize(input_program, if level != 0 and level != 1: raise ValueError("only support opt_level 0 or 1.") - if skip_opt_set is not None and not isinstance(skip_opt_set, set): - raise ValueError("only support skip_opt_set as set.") + if skip_opt_set is not None: + if isinstance(skip_opt_set, set) or isinstance(skip_opt_set, list): + skip_opt_set = set(skip_opt_set) + else: + raise ValueError("only support skip_opt_set as set.") global PRINT_LOG PRINT_LOG = print_log if skip_grads: -- GitLab From fb8ae30331f42b6b9ef67c80e0ccb3fffcbf9836 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 12:35:45 +0800 Subject: [PATCH 0313/2367] fix test=develop --- paddle/fluid/framework/operator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 87f61f3afc3..807667e6846 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -919,7 +919,7 @@ Scope* OperatorWithKernel::PrepareData( } auto* trans_var = new_scope->Var(var_name); - input_vars[i] = var; + input_vars[i] = trans_var; Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); -- GitLab From 7cd24b13182bcdcbdb455a430d54d70172e73a59 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 18 Dec 2018 13:15:29 +0800 Subject: [PATCH 0314/2367] add ir memory optimize. (#14530) * follow comments. test=develop * Fix typo * fix compile error. test=develop * merge develop branch. test=develop * Remove set_equal * Polish code * Delete unused functions test=develop * polish code. test=develop * follow comment * polish code. * fix windows compile error. test=develop * fix op handle. * rerun ci. test=develop * rerun ci. test=develop * rerun macci. test=develop * polish code. test=develop * rewrite sort code. test=develop * remove unused code. test=develop * fix tests. test=develop * fix conflict. test=develop * follow comment. test=develop * merge develop branch. test=develop * fix tests. 
test=develop * remove ToTypeIndex. test=develop * rerun ci. test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 14 +- .../framework/details/analysis_var_pass.cc | 656 ++++++++++++++++++ .../framework/details/analysis_var_pass.h | 120 ++++ .../details/analysis_var_pass_test.cc | 470 +++++++++++++ .../fluid/framework/details/build_strategy.cc | 34 +- .../fluid/framework/details/build_strategy.h | 11 + .../details/early_delete_op_handle.h | 140 ++++ .../details/memory_early_delete_pass.cc | 117 ++++ .../details/memory_early_delete_pass.h | 32 + .../framework/details/memory_reuse_types.cc | 155 +++++ .../framework/details/memory_reuse_types.h | 87 +++ .../details/memory_reuse_types_test.cc | 99 +++ .../details/multi_devices_graph_print_pass.cc | 3 +- .../details/multi_devices_graph_print_pass.h | 5 +- .../fluid/framework/details/op_handle_base.h | 2 +- paddle/fluid/framework/ir/graph.cc | 5 +- paddle/fluid/framework/ir/graph_helper.cc | 3 +- paddle/fluid/framework/ir/graph_helper.h | 1 + paddle/fluid/framework/ir/node.cc | 8 + paddle/fluid/framework/ir/node.h | 5 +- paddle/fluid/framework/parallel_executor.cc | 19 +- paddle/fluid/framework/tensor_test.cc | 16 + paddle/fluid/pybind/pybind.cc | 8 + python/paddle/fluid/__init__.py | 2 +- .../unittests/parallel_executor_test_base.py | 2 + .../unittests/test_ir_memory_optimize_pass.py | 123 ++++ .../memory_optimization_transpiler.py | 61 +- 27 files changed, 2169 insertions(+), 29 deletions(-) create mode 100644 paddle/fluid/framework/details/analysis_var_pass.cc create mode 100644 paddle/fluid/framework/details/analysis_var_pass.h create mode 100644 paddle/fluid/framework/details/analysis_var_pass_test.cc create mode 100644 paddle/fluid/framework/details/early_delete_op_handle.h create mode 100644 paddle/fluid/framework/details/memory_early_delete_pass.cc create mode 100644 paddle/fluid/framework/details/memory_early_delete_pass.h create mode 100644 paddle/fluid/framework/details/memory_reuse_types.cc create mode 100644 paddle/fluid/framework/details/memory_reuse_types.h create mode 100644 paddle/fluid/framework/details/memory_reuse_types_test.cc create mode 100644 python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 97f7713d974..63a68ba3a5c 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -50,8 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) +cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) - +cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle + all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) 
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) @@ -63,7 +65,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass) +if (WITH_GPU) + list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) +endif() +cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph) +cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) @@ -84,4 +91,5 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass multi_batch_merge_pass) + fuse_elewise_add_act_pass multi_batch_merge_pass + memory_optimize_pass) diff --git a/paddle/fluid/framework/details/analysis_var_pass.cc b/paddle/fluid/framework/details/analysis_var_pass.cc new file mode 100644 index 00000000000..223b9da3cfb --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass.cc @@ -0,0 +1,656 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +DEFINE_bool(enable_subgraph_optimize, false, + "SubGraph also reuse global graph variables, it will reduce the " + "memory occupation" + "but a higher risk of memory reuse error. default disabled."); +DEFINE_string(memory_optimize_debug, "", + "debug the operator output variable when do the variable reuse." + "memory reuse pass." 
+ "only for debug, default disabled."); + +namespace paddle { +namespace framework { +namespace details { + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +template +class FilterVariableImpl { + public: + void operator()(const Container& nodes, Callback callback) { + for (auto* node : nodes) { + callback(node); + } + } +}; + +// filter var node for op->inputs/outputs +template +class FilterVariableImpl, Callback> { + public: + void operator()(const std::vector& nodes, Callback callback) { + for (auto* var : nodes) { + if (var->IsVar() && !var->IsCtrlVar()) { + callback(var); + } + } + } +}; + +template +void FilterVariables(const Container& nodes, Callback callback) { + FilterVariableImpl()(nodes, callback); +} + +std::unique_ptr AnalysisVarPass::ApplyImpl( + std::unique_ptr graph) const { + auto nodes = graph->Nodes(); + auto subblock_vars = GetSubBlockVars(nodes); + skip_set_.insert(subblock_vars.begin(), subblock_vars.end()); + + cfg_.reset(new details::ControlFlowGraph(*graph)); + cfg_->LiveVariableAnalysis(); + InitSSAGraphNodes(); + + int reuse_id = 0; + for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) { + auto& op = cfg_->Ops()[idx]; + auto* op_desc = op->Op(); + // some op in graph has no op desc + if (op_desc == nullptr) continue; + if (OpHasSubBlock(op_desc)) { + if (FLAGS_enable_subgraph_optimize) { + SubGraphOptimize(op_desc); + } else { + VLOG(3) << op->Name() + << " has subblock, but disable subgraph optimize. skipped."; + continue; + } + } + + for (auto& var : op->outputs) { + if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { + ir::Node* cache = pool_.NodeMatch(var); + if (var->Name() == FLAGS_memory_optimize_debug) { + VLOG(3) << "start match var " << DebugString(var) << " of op " + << op->Name(); + VLOG(3) << pool_.ToString(); + VLOG(3) << "matched in pool : " + << ((cache == nullptr) ? "False" : "True"); + } + if (cache != nullptr) { + if (var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused." + << var->Name() << " is re-filled to the pool after" + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + continue; + } + + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(reuse_id++), DebugString(var), DebugString(cache), + node_idx_in_pool, static_cast(pool_.size())); + // update CFG Graph on the fly. + // reused var maybe re-fill into the pool + cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); + // NOTE(dzhwinter): we need to both update the ProgramDesc + // and IR Graph. because op_desc/var_desc is used in CreateOp, + // CreateVar when running happens. But IR Graph + // define the dependence relationship between nodes. + RenameVarInGraphDesc(var->Name(), cache->Name(), idx); + RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); + + pool_.Erase(cache); + } + } + } + // fill the pool + for (auto var : cfg_->LiveIn(op)) { + if (cfg_->LiveOut(op).count(var) == 0) { + ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); + if (var_node == nullptr) continue; + if (NodeCanReused(var_node) && !pool_.Has(var_node)) { + pool_.Insert(var_node, op); + } + } + } + } + graph->ResolveHazard(var_nodes_); + + // For early delete pass. use GraphNodePool load the unlived vars. + // 1. find all deps op for each unlived var in memory pool. 
+ for (auto& op : graph->Nodes()) { + for (auto& var : op->inputs) { + if (pool_.Has(var)) { + pool_.Insert(var, op); + } + } + } + // 2. convert ir node based memory pool to graph node + // because Node* maybe released bettwen passes. + auto& graph_pool = graph->Get(kGraphNodePool); + for (auto it = pool_.begin(); it != pool_.end(); ++it) { + std::unordered_set descs; + for (auto& op : it->second) { + PADDLE_ENFORCE(op->IsOp()); + descs.insert(op->Op()); + } + graph_pool.push_back(std::make_pair(it->first->Name(), descs)); + } + + return graph; +} + +void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const { + // conditional block, while op and their grad op + auto* sub_block_desc = + AttrReader(op_desc->GetAttrMap()).Get("sub_block"); + + // create a mirror block to construct an IR Graph. + ProgramDesc prog; + auto* copy_block = prog.MutableBlock(0); + for (auto* op : sub_block_desc->AllOps()) { + auto* copy_op = copy_block->AppendOp(); + copy_op->CopyFrom(*op); + copy_op->Flush(); + } + + for (auto* var : sub_block_desc->AllVars()) { + auto* copy_var = copy_block->Var(var->Name()); + copy_var->SetDataType(var->GetDataType()); + // only lod tensor can be reused. So ignore the multiple dims case. + copy_var->SetType(var->GetType()); + copy_var->SetShape(var->GetShape()); + copy_var->SetPersistable(var->Persistable()); + } + + ir::Graph sub_graph(prog); + std::unordered_set sub_graph_all_ops; + FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) { + // sub_graph_all_ops.emplace(var); + if (var->IsVar() && !var->IsCtrlVar()) { + sub_graph_all_ops.emplace(var); + } + }); + int sub_reuse_id = 0; + // subgraph nodes is unordered, reuse need to follow the desc order. + // find the right op node through the descs + for (auto* sub_op_desc : sub_block_desc->AllOps()) { + ir::Node* sub_op = nullptr; + for (auto* node : sub_graph_all_ops) { + if (node->Op() == sub_op_desc) { + sub_op = node; + break; + } + } + PADDLE_ENFORCE(sub_op != nullptr); + for (auto* var : sub_op->outputs) { + if (NodeCanReused(var)) { + ir::Node* cache = pool_.NodeMatch(var); + if (cache != nullptr) { + if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { + continue; + } + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(sub_reuse_id++), DebugString(var), + DebugString(cache), node_idx_in_pool, + static_cast(pool_.size())); + // NOTE(dzh): subblock is not in IR graph. Modify the block_desc + // immediately to make the subblock variable reuse strategy take + // effect. Because it is a single op in graph. No need to + // update the ir nodes. 
+ sub_op_desc->Rename(var->Name(), cache->Name()); + if (sub_op_desc->Block()->HasVar(var->Name())) { + sub_op_desc->Block()->RemoveVar(var->Name()); + } + } + } + } + } +} + +std::unordered_set AnalysisVarPass::GetSubBlockVars( + const std::unordered_set& nodes) const { + std::unordered_set vars; + for (auto& op : nodes) { + if (!op->IsOp() || op->Op() == nullptr) continue; + auto* op_desc = op->Op(); + if (OpHasSubBlock(op_desc)) { + auto inputs = op_desc->InputArgumentNames(); + auto outputs = op_desc->OutputArgumentNames(); + vars.insert(inputs.begin(), inputs.end()); + vars.insert(outputs.begin(), outputs.end()); + } + } + return vars; +} + +void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, + size_t idx) const { + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + PADDLE_ENFORCE(op->IsOp() && op->Op()); + auto* op_desc = op->Op(); + op_desc->RenameInput(var, cache_var); + op_desc->RenameOutput(var, cache_var); + if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); + op_desc->Flush(); + } +} + +void AnalysisVarPass::InitSSAGraphNodes() const { + std::unordered_map> all_vars; + if (var_nodes_.empty()) { + for (auto* op : cfg_->Ops()) { + for (auto* node : op->inputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + for (auto* node : op->outputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + } + } +} + +void AnalysisVarPass::RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, + size_t idx, ir::Graph* graph) const { + // if replace happens, we need to create a newer version cache_var + // but use the same dims/data_type with var. 
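SortOpLikeDescOrder, defined below, is at heart a topological sort over the op nodes that prefers the recorded OpDesc order among the ready ops. Stripped of the desc matching, the core is Kahn's algorithm; a minimal sketch with plain integer node ids, not the real ir::Node API:

    // Kahn's algorithm, the topological-sort core of SortOpLikeDescOrder
    // below; the real function additionally prefers the recorded OpDesc
    // order when several ops are ready at once. Nodes are ints for brevity.
    #include <cassert>
    #include <queue>
    #include <utility>
    #include <vector>

    std::vector<int> TopoSort(int n,
                              const std::vector<std::pair<int, int>>& edges) {
      std::vector<std::vector<int>> succ(n);
      std::vector<int> indeg(n, 0);
      for (const auto& e : edges) {
        succ[e.first].push_back(e.second);
        ++indeg[e.second];
      }
      std::queue<int> ready;
      for (int i = 0; i < n; ++i)
        if (indeg[i] == 0) ready.push(i);
      std::vector<int> order;
      while (!ready.empty()) {
        int u = ready.front();
        ready.pop();
        order.push_back(u);
        for (int v : succ[u])
          if (--indeg[v] == 0) ready.push(v);
      }
      assert(static_cast<int>(order.size()) == n);  // fails only on a cycle
      return order;
    }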
+ PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && + var_nodes_[var].at(0)->Var() != nullptr); + std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); + var_desc->SetName(cache_var); + + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + + // redirect the input to the latest version of cache_var + for (auto* node : op->inputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache_node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); + auto* prev_op = node->inputs[0]; + std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, + cache_node); + cache_node->inputs.emplace_back(prev_op); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + } + } + + // if we need to rename the output, + // always create a newer version of cache_var + for (auto* node : op->outputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + cache_node->inputs.emplace_back(op); + std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + } + } + } + + // release node of unused var in graph + for (auto* node : var_nodes_[var]) { + graph->RemoveNode(node); + } + var_nodes_.at(var).clear(); +} + +bool AnalysisVarPass::NodeCanReused(ir::Node* node) const { + if (!node->IsVar() || node->IsCtrlVar()) return false; + auto* desc = node->Var(); + auto type = desc->GetType(); + if (desc->Persistable() || type != proto::VarType::LOD_TENSOR || + desc->GetShape().empty()) { + return false; + } + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad + std::string name = node->Name(); + if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') + return false; + if (skip_set_.count(name)) return false; + for (auto* op : node->inputs) { + if (op->Op()->HasAttr("force_cpu")) { + // op output force generated in cpu, can not be reused. + return framework::AttrReader(op->Op()->GetAttrMap()) + .Get("force_cpu") == 0; + } + } + return true; +} + +bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const { + const AttributeMap& attrs = desc->GetAttrMap(); + for (auto& attr : attrs) { + if (attr.second.type() == typeid(BlockDesc*) || // NOLINT + attr.second.type() == typeid(std::vector)) // NOLINT + return true; + } + return false; +} + +std::vector SortOpLikeDescOrder(const ir::Graph& graph) { + PADDLE_ENFORCE(graph.Has(kAllOpDescs), + "Graph has no attribute of kAllOpDescs."); + // 1. get op desc order + auto& op_descs = graph.Get>(kAllOpDescs); + + // 2. 
topology sort order + auto nodes = graph.Nodes(); + std::deque ops; + FilterVariables(nodes, [&](ir::Node* op) { + if (op->IsOp() && op->Op() != nullptr) { + ops.emplace_back(op); + } + }); + std::unordered_map op_deps; + std::list ready_ops; + std::unordered_map> pending_ops; + + for (auto* op : ops) { + std::unordered_set preceding_op; + for (auto* in : op->inputs) { + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); + preceding_op.emplace(in->inputs[0]); + pending_ops[in->inputs[0]].emplace(op); + } + op_deps[op] = preceding_op.size(); + if (preceding_op.empty()) { + ready_ops.emplace_back(op); + } + } + + // 3. generated op list based desc order and the topology order + std::vector ret; + std::list op_descs_list(op_descs.begin(), op_descs.end()); + + auto update_by_found_node = [&](ir::Node* found_node) { + for (auto* pending_op : pending_ops[found_node]) { + if (--op_deps[pending_op] == 0) { + ready_ops.emplace_back(pending_op); + } + } + ready_ops.remove(found_node); + ret.emplace_back(found_node); + }; + + while (!ready_ops.empty()) { + bool all_of_ready_op_unmatched = true; + for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { + auto op_desc = *it; + ir::Node* found_node = nullptr; + for (auto* op : ready_ops) { + if (IsSameDesc(op->Op(), op_desc)) { + found_node = op; + break; + } + } + + // 3.1 op desc deleted by other pass + if (found_node == nullptr) { + ++it; + continue; + } else { + all_of_ready_op_unmatched = false; + it = op_descs_list.erase(it); + } + update_by_found_node(found_node); + } + + // 3.2 op descs are added by other pass + // preceding op non empty means some new op descs are + // created, but not contained in return node list. + // these new op desc may depend on each other. + std::list prev_ready_ops(ready_ops); + if (all_of_ready_op_unmatched) { + for (auto op : prev_ready_ops) { + update_by_found_node(op); + } + } + } + + PADDLE_ENFORCE(std::all_of( + op_deps.begin(), op_deps.end(), + [&](const std::pair& p) { return p.second == 0; })); + + return ret; +} + +ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { + ops_ = SortOpLikeDescOrder(graph); + ConnectNodes(); +} + +void ControlFlowGraph::BuildCFGGraph() { + // FIXME(dzh): same effect with ConnectNodes, but use the control + // link to build dependency graph, it goes wrong in transformer. + for (ir::Node* op : ops_) { + for (auto& input_var : op->inputs) { + if (!input_var->inputs.empty()) { + PADDLE_ENFORCE( + input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + auto* pred_op = input_var->inputs[0]; + if (pred_op->Op() != nullptr) { + predecessors_[op].insert(pred_op); + successors_[pred_op].insert(op); + } + } + if (input_var->IsVar() && !input_var->IsCtrlVar()) { + uses_[op].insert(input_var->Name()); + } + } + for (auto& output_var : op->outputs) { + // output var may be used by many op + for (auto* succ_op : output_var->outputs) { + if (succ_op->Op() != nullptr) { + successors_[op].insert(succ_op); + predecessors_[succ_op].insert(op); + } + } + if (output_var->IsVar() && !output_var->IsCtrlVar()) { + defs_[op].insert(output_var->Name()); + } + } + } +} + +void ControlFlowGraph::ConnectNodes() { + for (size_t i = 0; i < ops_.size(); ++i) { + auto& op = ops_[i]; + try { + auto& next_op = ops_.at(i + 1); + successors_[op].insert(next_op); + predecessors_[next_op].insert(op); + } catch (...) 
{ + // do nothing + } + + FilterVariables(op->inputs, + [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); + + FilterVariables(op->outputs, + [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); + } +} + +void ControlFlowGraph::LiveVariableAnalysis() { + // NOTE(dzh): variable liveness analysis (a.k.a. reversed_ops algorithm) + // computes the liveness of each variable through the reversed_ops algorithm. + // It iterates the operators from end to begin, computing the live in/live out + // variable set for each op; the diff between in/out will then be used for + // the variable reuse. For details refer to + // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf + std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend()); + while (!work_list.empty()) { + ir::Node* op = work_list.front(); + work_list.pop_front(); + // get the live_in calculated before. Empty if first. + auto prev_live_in = std::move(live_in_[op]); + for (auto& s : successors_[op]) { + for (auto& var : live_in_[s]) { + live_out_[op].insert(var); + } + } + for (auto& var : uses_[op]) { + live_in_[op].insert(var); + } + for (auto& var : live_out_[op]) { + live_in_[op].insert(var); + } + for (auto& var : defs_[op]) { + live_in_[op].erase(var); + } + + // If the live_in is not changed, then the liveness analysis of + // predecessors is completed. + // + // Otherwise, recalculate the predecessors' liveness. + if (live_in_[op] != prev_live_in) { + for (auto& pre : predecessors_[op]) { + work_list.push_back(pre); + } + } + } +}
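The fixed point the worklist above computes is the standard backward liveness system: LiveOut(op) = union of LiveIn(s) over all successor ops s, and LiveIn(op) = Use(op) ∪ (LiveOut(op) − Def(op)). On the five-instruction example documented in the test file below (a = 1; b = a; c = a; d = b + c; e = d), iterating from the last op backwards gives LiveIn(e = d) = {d}, hence LiveOut(d = b + c) = {d} and LiveIn(d = b + c) = {b, c}, which is exactly what the CFGGraph.IRGraph test asserts.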
+ +void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, + int begin_idx) { + // update graph from begin idx to the end + for (size_t i = begin_idx; i != ops_.size(); ++i) { + auto* op = ops_[i]; + if (uses_[op].find(old_node) != uses_[op].end()) { + uses_[op].erase(old_node); + uses_[op].insert(new_node); + } + if (defs_[op].find(old_node) != defs_[op].end()) { + defs_[op].erase(old_node); + defs_[op].insert(new_node); + } + if (live_in_[op].find(old_node) != live_in_[op].end()) { + live_in_[op].erase(old_node); + live_in_[op].insert(new_node); + } + if (live_out_[op].find(old_node) != live_out_[op].end()) { + live_out_[op].erase(old_node); + live_out_[op].insert(new_node); + } + } +} + +const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const { + auto it = live_in_.find(op); + PADDLE_ENFORCE( + it != live_in_.end(), + string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); + return it->second; +} + +const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const { + auto it = live_out_.find(op); + PADDLE_ENFORCE( + it != live_out_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const { + auto it = uses_.find(op); + PADDLE_ENFORCE( + it != uses_.end(), + string::Sprintf("Expect %s in uses, but Not Found.", op->Name())); + return it->second; +} + +const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; } + +std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; } + +ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, + ir::Node* op) const { + // in the SSA graph, different version nodes have the same name; + // this function gets the latest version var before the target op. + // It may return nullptr, such as for a data node. + ir::Node* found_node = nullptr; + for (auto* node : ops_) { + if (node == op) break; + for (auto& output : node->outputs) { + if (output->Name() == name) { + found_node = output; + } + } + } + return found_node; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass) + .RequireGraphAttr(paddle::framework::details::kGraphNodePool) + .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/analysis_var_pass.h b/paddle/fluid/framework/details/analysis_var_pass.h new file mode 100644 index 00000000000..144204beafb --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass.h @@ -0,0 +1,120 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { +constexpr char kAllOpDescs[] = "all_op_descs"; + +std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph); +// sort ops in BFS order +std::vector<ir::Node*> BFSSortGraphOps(const ir::Graph& graph); + +class ControlFlowGraph; + +class AnalysisVarPass : public ir::Pass { + protected: + std::unique_ptr<ir::Graph> ApplyImpl( + std::unique_ptr<ir::Graph> graph) const override; + + private: + // fill the variable map (var_nodes) by version. + void InitSSAGraphNodes() const; + // update program descs + void RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, size_t idx) const; + // update ir nodes + void RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, size_t idx, + ir::Graph* graph) const; + + void SubGraphOptimize(OpDesc* op_desc) const; + // check whether a tensor can be reused + bool NodeCanReused(ir::Node* node) const; + // scan subblocks and collect the output/input variables. + std::unordered_set<std::string> GetSubBlockVars( + const std::unordered_set<ir::Node*>&) const; + // check whether the op has a subblock + bool OpHasSubBlock(OpDesc* desc) const; + + private: + // Reuse Node Pool, Owned.
+ mutable OrderedNodePairPool pool_; + // control flow graph + mutable std::unique_ptr<ControlFlowGraph> cfg_; + // skip set + mutable std::unordered_set<std::string> skip_set_; + // var nodes + mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_; +}; + +class ControlFlowGraph { + public: + ControlFlowGraph() = default; + // For the IR Graph in ParallelExecutor + explicit ControlFlowGraph(const ir::Graph& graph); + + void LiveVariableAnalysis(); + + void RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, int begin_idx); + + const std::set<std::string> LiveIn(ir::Node* op) const; + const std::set<std::string> LiveOut(ir::Node* op) const; + const std::set<std::string> Use(ir::Node* op) const; + const std::vector<ir::Node*> Ops() const; + std::vector<ir::Node*>& Ops(); + + // for ssa-graph nodes + ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const; + + private: + void BuildCFGGraph(); + void ConnectNodes(); + using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>; + using VarSetMap = std::map<ir::Node*, std::set<std::string>>; + // successor ops use the output variables. + NodeListMap successors_; + // predecessor ops generate the input variables. + NodeListMap predecessors_; + // variables live before running the current op. + VarSetMap live_in_; + // variables live after running the current op. + VarSetMap live_out_; + VarSetMap uses_; // op inputs + VarSetMap defs_; // op outputs + + std::vector<ir::Node*> ops_; // op sequence in topological order +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/analysis_var_pass_test.cc b/paddle/fluid/framework/details/analysis_var_pass_test.cc new file mode 100644 index 00000000000..9bc4fd33f70 --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass_test.cc @@ -0,0 +1,470 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
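Before the tests, a self-contained sketch of the backward worklist that LiveVariableAnalysis implements, reduced to a straight-line program where op i's only successor is op i + 1; ints index the ops and strings name the variables, so this mirrors the algorithm rather than the real ir::Node-based API:

    // Standalone sketch of the backward liveness worklist: use/def give
    // each op's read and written variable sets, in execution order.
    #include <list>
    #include <set>
    #include <string>
    #include <vector>

    using VarSet = std::set<std::string>;

    void Liveness(const std::vector<VarSet>& use, const std::vector<VarSet>& def,
                  std::vector<VarSet>* live_in, std::vector<VarSet>* live_out) {
      const int n = static_cast<int>(use.size());
      live_in->assign(n, VarSet());
      live_out->assign(n, VarSet());
      std::list<int> work;
      for (int i = n - 1; i >= 0; --i) work.push_back(i);  // last op first
      while (!work.empty()) {
        int op = work.front();
        work.pop_front();
        VarSet prev = (*live_in)[op];
        (*live_out)[op] = (op + 1 < n) ? (*live_in)[op + 1] : VarSet();
        VarSet in = use[op];                        // LiveIn = Use ...
        for (const auto& v : (*live_out)[op])
          if (def[op].count(v) == 0) in.insert(v);  // ... + (LiveOut - Def)
        (*live_in)[op] = in;
        if (in != prev && op > 0) work.push_back(op - 1);  // re-run predecessor
      }
    }

Seeding the worklist in reverse order makes a straight-line program converge in a single sweep; the general CFG case re-queues predecessors whenever a live-in set grows, just as the pass does.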
+ +#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class DummyOp : public OperatorBase { + public: + DummyOp(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class AssignOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class DummyVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc& op_desc, BlockDesc* block) const override { + auto& inputs = op_desc.Input("X"); + auto type = block->Var(inputs.front())->GetType(); + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(type); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(assign, paddle::framework::DummyOp, + paddle::framework::AssignOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +/* + https://en.wikipedia.org/wiki/Live_variable_analysis + Create a customed classical dependency graph, left row is the instruction + number. + 1. a = 1 + 2. b = a + 3. c = a + 4. d = b + c + 5. 
e = d + + a--------+ + | | + b c + | | + d--------+ + | + e + Then analysis these variable's liveness range + */ + +namespace paddle { +namespace framework { +namespace details { + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +inline static ProgramDesc FillProgramDesc() { + ProgramDesc prog; + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"c"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"d"}); + op->SetOutput("Out", {"e"}); + } + return prog; +} + +template +inline static std::string DebugString(const Container& c) { + std::stringstream ss; + for (auto& item : c) { + ss << item << " "; + } + return ss.str(); +} + +TEST(CFGGraph, IRGraph) { + // prepare ir graph + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + ControlFlowGraph cfg(graph); + cfg.LiveVariableAnalysis(); + + // test assign op + ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); + + // test assign op + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); + + // test sum op + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); + ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); + + // test assign op + ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); + ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); +} + +// 1. normal test +TEST(SortOpLikeDescOrder, NormalTest) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto nodes = SortOpLikeDescOrder(graph); + auto op_descs = prog.Block(0).AllOps(); + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 2. 
remove some op_desc +TEST(SortOpLikeDescOrder, RemoveOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + auto nodes = graph.Nodes(); + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->outputs.back()->Name() == "e") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + graph.RemoveNode(found_node); + graph.RemoveNode(e); + + // other node keeps the same order + auto remain_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < remain_nodes.size(); ++i) { + auto node = remain_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 3. add some op_desc +TEST(SortOpLikeDescOrder, AddOpDesc) { + auto prog = FillProgramDesc(); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + ir::Graph graph(prog); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // cached desc different with real one + // mimic the intermidiete pass modify the programdesc. + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto op_descs = prog.Block(0).AllOps(); + + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + op_descs.insert(op_descs.begin() + 4, op); + + auto nodes = SortOpLikeDescOrder(graph); + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 4. 
add and delete some op_desc +TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // remove sum node + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + auto nodes = graph.Nodes(); + for (auto node : nodes) { + if (node->Name() == "sum") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* c = find_node_in_graph("c"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(c->outputs.begin(), c->outputs.end(), found_node); + ir::Node* pending_op = found_node->outputs[0]->outputs[0]; + graph.RemoveNode(e); + graph.RemoveNode(pending_op); + graph.RemoveNode(found_node); + } + + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + op_descs.insert(op_descs.begin() + 2, op); + + // check the order + auto mynodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < mynodes.size(); ++i) { + auto node = mynodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 5. add and replace some op_desc inplace. 
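A note on the std::remove calls used by these graph-editing tests: std::remove only shifts the kept elements forward and returns the new logical end, so removal from a std::vector is normally paired with erase, the erase-remove idiom. A minimal sketch:

    // The erase-remove idiom: std::remove partitions the kept elements to
    // the front and returns the new logical end; vector::erase must then
    // shrink the container, otherwise stale elements remain at the tail.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> v = {1, 2, 3, 2};
      v.erase(std::remove(v.begin(), v.end(), 2), v.end());
      assert((v == std::vector<int>{1, 3}));
      return 0;
    }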
+TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + auto op_descs = prog.Block(0).AllOps(); + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + + op_descs.emplace_back(op); + + // replace op_desc inplace + auto nodes = graph.Nodes(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->Op() && node->Name() == "assign") { + if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { + found_node = node; + break; + } + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(e->inputs.begin(), e->inputs.end(), found_node); + graph.RemoveNode(found_node); + } + op_descs.erase(op_descs.begin() + 3); + + auto replace_op = prog.MutableBlock(0)->AppendOp(); + replace_op->SetType("sum"); + replace_op->SetInput("X", {"d", "d1"}); + replace_op->SetOutput("Out", {"e"}); + { + ir::Node* sum2 = graph.CreateOpNode(replace_op); + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + ir::Node* d1 = find_node_in_graph("d1"); + sum2->inputs.emplace_back(d); + sum2->inputs.emplace_back(d1); + sum2->outputs.emplace_back(e); + e->inputs.emplace_back(sum2); + d->outputs.emplace_back(sum2); + d1->outputs.emplace_back(sum2); + } + + op_descs.emplace_back(replace_op); + // compare op order + auto graph_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < graph_nodes.size(); ++i) { + auto node = graph_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index d8526b3f249..779a9ed5236 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -14,11 +14,16 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/build_strategy.h" +#include +#include + +#include "paddle/fluid/framework/details/memory_reuse_types.h" #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" namespace paddle { @@ -69,6 +74,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } VLOG(1) << "CollectiveContext:" << context->String(); + // NOTE(dzh): memory optimize should be a runtime pass. + // However, after multi_devices_pass, VarHandle, OpHandle is + // the de-fact IR, any reuse on Graph is meaningless. + // A side-effect of that, memory optimize cannot forsee the fetched vars + // , so fetchlist should be set persistable before call the Run interface. + if (strategy.memory_optimize_) { + auto analysis_var_pass = AppendPass("analysis_var_pass"); + } // Convert graph to run on multi-devices. auto multi_devices_pass = AppendPass("multi_devices_pass"); multi_devices_pass->SetNotOwned("strategy", @@ -79,8 +92,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); - multi_devices_print_pass->SetNotOwned( - "debug_graphviz_path", &strategy_.debug_graphviz_path_); + const std::string graph_path = + string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(), + "_multi_devices_graph"); + multi_devices_print_pass->Set(kGraphvizPath, + new std::string(graph_path)); multi_devices_print_pass->Set( "graph_printer", new details::GraphvizSSAGraphPrinter); } @@ -127,7 +143,6 @@ std::unique_ptr BuildStrategy::Apply( CreatePassesFromStrategy(false); std::unique_ptr graph(new ir::Graph(main_program)); - for (std::shared_ptr &pass : pass_builder_->AllPasses()) { if (pass->Type() == "multi_devices_pass") { pass->Erase("places"); @@ -145,6 +160,17 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif + } else if (pass->Type() == "analysis_var_pass") { + const std::vector *all_op_descs = + new std::vector(main_program.Block(0).AllOps()); + graph->Set>(kAllOpDescs, + all_op_descs); // take ownership + graph->Set(kGraphNodePool, + new GraphNodePool); // take ownership + + pass->Erase(kAllOpDescs); + pass->SetNotOwned>(kAllOpDescs, all_op_descs); + } else if (pass->Type() == "sequential_execution_pass") { LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; @@ -166,6 +192,7 @@ std::unique_ptr BuildStrategy::Apply( } return graph; } + } // namespace details } // namespace framework } // namespace paddle @@ -176,6 +203,7 @@ USE_PASS(multi_batch_merge_pass); USE_PASS(multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); +USE_PASS(analysis_var_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index c97be169575..29396501dc0 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ 
b/paddle/fluid/framework/details/build_strategy.h @@ -60,8 +60,15 @@ struct BuildStrategy { kCustomized = 2, }; + enum class OptimizeStrategy { + // To be Implemented,bruteforce, recursive compute unused var names. + kBruteForce = 0, + kControlFlowGraph = 1, // use cfg_graph algorithm, faster speed. + }; + ReduceStrategy reduce_{ReduceStrategy::kAllReduce}; GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; + OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph}; std::string debug_graphviz_path_{""}; @@ -69,6 +76,10 @@ struct BuildStrategy { bool enable_data_balance_{false}; + bool memory_optimize_{false}; + + bool memory_early_delete_{false}; + bool enable_sequential_execution_{false}; bool fuse_broadcast_op_{false}; diff --git a/paddle/fluid/framework/details/early_delete_op_handle.h b/paddle/fluid/framework/details/early_delete_op_handle.h new file mode 100644 index 00000000000..c8382d34b79 --- /dev/null +++ b/paddle/fluid/framework/details/early_delete_op_handle.h @@ -0,0 +1,140 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +namespace details { + +class EarlyDeleteOpHandle : public OpHandleBase { + public: + EarlyDeleteOpHandle(ir::Node* node, const Scope* scope, + const platform::Place& place, + const std::vector& names, + GarbageCollector* gc) + : OpHandleBase(node), + scope_(scope), + place_(place), + names_(names), + gc_(gc) { +#ifdef PADDLE_WITH_CUDA + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(place); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } +#endif + } + ~EarlyDeleteOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif + } + + std::string Name() const override { return "early_delete"; } + + protected: + void RunImpl() override { + std::vector> tensors; + auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + for (auto& var_name : names_) { + auto* var = local_scope->FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, + string::Sprintf("Local Scope not has var %s", var_name)); + if (var->IsType()) { + tensors.emplace_back(var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + tensors.emplace_back(var->GetMutable() + ->mutable_value() + 
->MoveMemoryHolder()); + } else if (var->IsType()) { + LoDTensorArray* tensor_array = var->GetMutable(); + for (auto& tensor : *tensor_array) { + tensors.emplace_back(tensor.MoveMemoryHolder()); + } + } + } + if (!tensors.empty()) { + ClearTensors(tensors); + } + } + + private: + void ClearTensors( + const std::vector>& tensors) { + if (platform::is_cpu_place(place_)) { + ClearCPUTensors(tensors); + } else { + ClearGPUTensors(tensors); + } + } + + void ClearCPUTensors( + const std::vector>& tensors) { + auto* gc = dynamic_cast(gc_); + if (gc != nullptr) { + gc->Add(tensors); + } + } + + void ClearGPUTensors( + const std::vector>& tensors) { +#ifdef PADDLE_WITH_CUDA + auto* gc = dynamic_cast(gc_); + if (gc != nullptr) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = gc->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { + gc_->Add(tensors); + } + } + + bool IsStreamGarabageCollector() const { + return dynamic_cast(gc_) != nullptr; +#endif + } + + const Scope* scope_; + const platform::Place place_; + std::vector names_; + GarbageCollector* gc_; +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext* dev_ctx_; + cudaEvent_t event_; +#endif +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc new file mode 100644 index 00000000000..06a2451c136 --- /dev/null +++ b/paddle/fluid/framework/details/memory_early_delete_pass.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
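+
+// Editor's note (not part of the original patch): this pass consumes the
+// kGraphNodePool graph attribute (variable name -> the OpDescs that last
+// use it) and appends an EarlyDeleteOpHandle behind the last compute op of
+// each unlived variable, so its memory returns to the allocator before the
+// scope is cleaned up. A minimal usage sketch, assuming the registration at
+// the bottom of this file and the wiring shown later in parallel_executor.cc:
+//
+//   auto pass = ir::PassRegistry::Instance().Get("memory_early_delete_pass");
+//   pass->SetNotOwned(details::kGarbageCollector, &gcs_);
+//   graph = pass->Apply(std::move(graph));  // graph must carry kGraphNodePool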
+ +#include "paddle/fluid/framework/details/memory_early_delete_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) { + std::queue queue; + queue.push(var_in); + do { + auto* var = queue.front(); + queue.pop(); + for (auto* op : var->PendingOps()) { + auto* compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { + return compute_op; + } + for (auto* out_var : op->Outputs()) { + queue.push(out_var); + } + } + } while (!queue.empty()); + return nullptr; +} + +std::unique_ptr MemoryEarlyDeletePass::ApplyImpl( + std::unique_ptr graph) const { + auto& graph_pool = Get(kGraphNodePool); + auto& gcs = Get(kGarbageCollector); + + std::unordered_map> unlived_vars; + unlived_vars.reserve(graph_pool.size()); + for (auto& pair : graph_pool) { + unlived_vars.insert(std::make_pair(pair.first, pair.second)); + } + + auto compare_and_insert_early_delete_op = [&]( + OpHandleBase* op, const std::vector& vars) { + if (unlived_vars.empty()) return; + // unlived vars can be deleted after the last used op has finished. + auto* compute_op = dynamic_cast(op); + const auto& places = Get>(kAllPlaces); + for (auto& var : vars) { + auto* var_handle = dynamic_cast(var); + auto var_name = var->Node()->Name(); + auto& var_place = var_handle->place_; + if (unlived_vars.count(var_name) == 0) continue; + if (!unlived_vars[var_name].empty()) { + if (compute_op != nullptr && + unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) { + unlived_vars[var_name].erase(compute_op->Node()->Op()); + } + continue; + } + + if (var_handle == nullptr || !var_handle->Node()->IsVar() || + var_handle->Node()->IsCtrlVar()) + continue; + + // shameless copyed from reference count pass. 
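+      // Editor's note: when the last user of the variable is not a
+      // ComputationOpHandle (e.g. a communication or data-transfer handle),
+      // FindNextComputationOpHandle above BFS-walks the pending ops to find
+      // the next compute op on the same place; its scope then hosts the
+      // early-delete handle.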
+ if (compute_op == nullptr) { + // use next computation op scope + compute_op = FindNextComputationOpHandle(var_handle); + } + auto* early_delete_node = + graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation); + GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get(); + auto* early_delete_handle = new EarlyDeleteOpHandle( + early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc); + if (compute_op->Outputs().empty()) { + auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + compute_op->AddOutput(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); + } + early_delete_handle->AddInput(compute_op->Outputs().front()); + VLOG(5) << "Add early delete op " << var_name << " to Operator" + << compute_op->Name(); + } + }; + + auto all_ops = ir::FilterByNodeWrapper(*graph); + for (auto& op : all_ops) { + compare_and_insert_early_delete_op(op, op->Inputs()); + compare_and_insert_early_delete_op(op, op->Outputs()); + } + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(memory_early_delete_pass, + paddle::framework::details::MemoryEarlyDeletePass) + .RequireGraphAttr(paddle::framework::details::kGraphNodePool) + .RequireGraphAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/details/memory_early_delete_pass.h new file mode 100644 index 00000000000..8215aa1b2ba --- /dev/null +++ b/paddle/fluid/framework/details/memory_early_delete_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/details/early_delete_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class MemoryEarlyDeletePass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.cc b/paddle/fluid/framework/details/memory_reuse_types.cc new file mode 100644 index 00000000000..2b9ff518b9a --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include +#include +#include + +namespace paddle { +namespace framework { +namespace details { + +size_t NodeSizeInBytes(ir::Node* n) { + auto* desc = FindVarDescInBlock(n); + auto shape = desc->GetShape(); + size_t type_size = SizeOfType(desc->GetDataType()); + int size = 1; + for (auto& s : shape) { + size *= s; + } + return type_size * std::abs(size); +} + +std::string DebugStringImpl(VarDesc* var) { + std::stringstream ss; + ss << var->Name(); + ss << "["; + try { + auto shape = var->GetShape(); + for (size_t i = 0; i < shape.size(); ++i) { + if (i != shape.size() - 1) { + ss << shape[i] << ","; + } else { + ss << shape[i]; + } + } + ss << "]"; + } catch (...) { + ss << "Var has no VarDesc !!! Name:" << var->Name(); + } + return ss.str(); +} + +std::string DebugString(ir::Node* var) { + return DebugStringImpl(FindVarDescInBlock(var)); +} +// return DebugString(var->Var()); } + +// NOTE(dzh): based ir node, if a large node has been reused +// by a small size node, then next time it appear in pool, it will +// have the small size. Find the original node shap from blockdesc. +VarDesc* FindVarDescInBlock(ir::Node* n) { + PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); + BlockDesc* block = n->inputs[0]->Op()->Block(); + PADDLE_ENFORCE(block->HasVar(n->Name()), + string::Sprintf("Block do not has var %s", n->Name())); + return block->FindVar(n->Name()); +} + +struct NodeComparator { + bool operator()(ir::Node* lhs, ir::Node* rhs) const { + auto* lhs_desc = FindVarDescInBlock(lhs); + auto* rhs_desc = FindVarDescInBlock(rhs); + auto lhs_shape = lhs_desc->GetShape(); + auto rhs_shape = rhs_desc->GetShape(); + if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || + (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { + return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); + } else { + return false; + } + } +}; + +void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) { + PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); + PADDLE_ENFORCE(op->IsOp()); + if (mark_table_.count(var->Name()) != 0) { + mark_table_[var->Name()]->second.insert(op); + return; + } + + auto* var_desc = FindVarDescInBlock(var); + auto var_shape = var_desc->GetShape(); + int batch_size = static_cast(var_shape[0]); + + NodeComparator compare_node; + Iter it = nodes_.begin(); + while (it != nodes_.end()) { + auto* cache_desc = FindVarDescInBlock(it->first); + int cache_batch_size = cache_desc->GetShape()[0]; + if ((cache_batch_size == -1 && batch_size == -1) || + (cache_batch_size != -1 && batch_size != -1)) { + if (compare_node(it->first, var)) { + ++it; + } else { + break; + } + } else if (cache_batch_size == -1 && batch_size != -1) { + ++it; + } else if (cache_batch_size != -1 && batch_size == -1) { + break; + } + } + + it = + nodes_.insert(it, std::make_pair(var, std::unordered_set{op})); + mark_table_[var->Name()] = it; +} + +int OrderedNodePairPool::GetIndex(ir::Node* var) { + return std::distance(nodes_.begin(), mark_table_[var->Name()]); +} + +ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const { + ir::Node* found_node = nullptr; + NodeComparator compare_node; + + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + if (compare_node(var, it->first)) { + found_node = it->first; + break; + } + } + return found_node; +} + +void OrderedNodePairPool::Erase(ir::Node* var) { + 
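+  // Editor's note: erase is O(1) -- mark_table_ maps the variable name
+  // straight to its position in the std::list, so no linear scan is needed.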
+  PADDLE_ENFORCE(mark_table_.count(var->Name()));
+  nodes_.erase(mark_table_[var->Name()]);
+  mark_table_.erase(var->Name());
+}
+
+std::string OrderedNodePairPool::ToString() const {
+  std::stringstream ss;
+  for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
+    ss << DebugString(it->first) << " ";
+  }
+  return ss.str();
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_reuse_types.h b/paddle/fluid/framework/details/memory_reuse_types.h
new file mode 100644
index 00000000000..9a9c1d948e8
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_reuse_types.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+constexpr char kFetchedVars[] = "fetched_vars";
+constexpr char kGraphNodePool[] = "graph_node_pool";
+
+// NOTE(dzh): A variable paired with the operators that last use it,
+// kept for the early delete pass.
+// Because the analysis var pass is built on ir::Node, which may be released
+// or modified between passes, OpDesc* is used to mark the ops instead.
+using GraphNodePool = std::vector<
+    std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
+
+// NOTE(dzh): by default, nodes are sorted in ascending order of byte size.
+// In fluid, -1 means the batch_size is determined at runtime.
+// Nodes whose batch_size equals -1 always rank ahead of nodes with a
+// fixed batch_size.
+// For example,
+// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
+// O(1) insert, delete
+class OrderedNodePairPool {
+ public:
+  using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
+  using Iter = typename std::list<NodePair>::iterator;
+  using ConstIter = typename std::list<NodePair>::const_iterator;
+
+  void Insert(ir::Node* var, ir::Node* op);
+
+  void Erase(ir::Node* var);
+
+  bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
+
+  ir::Node* NodeMatch(ir::Node* var) const;
+  // the map stores non-const iterators, so constness cannot be promised
+  int GetIndex(ir::Node* var);
+  // dump all nodes in the pool to a string
+  std::string ToString() const;
+
+  Iter begin() { return nodes_.begin(); }
+  Iter end() { return nodes_.end(); }
+  ConstIter begin() const { return nodes_.begin(); }
+  ConstIter end() const { return nodes_.end(); }
+  size_t size() const { return nodes_.size(); }
+
+ private:
+  // for searching.
+  std::unordered_map<std::string, Iter> mark_table_;
+  // node swap pairs.
var -> ops dep var + std::list nodes_; +}; + +// node memory size in bytes +size_t NodeSizeInBytes(ir::Node* n); + +std::string DebugString(ir::Node* var); + +// std::string DebugString(VarDesc* var); +VarDesc* FindVarDescInBlock(ir::Node* n); + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types_test.cc b/paddle/fluid/framework/details/memory_reuse_types_test.cc new file mode 100644 index 00000000000..d2fabf5ce06 --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types_test.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(OrderedNodePairPool, Normal) { + OrderedNodePairPool pool; + std::vector> nodes; + + // clang-format off + std::vector> shapes = {{-1, 10}, + {-1, 20}, + {1, 2}, + {5, 2}, + {10, 20}, + {-1, 2, 5}, + {-1, 1, 5}, + {-1, 1}}; + // clang-format on + const int COUNT = shapes.size(); + ProgramDesc prog; + BlockDesc* block_desc = prog.MutableBlock(0); + auto* op_desc = block_desc->AppendOp(); + op_desc->SetType("dummy"); + std::unique_ptr op = ir::CreateNodeForTest(op_desc); + + for (int i = 0; i < COUNT; ++i) { + auto desc = block_desc->Var(std::to_string(i)); + desc->SetShape(shapes[i]); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + + for (auto& node : nodes) { + pool.Insert(node.get(), op.get()); + } + + // assert its order and interface. 
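+  // Editor's note: with float32 elements the pool order built above is
+  //   [-1,1] [-1,1,5] [-1,10] [-1,2,5] [-1,20] | [1,2] [5,2] [10,20]
+  // i.e. dynamic-batch nodes precede fixed-shape ones, each group ascending
+  // by byte size. Erasing the first created node ({-1,10}) below preserves
+  // that order, which the GetIndex/NodeMatch assertions then verify.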
+ std::cout << pool.ToString() << std::endl; + pool.Erase(nodes.front().get()); + std::cout << pool.ToString() << std::endl; + + ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); + ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); + + { + auto v1 = block_desc->Var("11"); + v1->SetShape({-1, 256, 56, 56}); + std::unique_ptr node1 = ir::CreateNodeForTest(v1); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(cache, nullptr); + } + { + auto v2 = block_desc->Var("12"); + v2->SetShape({-1, 2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v2); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] + } + { + auto v3 = block_desc->Var("13"); + v3->SetShape({2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v3); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 8f92f0948d7..c2030738453 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -85,4 +85,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, } // namespace paddle REGISTER_PASS(multi_devices_print_pass, - paddle::framework::details::SSAGraghBuilderWithPrinter); + paddle::framework::details::SSAGraghBuilderWithPrinter) + .RequirePassAttr(paddle::framework::details::kGraphvizPath); diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h index c00685fa162..b06c87a5c18 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,8 @@ namespace paddle { namespace framework { namespace details { +constexpr char kGraphvizPath[] = "debug_graphviz_path"; + class SSAGraphPrinter { public: virtual ~SSAGraphPrinter() {} @@ -40,7 +43,7 @@ class SSAGraghBuilderWithPrinter : public ir::Pass { std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { std::unique_ptr fout( - new std::ofstream(Get("debug_graphviz_path"))); + new std::ofstream(Get(kGraphvizPath))); PADDLE_ENFORCE(fout->good()); Get("graph_printer").Print(*graph, *fout); return graph; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index ba12ca3c61c..b1a82e8771b 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -25,7 +25,7 @@ namespace paddle { namespace framework { namespace details { -constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; +constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@"; // Wraps ir::Node and provide helper utilities. // It's responsible for populating necessary fields of ir::Node. diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8679118fe28..8670dcfed7e 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -162,7 +162,10 @@ void Graph::ResolveHazard( (*it_new)->inputs.empty() ? 
nullptr : (*it_new)->inputs[0]; const auto &read_ops = (*it_old)->outputs; - PADDLE_ENFORCE(write_op, "The write_op should not be empty."); + PADDLE_ENFORCE( + write_op, + string::Sprintf("The write_op of var %s should not be empty.", + (*it_new)->Name())); // Add write after write dependence ir::Node *upstream_op = diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index d2d28793c43..d99f856d8f4 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include DEFINE_string(print_sub_graph_dir, "", @@ -121,7 +122,7 @@ std::map> BuildOperationAdjList( } size_t GraphNum(const Graph &graph) { - std::unordered_set nodes = graph.Nodes(); + std::unordered_set nodes(graph.Nodes()); std::unordered_set visited_nodes; visited_nodes.reserve(nodes.size()); std::deque q_nodes; diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 8d92c406689..be525151f9f 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { + // Test if the graph contains circle. bool HasCircle(const Graph &graph); diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index eac67108e21..45d81b93739 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -30,6 +30,14 @@ std::unique_ptr CreateNodeForTest(const std::string &name, return std::unique_ptr(new Node(name, type)); } +std::unique_ptr CreateNodeForTest(VarDesc *var_desc) { + return std::unique_ptr(new Node(var_desc)); +} + +std::unique_ptr CreateNodeForTest(OpDesc *op_desc) { + return std::unique_ptr(new Node(op_desc)); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f19..89dcc677b57 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/macros.h" @@ -125,6 +124,8 @@ class Node { friend class Graph; friend std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); + friend std::unique_ptr CreateNodeForTest(VarDesc* var_desc); + friend std::unique_ptr CreateNodeForTest(OpDesc* op_desc); explicit Node(const std::string& name, Type type) : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} @@ -152,7 +153,9 @@ class Node { std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); +std::unique_ptr CreateNodeForTest(VarDesc* var_desc); +std::unique_ptr CreateNodeForTest(OpDesc* op_desc); } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index eb4baa06b52..7e3fe02eaf5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/parallel_executor.h" +#include #include #include #include @@ -93,6 +94,7 @@ class ParallelExecutorPrivate { } } + BuildStrategy build_strategy_; std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -169,6 +171,14 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; + + if (build_strategy_.memory_early_delete_) { + auto early_delete_pass = + ir::PassRegistry::Instance().Get("memory_early_delete_pass"); + early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + graph = early_delete_pass->Apply(std::move(graph)); + } + VLOG(10) << "MemoryEarlyDeletePass Applied."; } return graph; @@ -189,6 +199,7 @@ ParallelExecutor::ParallelExecutor( : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; + member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; @@ -245,7 +256,6 @@ ParallelExecutor::ParallelExecutor( build_strategy.Apply(main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_); #endif - auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { graph = member_->PrepareGCAndRefCnts(std::move(graph), @@ -280,10 +290,12 @@ ParallelExecutor::ParallelExecutor( if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graph))); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graph))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -423,5 +435,6 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle +USE_PASS(memory_early_delete_pass); USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index a0a9a573603..83dea863901 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -74,6 +74,22 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), platform::CPUPlace()); EXPECT_EQ(p1, p2); + + float* p3 = nullptr; + float* p4 = nullptr; + // set src_tensor a different type but smaller size. + // memory block is supposed to be unchanged. + auto* tmp = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); + p3 = reinterpret_cast(tmp); + EXPECT_EQ(p1, p3); + + // set src_tensor a different type but bigger size. + // memory block is supposed to be changed. + auto* tmp2 = src_tensor.mutable_data( + framework::make_ddim({2, 2, 3}), platform::CPUPlace()); + p4 = reinterpret_cast(tmp2); + EXPECT_NE(p1, p4); } // Not sure if it's desired, but currently, Tensor type can be changed. 
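+  // Editor's note on the two checks added above: requesting a smaller buffer
+  // of a different element type keeps the existing memory block (p1 == p3),
+  // while requesting a larger one forces a reallocation (p1 != p4) --
+  // presumably the invariant the memory-reuse passes in this patch rely on
+  // when rebinding variables to cached blocks.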
{ diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 017598e1707..737ae2dd9c3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -960,6 +960,14 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") + .def_property( + "memory_optimize", + [](const BuildStrategy &self) { return self.memory_optimize_; }, + [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) + .def_property( + "memory_early_delete", + [](const BuildStrategy &self) { return self.memory_early_delete_; }, + [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2dea71d7af9..b00510d4438 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -150,7 +150,7 @@ def __bootstrap__(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', 'selected_gpus' + 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus' ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 86f861674c2..e2a9fc183ea 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -39,6 +39,7 @@ class TestParallelExecutorBase(unittest.TestCase): seed=None, use_parallel_executor=True, use_reduce=False, + use_ir_memory_optimize=False, fuse_elewise_add_act_ops=False, optimizer=fluid.optimizer.Adam, use_fast_executor=False, @@ -82,6 +83,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.memory_optimize = use_ir_memory_optimize build_strategy.enable_sequential_execution = enable_sequential_execution if use_cuda and core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py new file mode 100644 index 00000000000..6ca65c5d3b6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py @@ -0,0 +1,123 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
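+
+# Editor's note (not part of the original patch): this test compares losses
+# with the IR memory-optimize pass disabled and enabled. A minimal sketch of
+# the user-facing toggles, assuming the pybind properties added above:
+#
+#   build_strategy = fluid.BuildStrategy()
+#   build_strategy.memory_optimize = True       # runs analysis_var_pass
+#   build_strategy.memory_early_delete = True   # runs memory_early_delete_pass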
+ +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest +import os + +MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" + + +def _feed_data_helper(use_feed): + if use_feed: + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + else: + reader = fluid.layers.open_files( + filenames=[MNIST_RECORDIO_FILE], + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + reader = fluid.layers.io.double_buffer(reader) + img, label = fluid.layers.read_file(reader) + return img, label + + +def simple_fc_net(use_feed): + x, y = _feed_data_helper(use_feed) + hidden_layer = 4 + for _ in range(hidden_layer): + x = fluid.layers.fc(input=x, size=20, act='relu') + y_predict = fluid.layers.fc(input=x, size=10, act='softmax') + cost = fluid.layers.cross_entropy(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + return avg_cost + + +def fc_with_inplace_net(use_feed): + x, y = _feed_data_helper(use_feed) + fc = fluid.layers.fc(input=x, size=20, act='relu') + fc = fluid.layers.fc(input=fc, size=10, act='relu') + reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5]) + reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2]) + y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax') + cost = fluid.layers.cross_entropy(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + return avg_cost + + +class TestMNIST(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + # Convert mnist to recordio file + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(mnist.train(), batch_size=4) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file( + MNIST_RECORDIO_FILE, reader, feeder) + + def _dummy_data(self): + np.random.seed(5) + img = np.random.random(size=[32, 784]).astype(np.float32) + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + def _compare_ir_and_python_memory_optimize(self, model, use_cuda): + if use_cuda and not core.is_compiled_with_cuda(): + return + + img, label = self._dummy_data() + first_loss0, last_loss0 = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + memory_opt=False, + use_ir_memory_optimize=False) + first_loss1, last_loss1 = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + memory_opt=False, + use_ir_memory_optimize=True) + for loss in zip(first_loss0, first_loss1): + self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + for loss in zip(last_loss0, last_loss1): + self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + + def test_simple_fc_net(self): + self._compare_ir_and_python_memory_optimize(simple_fc_net, False) + self._compare_ir_and_python_memory_optimize(simple_fc_net, True) + + def test_fc_with_reshape_net(self): + self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, False) + self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, True) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 5a7d04ed194..7b530ba6174 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -43,6 +43,7 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), ("conditional_block", "conditional_block_grad")] PRINT_LOG = False +FLAGS_memory_optimize = "" class OrderedSet(MutableSet): @@ -121,6 +122,7 @@ class ControlFlowGraph(object): self._defs = defaultdict(OrderedSet) self._live_in = defaultdict(OrderedSet) self._live_out = defaultdict(OrderedSet) + self._skip_opt = skip_opt self.pool = [] @@ -144,7 +146,6 @@ class ControlFlowGraph(object): for i in range(self.op_size): self._uses[i].update(self._ops[i].input_arg_names()) self._defs[i].update(self._ops[i].output_arg_names()) - self._live_in[i] = self._uses[i] def _update_graph(self, old_name, new_name, begin_idx=0): for i in range(begin_idx, self.op_size): @@ -177,20 +178,52 @@ class ControlFlowGraph(object): worklist.append(d) def _fill_pool(self, i, is_forward): + def comparator(x, cache): + x_shape = x[1] + cache_shape = cache[1] + x_size = abs(reduce(lambda x, y: x * y, x_shape)) + cache_size = abs(reduce(lambda x, y: x * y, cache_shape)) + if (x_shape[0] == -1 and cache_shape[0] == -1) or \ + (x_shape[0] != -1 and cache_shape[0] != -1) : + return x_size <= cache_size + else: + return False + + def find_var_in_block(x): + known_vars = set() + for op in self._ops: + known_vars.update(op.output_arg_names()) + return x in known_vars + block_desc = self._ops[i].block() in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) # NOTE: must sort the in_diff set for cases that get different cache var. # FIXME(typhoonzero): maybe use a "sorted set" is better than this. can_optimize = [ - x for x in in_diff + x for x in sorted(in_diff) if self._check_var_validity(block_desc, x, is_forward) ] if can_optimize: for var_name in can_optimize: cache = (var_name, self._find_var(block_desc, var_name, is_forward).shape()) - if cache not in self.pool: - self.pool.append(cache) + if cache not in self.pool and find_var_in_block(var_name): + i = 0 + while i < len(self.pool): + mycache = self.pool[i] + mysize = mycache[1][0] + cache_size = cache[1][0] + if (mysize == -1 and cache_size == -1) or \ + (mysize != -1 and cache_size != -1): + if comparator(mycache, cache): + i += 1 + else: + break + elif mysize == -1 and cache_size != -1: + i += 1 + elif mysize != -1 and cache_size == -1: + break + self.pool.insert(i, cache) def _get_diff(self, a, b): u = a & b @@ -229,7 +262,7 @@ class ControlFlowGraph(object): def _update_skip_opt_set(self): for i in range(self.op_size): op = self._ops[i] - if op.type() == "fill_constant" and op.attr("force_cpu") == True: + if op.has_attr("force_cpu") and op.attr("force_cpu") == True: self._skip_opt.update(op.output_arg_names()) def release_memory(self, skip_opt_set=None): @@ -281,6 +314,7 @@ class ControlFlowGraph(object): # update skip set to meet users' demand if skip_opt_set: self._skip_opt.update(skip_opt_set) + counter = 0 for i in range(self.op_size): op = self._ops[i] if op.type() in SUB_BLOCK_OPS: @@ -301,6 +335,9 @@ class ControlFlowGraph(object): # If x is both in uses and defs, it can not be optimized! 
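+                # Editor's note: _fill_pool above keeps self.pool ordered --
+                # caches with a -1 batch dimension first, then fixed-batch
+                # caches, each group ascending by size -- so the scan below
+                # returns the smallest cache var that can hold x.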
if x in self._uses[i]: continue + if x == FLAGS_memory_optimize: + print("start match var ", x, " of op ", op.type()) + print(self.pool) for index, cache_pair in enumerate(self.pool): cache_var = cache_pair[0] cache_shape = cache_pair[1] @@ -323,15 +360,13 @@ class ControlFlowGraph(object): if not compare_shape(x_shape, cache_shape, level): continue # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype] - if x_dtype != cache_dtype: - continue - if PRINT_LOG: - print(("Hit Cache !!!! cache pool index " - "is %d, var name is %s, " - "cached var name is %s, " - "var shape is %s ") % (index, x, cache_var, - str(cache_shape))) + print( + ("!!! %d, %s => %s, cache idx %d, pool size %d" + % (counter, x + str(x_shape), + cache_var + str(cache_shape), index, + len(self.pool)))) + counter += 1 self.pool.pop(index) # Rename the var to the cache var already with # memory allocated in order to reuse the memory. -- GitLab From 001891aea69a96a725c8026a82d5a7dd45ed558f Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 13:15:54 +0800 Subject: [PATCH 0315/2367] fix code style test=develop --- python/paddle/fluid/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index fd788d09296..ecf75f72822 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -104,7 +104,8 @@ def __bootstrap__(): import platform if os.name == 'nt': - third_lib_path = os.path.abspath(os.path.dirname(__file__)) + os.sep + '..' + os.sep + 'libs' + third_lib_path = os.path.abspath(os.path.dirname( + __file__)) + os.sep + '..' + os.sep + 'libs' os.environ['path'] += ';' + third_lib_path sys.path.append(third_lib_path) -- GitLab From b601f2de8d3d1a336810438c521714749f8a19a6 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 13:38:08 +0800 Subject: [PATCH 0316/2367] include the mkl fix only test=develop --- CMakeLists.txt | 4 +- cmake/cuda.cmake | 3 - cmake/cudnn.cmake | 1 - cmake/operators.cmake | 2 +- cmake/simd.cmake | 73 ++++++++++--------- paddle/fluid/framework/CMakeLists.txt | 3 +- paddle/fluid/framework/mixed_vector.h | 10 +-- paddle/fluid/framework/op_registry.h | 3 +- .../fluid/memory/detail/system_allocator.cc | 1 + paddle/fluid/operators/CMakeLists.txt | 7 +- paddle/fluid/operators/cum_op.h | 2 - .../elementwise/elementwise_mul_mkldnn_op.cc | 3 - .../operators/math/detail/lstm_cpu_kernel.h | 6 -- paddle/fluid/operators/math/jit_gen.h | 3 - paddle/fluid/platform/dynload/CMakeLists.txt | 2 + paddle/fluid/platform/dynload/cudnn.cc | 4 - .../fluid/platform/dynload/dynamic_loader.cc | 16 ---- python/setup.py.in | 9 +-- 18 files changed, 61 insertions(+), 91 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index efdb451f659..aa9446a694d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,10 +203,10 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream -include(external/warpctc) # download, build, install warpctc if (NOT WIN32) -# there is no official support of nccl, cupti in windows +# there is no official support of warpctc, nccl, cupti in windows +include(external/warpctc) # download, build, install warpctc include(cupti) include(external/gzstream) endif (NOT WIN32) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 5be7be64137..414e92eb27f 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -139,12 +139,10 @@ endfunction() 
message(STATUS "CUDA detected: " ${CUDA_VERSION}) if (${CUDA_VERSION} LESS 7.0) set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) - add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") @@ -152,7 +150,6 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x # CUDA 8 may complain that sm_20 is no longer supported. Suppress the # warning for now. list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") - add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") endif() include_directories(${CUDA_INCLUDE_DIRS}) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 96a9917e762..09bec347dbd 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -89,7 +89,6 @@ if(CUDNN_FOUND) if(NOT CUDNN_MAJOR_VERSION) set(CUDNN_VERSION "???") else() - add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 70d159b4f35..2ced43f9e6c 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,7 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 566dc75fda0..86096d4feaa 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,43 +57,46 @@ int main() return 0; }" SSE3_FOUND) -# Check AVX -set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; -}" AVX_FOUND) +# disable AVX by default on windows +if(NOT WIN32) + # Check AVX + set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) + set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; + }" AVX_FOUND) -# Check AVX 2 -set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) -set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; -}" AVX2_FOUND) + # Check AVX 2 + set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) + set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; + }" AVX2_FOUND) -# Check AVX512F -set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) 
-set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; -}" AVX512F_FOUND) + # Check AVX512F + set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) + set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; + }" AVX512F_FOUND) +endif(NOT WIN32) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 90083f690fe..225dfb3e700 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -15,7 +15,8 @@ function(windows_symbolic TARGET) file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) add_custom_command(OUTPUT ${final_path}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cc" "${final_path}/.${src}.cu" COMMENT "create hidden file of ${src}.cu") add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index c3a044d22cf..6940250c3f9 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable paddle::memory::AllocationPtr gpu_; + mutable memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 2c1648c81fc..6d39bb3c524 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,8 +23,7 @@ limitations under the License. 
*/ #include #include -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" // For VLOG() +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 307c3488223..3e8fb83e9d5 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -17,6 +17,7 @@ limitations under the License. */ #ifdef _WIN32 #include +#include // VirtualLock/VirtualUnlock #else #include // for mlock and munlock #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 95ad67e33e2..257bfc0a3f9 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -44,8 +44,9 @@ endif() register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) + # warpctc_op needs cudnn 7 above -if (WITH_GPU) +if (WITH_GPU AND NOT WIN32) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() @@ -63,7 +64,9 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) +if (NOT WIN32) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) +endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 7c0fda4169b..999fdcff907 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index bf9aef91350..c600d1e3d76 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,9 +19,6 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" -#if defined(_WIN32) && defined(_WINSOCKAPI_) -#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ -#endif #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index 2e3779ff084..ccbd05c82ad 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,12 +17,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" -#if defined(_WIN32) -#if defined(__AVX2__) || defined(__AVX__) -inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } -#endif -#endif - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h index 2bc740e5983..6abf3434cc8 100644 --- a/paddle/fluid/operators/math/jit_gen.h +++ b/paddle/fluid/operators/math/jit_gen.h @@ -18,9 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/platform/macros.h" -#if defined(_WIN32) && defined(_WINSOCKAPI_) -#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ -#endif #define XBYAK_USE_MMAP_ALLOCATOR #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 07159d4a12e..5939c500c94 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -16,7 +16,9 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) +if (NOT WIN32) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +endif(NOT WIN32) if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 91d9a1ef013..f3cd3b2bbed 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -38,10 +38,6 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif -#ifdef CUDNN_DNN_ROUTINE_EACH_R6 -CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); -#endif - #ifdef CUDNN_DNN_ROUTINE_EACH_R7 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 15d51683665..cc5cda6106c 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -53,12 +53,6 @@ namespace platform { namespace dynload { static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; -#endif - static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -171,8 +165,6 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -181,8 +173,6 @@ void* GetCublasDsoHandle() { void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, 
"libcudnn.so", false); #endif @@ -203,8 +193,6 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif @@ -213,8 +201,6 @@ void* GetCurandDsoHandle() { void* GetWarpCTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so"); #endif @@ -239,8 +225,6 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); #endif diff --git a/python/setup.py.in b/python/setup.py.in index f4613dd72de..ff3aca5714c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -158,11 +158,10 @@ if '${WITH_FLUID_ONLY}'== 'OFF': # put all thirdparty libraries in paddle.libs libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' - -package_data['paddle.libs']= [] -package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name] -shutil.copy('${WARPCTC_LIBRARIES}', libs_path) - +if os.name != 'nt': + package_data['paddle.libs']= [] + package_data['paddle.libs']=['libwarpctc' + ext_name] + shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) -- GitLab From 41790f13662a8a86fe5b6f4e3cee7a35703230a8 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 14:04:40 +0800 Subject: [PATCH 0317/2367] add ut about nce --- .../unittests/test_nce_remote_table_op.py | 152 ++++-------------- 1 file changed, 33 insertions(+), 119 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py index f08b270d89b..e87545cb9c6 100644 --- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -88,158 +88,73 @@ class TestListenAndServOp(unittest.TestCase): port = int(f.read().strip()) return port - def _run_nce_op_one_pserver(self, place, port): + def _run_nce_op_two_pserver(self, place, port0, port1): scope = fluid.core.Scope() program = Program() with fluid.scope_guard(scope): with program_guard(program, startup_program=Program()): - x = scope.var('X').get_tensor() + x = scope.var('Input').get_tensor() x_array = np.random.random((4, 8)).astype("float32") * 2 x.set(x_array, place) # create and initialize Param Variable - param = scope.var('W').get_tensor() + param = scope.var('Weight').get_tensor() param_array = np.zeros((5, 8)).astype("float32") * 2 param.set(param_array, place) - path_table = scope.var('PathTable').get_tensor() - path_table_array = np.array( - [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), - (0, 2, -1, -1, -1)]).astype( - "int64" - ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) - path_table.set(path_table_array, place) - - path_code = 
scope.var('PathCode').get_tensor() - path_code_array = np.array( - [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), - (0, 1, -1, -1, -1)]).astype("int64") #np.array to store - path_code.set(path_code_array, place) - - label = scope.var('Label').get_tensor() - label_array = np.array([0, 1, 4, 5]) - label.set(label_array, place) - bias = scope.var('Bias').get_tensor() bias_array = np.random.random((5, 1)).astype("float32") bias.set(bias_array, place) - out = scope.var('Out').get_tensor() - - pre_out = scope.var('PreOut').get_tensor - - w_out = scope.var('W_Out').get_tensor() - w_out.set(param_array, place) - - emaps = ['127.0.0.1:' + str(port)] - table_names = ['table'] - height_sections = [2] - - # create and run sgd operator - hsigmoid_op = Operator( - "hierarchical_sigmoid", - X='X', - W='W', - PathTable='PathTable', - PathCode='PathCode', - Label='Label', - Bias='Bias', - Out='Out', - PreOut='PreOut', - W_Out='W_Out', - remote_prefetch=True, - epmap=emaps, - table_names=table_names, - height_sections=height_sections) - - hsigmoid_op.run(scope, place) - - # get and compare result - result_array = np.array(w_out) - self.assertEqual(list(result_array.shape), [5, 8]) - correct = None - for i in range(5): - if i != 3: - correct = np.full((1, 8), i + 1).astype("float32") - self.assertTrue((result_array[i] == correct).all()) - else: - correct = np.full((1, 8), 0).astype("float32") - self.assertTrue((result_array[i] == correct).all()) - - def _run_nce_op_two_pserver(self, place, port0, port1): - scope = fluid.core.Scope() - program = Program() - with fluid.scope_guard(scope): - with program_guard(program, startup_program=Program()): - x = scope.var('X').get_tensor() - x_array = np.random.random((4, 8)).astype("float32") * 2 - x.set(x_array, place) - # create and initialize Param Variable - param = scope.var('W').get_tensor() - param_array = np.zeros((5, 8)).astype("float32") * 2 - param.set(param_array, place) - - path_table = scope.var('PathTable').get_tensor() - path_table_array = np.array( - [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), - (0, 2, -1, -1, -1)]).astype( - "int64" - ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) - path_table.set(path_table_array, place) - - path_code = scope.var('PathCode').get_tensor() - path_code_array = np.array( - [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), - (0, 1, -1, -1, -1)]).astype("int64") #np.array to store - path_code.set(path_code_array, place) + sample_w = scope.var('SampleWeight').get_tensor() + sample_weight = np.random.random((4, 1)).astype("float32") + sample_w.set(sample_weight, place) label = scope.var('Label').get_tensor() label_array = np.array([0, 1, 4, 5]) label.set(label_array, place) - bias = scope.var('Bias').get_tensor() - bias_array = np.random.random((5, 1)).astype("float32") - bias.set(bias_array, place) + cost = scope.var('Cost').get_tensor() + cost_w = np.zeros((4, 1)).astype("float32") + cost.set(cost_w, place) - out = scope.var('Out').get_tensor() + sample_l = scope.var('SampleLogits').get_tensor() + sample_l_w = np.zeros((4, 3)).astype("float32") + sample_l.set(sample_l_w, place) - pre_out = scope.var('PreOut').get_tensor - - w_out = scope.var('W_Out').get_tensor() - w_out.set(param_array, place) + sample_la = scope.var('SampleLabels').get_tensor() + sample_la_w = np.zeros((4, 3)).astype("float32") + sample_la.set(sample_la_w, place) emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] table_names = ['table', 'table'] height_sections = [2, 3] - # create and run sgd 
operator - hsigmoid_op = Operator( - "hierarchical_sigmoid", - X='X', - W='W', - PathTable='PathTable', - PathCode='PathCode', + # create and run nce operator + nce_op = Operator( + "nce", + Input='Input', + Weight='Weight', Label='Label', Bias='Bias', - Out='Out', - PreOut='PreOut', - W_Out='W_Out', + Cost='Cost', + SampleLogits='SampleLogits', + SampleLabels='SampleLabels', + num_total_classes=5, + num_neg_samples=2, + sampler=0, + seed=1, + is_sparse=True, remote_prefetch=True, epmap=emaps, table_names=table_names, height_sections=height_sections) - hsigmoid_op.run(scope, place) + + nce_op.run(scope, place) # get and compare result - result_array = np.array(w_out) - self.assertEqual(list(result_array.shape), [5, 8]) - correct = None - for i in range(5): - if i < 2: - correct = np.full((1, 8), i + 1).astype("float32") - self.assertTrue((result_array[i] == correct).all()) - else: - correct = np.full((1, 8), i + 9).astype("float32") - self.assertTrue((result_array[i] == correct).all()) + o_cost = np.array(cost_w) + o_logits = np.array(sample_l) + o_labels = np.array(sample_la) def test_nce_op_remote(self): os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" @@ -257,7 +172,6 @@ class TestListenAndServOp(unittest.TestCase): places.append(core.CUDAPlace(0)) for place in places: - self._run_nce_op_one_pserver(place, port0) self._run_nce_op_two_pserver(place, port0, port1) # raise SIGTERM to pserver -- GitLab From a3fa3f85d7bd4fb948b0401d77d5c60498e5a329 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 18 Dec 2018 15:04:26 +0800 Subject: [PATCH 0318/2367] Polish code test=develop --- paddle/fluid/platform/enforce.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 3c03a902796..d1dd09f2064 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -260,12 +260,12 @@ inline void throw_on_error(T e) { #define PADDLE_JUDGE -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(cond))) { \ - ::paddle::platform::throw_on_error(cond, ##__VA_ARGS__); \ - } \ +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ + } \ } while (0) #ifndef REPLACE_ENFORCE_GLOG -- GitLab From 69642000dc3a83b3dad5a33052da1eff1f450b6d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 18 Dec 2018 15:09:01 +0800 Subject: [PATCH 0319/2367] Hide KeyHasher test=develop --- paddle/fluid/framework/scope.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 9a715ac9b95..797d1101593 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -38,14 +38,6 @@ bool IsFastEagerDeletionModeEnabled(); class Scope; -namespace inner { -struct KeyHasher { - std::size_t operator()(const std::string& key) const { - return XXH32(key.c_str(), key.size(), 1); - } -}; -} // namespace inner - /** * @brief Scope that manage all variables. 
* @@ -110,8 +102,13 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map, - inner::KeyHasher> + struct KeyHasher { + std::size_t operator()(const std::string& key) const { + return XXH32(key.c_str(), key.size(), 1); + } + }; + + mutable std::unordered_map, KeyHasher> vars_; private: -- GitLab From 70981f5d799b5ab1593743b6ec88af6c40698a3b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 15:30:23 +0800 Subject: [PATCH 0320/2367] clean test=develop --- paddle/fluid/framework/operator.cc | 36 ++++++++++++------------------ paddle/fluid/framework/operator.h | 16 ++++++------- paddle/fluid/operators/prelu_op.cc | 2 +- 3 files changed, 23 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 807667e6846..7d5a6198a03 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -143,14 +143,12 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, for (auto& var_name_item : innames) { std::vector& input_vars = inputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { - LOG(ERROR) << "first in " << var_name_item.first << ":" << var_name; input_vars.push_back(scope.FindVar(var_name)); } } for (auto& var_name_item : outnames) { std::vector& output_vars = outputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { - LOG(ERROR) << "first out " << var_name_item.first << ":" << var_name; output_vars.push_back(scope.FindVar(var_name)); } } @@ -441,22 +439,13 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { return it->second.empty() ? nullptr : it->second[0]; } -Variable* ExecutionContext::OutputVar(const std::string& name) const { - auto opt = op_.Output(name); - return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); -} - -const Variable* ExecutionContext::FastInputVar(const std::string& name) const { - auto it = ctx_.inputs.find(name); - if (it == ctx_.inputs.end()) return nullptr; - - PADDLE_ENFORCE_LE(it->second.size(), 1UL, - "Operator %s's input %s should contain only one variable.", - op_.Type(), name); - return it->second.empty() ? nullptr : it->second[0]; +const Variable* ExecutionContext::LegacyInputVar( + const std::string& name) const { + auto ipt = op_.Input(name); + return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); } -Variable* ExecutionContext::FastOutputVar(const std::string& name) const { +Variable* ExecutionContext::OutputVar(const std::string& name) const { auto it = ctx_.outputs.find(name); if (it == ctx_.outputs.end()) return nullptr; @@ -466,15 +455,20 @@ Variable* ExecutionContext::FastOutputVar(const std::string& name) const { return it->second.empty() ? nullptr : it->second[0]; } +Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? 
nullptr : scope_.FindVar(opt); +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { return Input(name); } template <> -const Tensor* ExecutionContext::FastInput( +const Tensor* ExecutionContext::LegacyInput( const std::string& name) const { - return FastInput(name); + return LegacyInput(name); } template <> @@ -502,8 +496,8 @@ Tensor* ExecutionContext::Output(const std::string& name) const { } template <> -Tensor* ExecutionContext::FastOutput(const std::string& name) const { - return FastOutput(name); +Tensor* ExecutionContext::LegacyOutput(const std::string& name) const { + return LegacyOutput(name); } template <> @@ -870,7 +864,6 @@ Scope* OperatorWithKernel::PrepareData( auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); input_vars[i] = var; - LOG(ERROR) << "second in " << var_name_item.first << ":" << var_name; // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { @@ -931,7 +924,6 @@ Scope* OperatorWithKernel::PrepareData( for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; output_vars[i] = scope.FindVar(var_name); - LOG(ERROR) << "second out " << var_name_item.first << ":" << var_name; } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0aad91dbeef..39190d07b4c 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -233,20 +233,20 @@ class ExecutionContext { } template - const T* FastInput(const std::string& name) const { - auto* var = FastInputVar(name); + const T* LegacyInput(const std::string& name) const { + auto* var = LegacyInputVar(name); return var == nullptr ? nullptr : &var->Get(); } template - T* FastOutput(const std::string& name) const { - auto var = FastOutputVar(name); + T* LegacyOutput(const std::string& name) const { + auto var = LegacyOutputVar(name); return var == nullptr ? 
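// (Editor's note: the Legacy* accessors resolve variables by name through
// the Scope, whereas the non-legacy InputVar/OutputVar above read the
// Variable* pointers that were pre-resolved into RuntimeContext.)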
nullptr : var->GetMutable(); } - const Variable* FastInputVar(const std::string& name) const; + const Variable* LegacyInputVar(const std::string& name) const; - Variable* FastOutputVar(const std::string& name) const; + Variable* LegacyOutputVar(const std::string& name) const; template const std::vector MultiInput(const std::string& name) const { @@ -314,7 +314,7 @@ template <> const Tensor* ExecutionContext::Input(const std::string& name) const; template <> -const Tensor* ExecutionContext::FastInput( +const Tensor* ExecutionContext::LegacyInput( const std::string& name) const; template <> @@ -325,7 +325,7 @@ template <> Tensor* ExecutionContext::Output(const std::string& name) const; template <> -Tensor* ExecutionContext::FastOutput(const std::string& name) const; +Tensor* ExecutionContext::LegacyOutput(const std::string& name) const; template <> std::vector ExecutionContext::MultiOutput( diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index b6155ed3dd4..62c55c4f557 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -56,7 +56,7 @@ class PReluOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.FastInput("X")->type(), + return framework::OpKernelType(ctx.Input("X")->type(), ctx.device_context()); } }; -- GitLab From fa135bbf525fba34e16f9c1e80a35382c5b1c983 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 15:38:33 +0800 Subject: [PATCH 0321/2367] Fix the mkl build script on windows test=develop --- cmake/external/mklml.cmake | 58 +++++++++---------- .../elementwise/elementwise_mul_mkldnn_op.cc | 3 + .../operators/math/detail/lstm_cpu_kernel.h | 6 ++ paddle/fluid/operators/math/jit_gen.h | 3 + python/CMakeLists.txt | 17 +----- python/setup.py.in | 6 ++ 6 files changed, 48 insertions(+), 45 deletions(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 3da552e3190..505f8b38344 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -44,40 +44,36 @@ else() endif() SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") -if(WIN32) - MESSAGE(WARNING - "Please download the MKLML and and put it at " ${THIRD_PARTY_PATH}/install/mklml) -else() - SET(MKLML_PROJECT "extern_mklml") - IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) +IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) + MESSAGE(STATUS "use pre defined download url") + if(WIN32) + SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE) + SET(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.17/${MKLML_VER}.zip" CACHE STRING "" FORCE) + else() + SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) ENDIF() - MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") - SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") - SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") - - FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(MKLML)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n" - " DESTINATION 
${MKLML_DST_DIR})\n") - - ExternalProject_Add( - ${MKLML_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${MKLML_SOURCE_DIR} - DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz - && tar zxf ${MKLML_VER}.tgz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} - ) endif() +SET(MKLML_PROJECT "extern_mklml") +MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") +SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") + +ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_SOURCE_DIR} + URL ${MKLML_URL} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} && + ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR} +) INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index c600d1e3d76..bf9aef91350 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" +#if defined(_WIN32) && defined(_WINSOCKAPI_) +#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ +#endif #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index ccbd05c82ad..2e3779ff084 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,6 +17,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" +#if defined(_WIN32) +#if defined(__AVX2__) || defined(__AVX__) +inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } +#endif +#endif + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h index 6abf3434cc8..2bc740e5983 100644 --- a/paddle/fluid/operators/math/jit_gen.h +++ b/paddle/fluid/operators/math/jit_gen.h @@ -18,6 +18,9 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/macros.h" +#if defined(_WIN32) && defined(_WINSOCKAPI_) +#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ +#endif #define XBYAK_USE_MMAP_ALLOCATOR #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 078d543ba2d..72c0d03e522 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -49,23 +49,12 @@ IF(WIN32) # Python would use the .pyd by default under Windows series platform set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/) set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd) - if(NOT WITH_MKLDNN) - get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) - add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} - DEPENDS paddle_pybind) - else(NOT WITH_MKLDNN) - add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) - endif(NOT WITH_MKLDNN) ELSE() set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) - add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) ENDIF() +add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS paddle_pybind) add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) IF(WIN32) diff --git a/python/setup.py.in b/python/setup.py.in index ff3aca5714c..8973d883e44 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -166,6 +166,12 @@ if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name] +else: + # copy the openblas.dll + if os.name == 'nt': + shutil.copy('${CBLAS_LIBRARIES}', libs_path) + package_data['paddle.libs']+=['openblas' + ext_name] + if '${WITH_MKLDNN}' == 'ON': if '${CMAKE_BUILD_TYPE}' == 'Release': if os.name != 'nt': -- GitLab From f897bd16c0e4deb683075e137e7bfe5890488205 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 15:40:23 +0800 Subject: [PATCH 0322/2367] clean test=develop --- paddle/fluid/framework/operator.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7d5a6198a03..8c83748668e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -812,6 +812,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); this->InferShape(&infer_shape_ctx); + // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext + // not Scope. Imperative mode only pass inputs and get outputs. 
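  // A minimal sketch (editor's illustration, not part of this patch) of what
  // the cached RuntimeContext enables inside a kernel; Tensor, "X" and "Out"
  // are placeholder names here:
  //
  //   void Compute(const ExecutionContext& ctx) const override {
  //     auto* x = ctx.Input<Tensor>("X");      // no by-name Scope lookup
  //     auto* out = ctx.Output<Tensor>("Out"); // resolved via RuntimeContext
  //     // ... read x, write out ...
  //   }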
  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx));

   if (!transfered_inplace_vars.empty()) {
@@ -919,13 +921,6 @@ Scope* OperatorWithKernel::PrepareData(
       SetTensorToVariable(*var, out, trans_var);
     }
   }
-  for (auto& var_name_item : Outputs()) {
-    std::vector<Variable*>& output_vars = ctx->outputs[var_name_item.first];
-    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
-      auto& var_name = var_name_item.second[i];
-      output_vars[i] = scope.FindVar(var_name);
-    }
-  }

   return new_scope;
 }
--
GitLab


From 17fb3253c30f4ebeb8f6058a6e770344e52a0fad Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Tue, 18 Dec 2018 15:41:54 +0800
Subject: [PATCH 0323/2367] keep the mkl win's version consistent with Linux's test=develop

---
 cmake/external/mklml.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 505f8b38344..96676f0be89 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -47,8 +47,8 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
   MESSAGE(STATUS "use pre defined download url")
   if(WIN32)
-    SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
-    SET(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.17/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+    SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE)
+    SET(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.16/${MKLML_VER}.zip" CACHE STRING "" FORCE)
   else()
     SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
     SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
--
GitLab


From aed3872c1c5c0c9957f9567071f63a89c1ace455 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Tue, 18 Dec 2018 16:17:20 +0800
Subject: [PATCH 0324/2367] add int cast, test=develop

---
 python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
index e87545cb9c6..5e440bf35d2 100644
--- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
@@ -141,6 +141,7 @@ class TestListenAndServOp(unittest.TestCase):
                     SampleLabels='SampleLabels',
                     num_total_classes=5,
                     num_neg_samples=2,
+                    custom_neg_classes=list(range(2)),
                     sampler=0,
                     seed=1,
                     is_sparse=True,
--
GitLab


From dc8847af876e678e23d0c0125bedd5cfae47ec9b Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Tue, 18 Dec 2018 08:36:44 +0000
Subject: [PATCH 0325/2367] add examples and comments test=develop

---
 paddle/fluid/operators/py_func_op.cc | 77 ++++++++++++++++++----------
 python/paddle/fluid/layers/nn.py     | 41 +++++++++++++++
 2 files changed, 92 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc
index 1bee3d9351b..a2895b54043 100644
--- a/paddle/fluid/operators/py_func_op.cc
+++ b/paddle/fluid/operators/py_func_op.cc
@@ -43,9 +43,12 @@ static py::object *GetPythonCallableObject(size_t i) {
   return &g_py_callables[i];
 }

-static std::string PythonObjectToString(const py::object &py_callable) {
+static std::string PythonFuncDebugString(const py::object &py_callable) {
   py::gil_scoped_acquire guard;
-  return py::str(*py_callable);
+  std::string wrapper_func_str = py::str(py_callable);
+  auto inner_func =
py_callable.attr("_func"); + std::string inner_func_str = py::str(inner_func); + return inner_func_str + " wrapped by " + wrapper_func_str; } static void CallPythonFunc(py::object *callable, @@ -93,15 +96,29 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { void operator()(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(!ctx->IsRuntime(), "Infer shape cannot be called in runtime."); + + /** + * X or Out can be empty, so that py_func can be more flexible + * to support Python functions with no input or no output + */ PADDLE_ENFORCE(ctx->HasInputs("X") || ctx->HasOutputs("Out"), "Input(X) or Output(Out) must exist"); PADDLE_ENFORCE_GE(ctx->Attrs().Get(kForwardPythonCallableId), 0, "Function id cannot be less than 0"); - // Transverse all outputs - // If name of any output ends with @GRAD, - // set its shape, dtype, lod_level, type to be the same as - // the correponding forward variable + /** + * Traverse all outputs, check if name of any output ends with @GRAD. + * If found, set its shape, dtype, lod_level, type to be the same as + * the corresponding forward variable + * + * Why not get input dims from InferShapeContext? + * Because some variables in forward inputs/outputs may not be needed + * in backward. Those variables are not inside InferShapeContext. + * + * InferShape would be only called in compile time. During runtime, + * the shapes of outputs should be guaranteed by user-defined Python + * functions. + */ auto *op = boost::get(ctx->GetOp()); auto *block = op->Block(); const std::string kGradVarSuffix = framework::kGradVarSuffix; @@ -113,7 +130,7 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { } auto out_name = out_var_desc->Name(); if (out_name == framework::kEmptyVarName || - out_name.size() <= kGradVarSuffix.size()) { + out_name.size() < kGradVarSuffix.size()) { continue; } @@ -152,7 +169,28 @@ class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker { } }; +/** + * There are several benefits when backward op of py_func op is + * still py_func op. + * + * - Less codes are needed, since codes of backward is almost + * the same as forward. 
+ * + * - To support high order derivative, so that py_func is + * infinite-order differentiable + */ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { + private: + static std::string DebugString(const std::vector &strs) { + if (strs.empty()) return ""; + std::string ret = strs[0]; + for (size_t i = 1; i < strs.size(); ++i) { + ret += " "; + ret += strs[i]; + } + return ret; + } + public: using framework::GradOpDescMakerBase::GradOpDescMakerBase; @@ -207,21 +245,8 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { // But in Python side, if IG is not needed, users can just return None auto bwd_outs = InputGrad("X", false); - if (VLOG_IS_ON(10)) { - std::string in_str = "PyFunc Grad Input: "; - for (auto &in : bwd_ins) { - in_str += in; - in_str += " "; - } - VLOG(10) << in_str; - - std::string out_str = "PyFunc Grad Output: "; - for (auto &out : bwd_outs) { - out_str += out; - out_str += " "; - } - VLOG(10) << out_str; - } + VLOG(10) << "PyFunc Grad Input: " << DebugString(bwd_ins); + VLOG(10) << "PyFunc Grad Output: " << DebugString(bwd_outs); grad_op->SetInput("X", bwd_ins); grad_op->SetOutput("Out", bwd_outs); @@ -245,6 +270,7 @@ class PyFuncOp : public framework::OperatorBase { std::vector inputs(in_arg_names.size()); for (size_t i = 0; i < in_arg_names.size(); ++i) { auto in_var = scope.FindVar(in_arg_names[i]); + // When py_func op is called in backward, in_var may be null if (in_var == nullptr) { continue; } @@ -263,15 +289,14 @@ class PyFuncOp : public framework::OperatorBase { std::vector outputs(out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) { auto *out_var = scope.FindVar(out_arg_names[i]); - auto *out_tensor = + outputs[i] = out_var ? out_var->GetMutable() : nullptr; - outputs[i] = out_tensor; } auto callable_id = static_cast(Attr(kForwardPythonCallableId)); auto *py_callable = GetPythonCallableObject(callable_id); - VLOG(10) << "Call py_func_op with id " << callable_id << ": " - << PythonObjectToString(*py_callable); + VLOG(10) << "Call Python function with id " << callable_id << ": " + << PythonFuncDebugString(*py_callable); CallPythonFunc(py_callable, inputs, &outputs); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3cd0a2887e5..ab3fb1e97ec 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9243,6 +9243,47 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): Returns: out (Variable|list(Variable)|tuple(Variable)): input :code:`out` + + Examples: + + >>> import paddle.fluid as fluid + >>> import six + >>> + >>> def create_tmp_var(name, dtype, shape): + >>> return fluid.default_main_program().current_block().create_var( + >>> name=name, dtype=dtype, shape=shape) + >>> + >>> # tanh activation has been provided by Paddle C++ op + >>> # Here, we only use tanh to be an example to show the usage + >>> # of py_func + >>> def tanh(x): + >>> return np.tanh(x) + >>> + >>> # forward input x is skipped + >>> def tanh_grad(y, dy): + >>> return np.array(dy) * (1 - np.square(np.array(y))) + >>> + >>> def debug_func(x): + >>> print(x) + >>> + >>> def simple_net(img, label): + >>> hidden = img + >>> for idx in six.moves.range(4): + >>> hidden = fluid.layers.fc(hidden, size=200) + >>> new_hidden = create_tmp_var(name='hidden_{}'.format(idx), + >>> dtype=hidden.dtype, shape=hidden.shape) + >>> + >>> # user-defined layers with forward and backward + >>> hidden = fluid.layers.py_func(func=tanh, x=hidden, + >>> 
out=new_hidden, backward_func=tanh_grad, + >>> skip_vars_in_backward_input=hidden) + >>> + >>> # user-defined debug layers to print variables + >>> fluid.layers.py_func(func=debug_func, x=hidden, out=None) + >>> + >>> prediction = fluid.layers.fc(hidden, size=10, act='softmax') + >>> loss = fluid.layers.cross_entropy(input=prediction, label=label) + >>> return fluid.layers.mean(loss) """ helper = LayerHelper('py_func', **locals()) if x is None: -- GitLab From b5fa916413aebd0d35af8b3ae04d4d555ecb4629 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 18 Dec 2018 08:38:52 +0000 Subject: [PATCH 0326/2367] fix bug after merge reyoung optimization, test=develop --- .../fluid/operators/hierarchical_sigmoid_op.h | 1 - .../fluid/operators/math/matrix_bit_code.cc | 35 ------------------- paddle/fluid/operators/math/matrix_bit_code.h | 29 +++++++-------- 3 files changed, 15 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 802b444d7ca..b47bf49ecb7 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -71,7 +71,6 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { // server auto height_sections = ctx.Attr>("height_sections"); auto table_names = ctx.Attr>("table_names"); - VLOG(3) << "path type is " << path->type().name(); std::vector real_rows = PathToRows(*path); framework::Scope& local_scope = ctx.scope().NewScope(); auto* ids = local_scope.Var("Ids@Prefetch"); diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index d55e832cc2d..d6f51c6e5c6 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -84,41 +84,6 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, code_table_.apply_visitor(func); } -template -struct MatrixBitCodeFunctorSelectedRowsAddGrad - : public boost::static_visitor { - const framework::Tensor &tmat_; - framework::SelectedRows *vec_; - - MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) - : tmat_(tmat), vec_(vec) {} - - template - void operator()(const CodeTable &code_table) { - size_t batch_size = tmat_.dims()[0]; - size_t width = tmat_.dims()[1]; - auto *vec_data = vec_->mutable_value()->template data(); - auto *tmat_data = tmat_.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table.get_code(i); - int code_length = code.get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); - int64_t row_index = vec_->GetIndexFromId(static_cast(index)); - vec_data[row_index] += tmat_data[i * width + j]; - } - } - } -}; - -template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) { - MatrixBitCodeFunctorSelectedRowsAddGrad func(tmat, vec); - code_table_.apply_visitor(func); -} - template struct MatrixBitCodeFunctorSum : public boost::static_visitor { const framework::Tensor &tmat_; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 7a084a41e5a..c399cb5d44a 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -124,11 +124,12 @@ class SimpleCode { template class CustomCode { public: - CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) { - seq_len_ 
= ptable.dims()[1]; - ptable_data_ = ptable.data() + seq_len_ * index; - pcode_data_ = pcode.data() + seq_len_ * index; + CustomCode(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids, + int index) { + seq_len_ = path_table.dims()[1]; + path_table_data_ = path_table.data() + seq_len_ * index; + path_code_data_ = path_code.data() + seq_len_ * index; } /** * Here the id of root should be 1 rather than 0, thus the encoding of class c @@ -139,25 +140,25 @@ class CustomCode { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const { return ptable_data_[bit]; } - bool calc_bit(int bit) const { return pcode_data_[bit]; } + size_t calc_index(int bit) const { return path_table_data_[bit]; } + bool calc_bit(int bit) const { return path_code_data_[bit]; } // NOTE: this function is not thread-safe. int get_length() const { if (length_ < 0) { auto len = seq_len_; - length_ = - static_cast(std::find_if(ptable_data_, ptable_data_ + len, - [](const T& val) { return val < 0; }) - - ptable_data_); + length_ = static_cast( + std::find_if(path_table_data_, path_table_data_ + len, + [](const T& val) { return val < 0; }) - + path_table_data_); } return length_; } private: int64_t seq_len_; - const T* ptable_data_; - const T* pcode_data_; + const T* path_table_data_; + const T* path_code_data_; mutable int length_{-1}; }; @@ -214,7 +215,7 @@ class MatrixBitCodeFunctor { const framework::Tensor& path_code, const int64_t* ids) : num_classes_(static_cast(path_table.dims()[1])), ids_(ids), - code_table_(CustomCodeTable(ptable, pcode, ids)) {} + code_table_(CustomCodeTable(path_table, path_code, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ -- GitLab From fdab7f749e68c86cc732c7570bbe327d630f5dc9 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 17:04:48 +0800 Subject: [PATCH 0327/2367] fix the setup script issue test=develop --- python/setup.py.in | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/setup.py.in b/python/setup.py.in index 8973d883e44..bfbaa1d0157 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -138,8 +138,6 @@ if '${WITH_FLUID_ONLY}'== 'OFF': '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]} -if os.name == 'nt': - package_data['paddle.fluid'] += ['openblas' + ext_name] if '${WITH_FLUID_ONLY}'== 'OFF': package_data['paddle.v2.master']=['libpaddle_master' + ext_name] @@ -162,15 +160,16 @@ if os.name != 'nt': package_data['paddle.libs']= [] package_data['paddle.libs']=['libwarpctc' + ext_name] shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name] else: - # copy the openblas.dll if os.name == 'nt': - shutil.copy('${CBLAS_LIBRARIES}', libs_path) - package_data['paddle.libs']+=['openblas' + ext_name] + # copy the openblas.dll + shutil.copy(os.path.dirname('${CBLAS_LIBRARIES}') + '/openblas' + ext_name, libs_path) + package_data['paddle.fluid'] += ['openblas' + ext_name] if '${WITH_MKLDNN}' == 'ON': if '${CMAKE_BUILD_TYPE}' == 'Release': -- GitLab From a500dfa579907d8046e40a15e67558c350498976 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 18 Dec 2018 
06:27:32 +0000 Subject: [PATCH 0328/2367] rewrite ddim test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/array.h | 74 ++- paddle/fluid/framework/ddim.cc | 303 ++++-------- paddle/fluid/framework/ddim.h | 148 ++++-- paddle/fluid/framework/dim.h | 441 ++++++------------ paddle/fluid/framework/dlpack_tensor.cc | 6 +- paddle/fluid/framework/dlpack_tensor.h | 2 +- paddle/fluid/framework/unroll_array_ops.h | 169 +++++++ .../fluid/operators/controlflow/logical_op.cc | 2 - paddle/fluid/operators/crop_op.h | 1 - paddle/fluid/operators/cudnn_lstm_op.cu.cc | 1 - .../fluid/operators/detail/strided_memcpy.h | 38 +- .../detection/generate_proposal_labels_op.cc | 2 - .../detection/generate_proposals_op.cc | 6 - .../detection/rpn_target_assign_op.cc | 1 - .../operators/elementwise/elementwise_op.h | 1 - paddle/fluid/operators/expand_op.h | 1 - paddle/fluid/operators/fc_op.cc | 1 - .../fused/fused_embedding_fc_lstm_op.cc | 18 +- paddle/fluid/operators/hinge_loss_op.cc | 1 - paddle/fluid/operators/log_loss_op.cc | 1 - .../fluid/operators/math/math_function_impl.h | 3 - paddle/fluid/operators/math/softmax_impl.h | 1 - .../fluid/operators/modified_huber_loss_op.cc | 1 - paddle/fluid/operators/mul_op.cc | 6 - paddle/fluid/operators/nce_op.cc | 1 - paddle/fluid/operators/norm_op.h | 1 - paddle/fluid/operators/psroi_pool_op.h | 1 - .../sequence_ops/sequence_slice_op.h | 2 - paddle/fluid/operators/strided_memcpy.h | 2 +- 30 files changed, 622 insertions(+), 615 deletions(-) create mode 100644 paddle/fluid/framework/unroll_array_ops.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d7a69c8c9e..023118d7407 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -36,7 +36,7 @@ add_subdirectory(details) proto_library(framework_proto SRCS framework.proto) proto_library(async_executor_proto SRCS data_feed.proto) -cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) +cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h index be9efcd7492..aa0abc22a6b 100644 --- a/paddle/fluid/framework/array.h +++ b/paddle/fluid/framework/array.h @@ -15,34 +15,88 @@ #pragma once #include -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/framework/unroll_array_ops.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { + template class Array { - static_assert(N > 0, "The size of array must be larger than 0"); - public: - HOSTDEVICE Array() {} + static constexpr size_t kSize = N; - HOSTDEVICE explicit Array(const T &val) { - for (size_t i = 0; i < N; ++i) data_[i] = val; + HOSTDEVICE inline Array() = default; + + template + HOSTDEVICE inline explicit Array(const T &val, Args... 
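// (Editor's note: this variadic constructor forwards the parameter pack to
// UnrollVarArgsAssign below, which unrolls the element-wise writes into
// data_ at compile time.)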
args) { + UnrollVarArgsAssign::Run(data_, val, args...); } - HOSTDEVICE const T *Get() const { return data_; } + HOSTDEVICE inline void Fill(const T &val) { + UnrollFillConstant::Run(data_, val); + } - HOSTDEVICE T *GetMutable() { return data_; } + HOSTDEVICE inline const T *Get() const { return data_; } - HOSTDEVICE T &operator[](size_t index) { return data_[index]; } + HOSTDEVICE inline T *GetMutable() { return data_; } - HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; } + HOSTDEVICE inline T &operator[](size_t index) { return data_[index]; } + + HOSTDEVICE inline const T &operator[](size_t index) const { + return data_[index]; + } HOSTDEVICE constexpr size_t size() const { return N; } + HOSTDEVICE inline bool operator==(const Array &other) const { + return UnrollCompare::Run(data_, other.data_); + } + + HOSTDEVICE inline bool operator!=(const Array &other) const { + return !(*this == other); + } + private: T data_[N]; }; +template +class Array { + public: + static constexpr size_t kSize = 0; + + HOSTDEVICE inline Array() = default; + + HOSTDEVICE inline void Fill(const T &val) {} + + HOSTDEVICE inline constexpr T *Get() const { return nullptr; } + + // Add constexpr to GetMutable() cause warning in MAC + HOSTDEVICE inline T *GetMutable() { return nullptr; } + + HOSTDEVICE inline T &operator[](size_t index) { +#ifndef __CUDA_ARCH__ + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE inline const T &operator[](size_t index) const { +#ifndef __CUDA_ARCH__ + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE constexpr size_t size() const { return 0; } + + HOSTDEVICE constexpr bool operator==(const Array &other) const { + return true; + } + + HOSTDEVICE constexpr bool operator!=(const Array &other) const { + return false; + } +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index 05e423b8a52..3640138e180 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -18,201 +18,131 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -/// @cond HIDDEN +template +struct DDimAssignFunctor { + static_assert(std::is_integral::value, "T must be integral type"); + using result_type = void; + explicit DDimAssignFunctor(const T* in) : in_(in) {} -template -Dim make_dim(const int64_t* d) { - return Dim(*d, make_dim(d + 1)); -} + template + inline void operator()(Dim& dim) { // NOLINT + UnrollAssign::Run(in_, dim.data()); + } + + const T* in_; +}; -template <> -Dim<0> make_dim<0>(const int64_t* d) { - return Dim<0>(*d); +DDim::DDim(const int* d, int n) : rank_(n) { + this->apply_visitor(DDimAssignFunctor(d)); } -void make_ddim(DDim& ddim, const int64_t* dims, int n) { - switch (n) { - case 0: - ddim = make_dim<0>(dims); - break; - case 1: - ddim = make_dim<1>(dims); - break; - case 2: - ddim = make_dim<2>(dims); - break; - case 3: - ddim = make_dim<3>(dims); - break; - case 4: - ddim = make_dim<4>(dims); - break; - case 5: - ddim = make_dim<5>(dims); - break; - case 6: - ddim = make_dim<6>(dims); - break; - case 7: - ddim = make_dim<7>(dims); - break; - case 8: - ddim = make_dim<8>(dims); - break; - case 9: - ddim = make_dim<9>(dims); - break; - default: - PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions."); - } +DDim::DDim(const int64_t* d, int n) : rank_(n) { + this->apply_visitor(DDimAssignFunctor(d)); } -/// @endcond +template +Dim make_dim(const int64_t* d) { + Dim ret; + for (int i = 0; i < N; ++i) ret[i] = d[i]; + return ret; +} DDim make_ddim(std::initializer_list dims) { - DDim result(make_dim(0)); - make_ddim(result, dims.begin(), dims.size()); - return result; + return DDim(dims.begin(), dims.size()); } DDim make_ddim(const std::vector& dims) { - DDim result(make_dim(0)); - make_ddim(result, &dims[0], dims.size()); - return result; + return DDim(dims.data(), dims.size()); } DDim make_ddim(const std::vector& dims) { - std::vector res(dims.size()); - std::transform(dims.begin(), dims.end(), res.begin(), - [](int d) { return static_cast(d); }); - return make_ddim(res); + return DDim(dims.data(), dims.size()); } -/// @cond HIDDEN -// XXX For some reason, putting this in an anonymous namespace causes errors -class DynamicMutableIndexer : public boost::static_visitor { - public: - explicit DynamicMutableIndexer(int idx) : idx_(idx) {} - - template - int64_t& operator()(Dim& dim) const { - return dim[idx_]; - } - - private: - int idx_; -}; - -class DynamicConstIndexer : public boost::static_visitor { - public: - explicit DynamicConstIndexer(int idx) : idx_(idx) {} +struct DDimEqualityVisitor { + explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {} template - int64_t operator()(const Dim& dim) const { - return dim[idx_]; + inline bool operator()(const Dim& self) const { + return UnrollCompare::Run(self.data(), d_); } - private: - int idx_; + const int64_t* d_; }; -/// @endcond - -int64_t& DDim::operator[](int idx) { - return boost::apply_visitor(DynamicMutableIndexer(idx), var); -} - -int64_t DDim::operator[](int idx) const { - return boost::apply_visitor(DynamicConstIndexer(idx), var); +bool DDim::operator==(const DDim& d) const { + return rank_ == d.rank_ && this->apply_visitor(DDimEqualityVisitor(d.data())); } -int DDim::size() const { return arity(*this); } - -bool DDim::operator==(DDim d) const { - if (var.which() != d.getVar().which()) { - return false; - } else { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); +bool DDim::operator!=(const DDim& d) const { return !(*this == d); } - for (unsigned int i = 0; i < v1.size(); i++) { - 
if (v1[i] != v2[i]) { - return false; - } - } +struct DDimPlusVisitor { + explicit DDimPlusVisitor(const int64_t* d1, const int64_t* d2) + : d1_(d1), d2_(d2) {} - return true; + template + inline void operator()(Dim& self) const { + UnrollAdd::Run(d1_, d2_, self.data()); } -} - -bool DDim::operator!=(DDim d) const { return !(*this == d); } - -DDim DDim::operator+(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); - std::vector v3; - - assert(v1.size() == v2.size()); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] + v2[i]); - } + const int64_t* d1_; + const int64_t* d2_; +}; - return make_ddim(v3); +DDim DDim::operator+(const DDim& d) const { + PADDLE_ENFORCE(rank_ == d.rank_); + DDim ret; + ret.rank_ = rank_; + ret.apply_visitor(DDimPlusVisitor(data(), d.data())); + return ret; } -DDim DDim::operator*(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); +struct DDimMulVisitor { + explicit DDimMulVisitor(const int64_t* d1, const int64_t* d2) + : d1_(d1), d2_(d2) {} - std::vector v3; - - assert(v1.size() == v2.size()); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] * v2[i]); + template + inline void operator()(Dim& self) const { + UnrollMul::Run(d1_, d2_, self.data()); } - return make_ddim(v3); + const int64_t* d1_; + const int64_t* d2_; +}; + +DDim DDim::operator*(const DDim& d) const { + PADDLE_ENFORCE(rank_ == d.rank_); + DDim ret; + ret.rank_ = rank_; + ret.apply_visitor(DDimMulVisitor(data(), d.data())); + return ret; } int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } -void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } - -/// @cond HIDDEN -struct VectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; - - explicit VectorizeVisitor(std::vector& v) : vector(v) {} - - template - void operator()(const T& t) { - vector.push_back(t.head); - this->operator()(t.tail); - } - - void operator()(const Dim<0>& t) {} -}; -/// @endcond +void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } // NOLINT std::vector vectorize(const DDim& ddim) { - std::vector result; - VectorizeVisitor visitor(result); - boost::apply_visitor(visitor, ddim); + std::vector result(DDim::kMaxRank); + for (int i = 0; i < ddim.size(); ++i) { + result[i] = ddim[i]; + } + result.resize(ddim.size()); return result; } // NOTE: framework::vectorize converts to type int64_t // which does not fit cudnn inputs. 
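// vectorize2int therefore returns a std::vector of int, narrowing each
// dimension so the result can be handed to cuDNN APIs that expect int
// arrays.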
std::vector vectorize2int(const DDim& ddim) { - std::vector temp = vectorize(ddim); - std::vector result(temp.begin(), temp.end()); + std::vector result(DDim::kMaxRank); + for (int i = 0; i < ddim.size(); ++i) { + result[i] = ddim[i]; + } + result.resize(ddim.size()); return result; } -struct ProductVisitor : public boost::static_visitor { +struct ProductVisitor { template int64_t operator()(const Dim& dim) { return product(dim); @@ -220,65 +150,27 @@ struct ProductVisitor : public boost::static_visitor { }; int64_t product(const DDim& ddim) { - ProductVisitor visitor; - return boost::apply_visitor(visitor, ddim); + return ddim.apply_visitor(ProductVisitor()); } -struct SliceVectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; - int begin; - int end; - - SliceVectorizeVisitor(std::vector& v, int b, int e) - : vector(v), begin(b), end(e) { - PADDLE_ENFORCE(begin < end, - "Begin index must be less than end index in ddim slice."); - PADDLE_ENFORCE(begin >= 0, - "Begin index can't be less than zero in ddim slice."); - } - - template - void operator()(const Dim& dim) { - if (begin == 0) { - vector.push_back(dim.head); - } else { - --begin; - } - --end; - if (end > 0) { - this->operator()(dim.tail); - } - } - - void operator()(const Dim<0>& dim) { - PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound."); - } -}; - DDim slice_ddim(const DDim& dim, int begin, int end) { - std::vector vec; - vec.reserve(end - begin); - SliceVectorizeVisitor visitor(vec, begin, end); - boost::apply_visitor(visitor, dim); - return make_ddim(vec); -} - -/// \cond HIDDEN - -struct ArityVisitor : boost::static_visitor { - template - int operator()(Dim) const { - return D; + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in ddim slice."); + PADDLE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); + DDim ret; + ret.rank_ = end - begin; + for (int i = 0; i < ret.rank_; ++i) { + ret[i] = dim[i + begin]; } -}; - -/// \endcond + return ret; +} -int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } +int arity(const DDim& d) { return d.size(); } /// \cond HIDDEN -struct DDimPrinter : boost::static_visitor { +struct DDimPrinter { std::ostream& os; explicit DDimPrinter(std::ostream& os_) : os(os_) {} @@ -291,15 +183,10 @@ struct DDimPrinter : boost::static_visitor { /// \endcond std::ostream& operator<<(std::ostream& os, const DDim& ddim) { - DDimPrinter printer(os); - boost::apply_visitor(printer, ddim); + ddim.apply_visitor(DDimPrinter(os)); return os; } -DDim::DDim(std::initializer_list init_list) { - *this = make_ddim(init_list); -} - DDim flatten_to_2d(const DDim& src, int num_col_dims) { int rank = src.size(); return make_ddim({product(slice_ddim(src, 0, num_col_dims)), @@ -309,21 +196,23 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims) { DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } DDim stride(const DDim& ddim) { - std::vector strides(ddim.size()); + DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = 1; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i + 1]; } - return framework::make_ddim(strides); + return strides; } DDim stride_numel(const framework::DDim& ddim) { - std::vector strides(ddim.size()); + DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = ddim[ddim.size() - 1]; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i]; } - return framework::make_ddim(strides); + 
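// e.g. for a (2, 3, 4) DDim, stride() above yields (12, 4, 1), while
// stride_numel() yields (24, 12, 4): strides[i] is the number of
// elements covered by dimensions i and beyond.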
return strides; } } // namespace framework diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index f05b5ee3fae..bff710040eb 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -18,8 +18,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/dim.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { @@ -29,51 +27,138 @@ namespace framework { * * The number of dimensions must be between [1, 9]. */ -struct DDim { - typedef boost::variant, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, - Dim<7>, Dim<8>, Dim<9>> - DDimVar; - DDimVar var; +class DDim { + public: + constexpr static int kMaxRank = 9; - DDim() : var(Dim<1>()) {} + DDim() : rank_(1) { dim_[0] = 0; } + + DDim(const int* d, int n); + DDim(const int64_t* d, int n); template - explicit DDim(const Dim& in) : var(in) {} + /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT + UnsafeCast() = in; + } - /*implicit*/ DDim(std::initializer_list init_list); + /*implicit*/ DDim(std::initializer_list init_list) + : DDim(init_list.begin(), init_list.size()) {} template - DDim& operator=(const Dim& in) { - var = in; + inline DDim& operator=(const Dim& in) { + rank_ = D; + UnsafeCast() = in; return *this; } - int64_t& operator[](int idx); - int64_t operator[](int idx) const; + inline int64_t& operator[](int idx) { return dim_[idx]; } - template - typename Visitor::result_type apply_visitor(Visitor& visitor) { - return var.apply_visitor(visitor); + inline int64_t operator[](int idx) const { return dim_[idx]; } + + inline int64_t& at(int idx) { + PADDLE_ENFORCE(idx >= 0 && idx < rank_); + return dim_[idx]; } - template - typename Visitor::result_type apply_visitor(Visitor& visitor) const { - return var.apply_visitor(visitor); + inline int64_t at(int idx) const { + PADDLE_ENFORCE(idx >= 0 && idx < rank_); + return dim_[idx]; } - DDimVar getVar() { return var; } + template + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor); + + template + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) const; + + bool operator==(const DDim& d) const; + + bool operator!=(const DDim& d) const; + + DDim operator+(const DDim& d) const; - bool operator==(DDim d) const; + DDim operator*(const DDim& d) const; - bool operator!=(DDim d) const; + // Make DDim act like std::vector + using iterator = int64_t*; + using const_iterator = const int64_t*; - DDim operator+(DDim d) const; + int64_t* data() { return dim_.data(); } + const int64_t* data() const { return dim_.data(); } - DDim operator*(DDim d) const; + iterator begin() { return data(); } + const_iterator begin() const { return data(); } + iterator end() { return data() + rank_; } + const_iterator end() const { return data() + rank_; } + + int size() const { return rank_; } + + private: + template + inline Dim& UnsafeCast() { + return const_cast&>(const_cast(this)->UnsafeCast()); + } - int size() const; + template + inline const Dim& UnsafeCast() const { + static_assert(M >= 0 && M <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } + + friend DDim slice_ddim(const DDim& dim, int begin, int end); + friend DDim stride(const DDim& ddim); + friend DDim stride_numel(const DDim& ddim); + + Dim dim_; + int rank_; }; +#define PADDLE_VISIT_DDIM(rank) \ + case rank: \ + return visitor(UnsafeCast()) + +template +typename std::result_of&)>::type DDim::apply_visitor( + Visitor&& 
visitor) { + switch (rank_) { + PADDLE_VISIT_DDIM(0); + PADDLE_VISIT_DDIM(1); + PADDLE_VISIT_DDIM(2); + PADDLE_VISIT_DDIM(3); + PADDLE_VISIT_DDIM(4); + PADDLE_VISIT_DDIM(5); + PADDLE_VISIT_DDIM(6); + PADDLE_VISIT_DDIM(7); + PADDLE_VISIT_DDIM(8); + PADDLE_VISIT_DDIM(9); + default: + PADDLE_THROW("Invalid rank %d", rank_); + } +} + +template +typename std::result_of&)>::type DDim::apply_visitor( + Visitor&& visitor) const { + switch (rank_) { + PADDLE_VISIT_DDIM(0); + PADDLE_VISIT_DDIM(1); + PADDLE_VISIT_DDIM(2); + PADDLE_VISIT_DDIM(3); + PADDLE_VISIT_DDIM(4); + PADDLE_VISIT_DDIM(5); + PADDLE_VISIT_DDIM(6); + PADDLE_VISIT_DDIM(7); + PADDLE_VISIT_DDIM(8); + PADDLE_VISIT_DDIM(9); + default: + PADDLE_THROW("Invalid rank %d", rank_); + } +} +#undef PADDLE_VISIT_DDIM + /** * \brief Make a DDim from std::vector * @@ -92,7 +177,7 @@ DDim make_ddim(const std::vector& dims); DDim make_ddim(std::initializer_list dims); int64_t get(const DDim& dim, int idx); -void set(DDim& dim, int idx, int val); +void set(DDim& dim, int idx, int val); // NOLINT std::vector vectorize(const DDim& ddim); std::vector vectorize2int(const DDim& ddim); @@ -129,12 +214,3 @@ DDim stride(const DDim& ddim); DDim stride_numel(const DDim& ddim); } // namespace framework } // namespace paddle - -namespace boost { - -template -T get(const paddle::framework::DDim& in) { - return boost::get(in.var); -} - -} // namespace boost diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h index 73f92fa389f..3ae60a3119e 100644 --- a/paddle/fluid/framework/dim.h +++ b/paddle/fluid/framework/dim.h @@ -16,328 +16,184 @@ #include #include #include +#include #include +#include "paddle/fluid/framework/array.h" #include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { namespace framework { // Statically sized, statically indexed dimension -template -struct Dim { - static constexpr int dimensions = i; +template +class Dim : public Array { + public: + static_assert(N >= 0, "N must be not less than 0"); - template - HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { - static_assert(sizeof...(_tail) == i - 1, - "Dim initialized with the wrong number of parameters"); - } + static constexpr int kRank = N; + using BaseClass = Array; - HOSTDEVICE - Dim(int64_t _head, const Dim& _tail) : head(_head), tail(_tail) {} + inline Dim(int64_t head, const Dim& tail) { + (*this)[0] = head; + new (this->GetMutable() + 1) Dim(tail); + } - HOSTDEVICE - Dim() : head(0), tail() {} + template + HOSTDEVICE explicit Dim(int64_t head, Args... args) + : BaseClass(head, args...) {} /** Construct a Dim from a linear index and size. Uses Fortran order * indexing. 
*/ - HOSTDEVICE - Dim(int64_t idx, const Dim& size) - : head(idx % size.head), tail(idx / size.head, size.tail) {} + HOSTDEVICE Dim(int64_t idx, const Dim& size); /** Construct a Dim with each dimension set to the given index */ - HOSTDEVICE - Dim(int64_t idx) : head(idx), tail(idx) {} + HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); } - HOSTDEVICE - bool operator==(const Dim& o) const { - return (head == o.head) && (tail == o.tail); - } + HOSTDEVICE Dim() = default; - HOSTDEVICE - bool operator!=(const Dim& o) const { return !(*this == o); } + HOSTDEVICE int64_t* data() { return this->GetMutable(); } - HOSTDEVICE - int64_t& operator[](int idx); - HOSTDEVICE - int64_t operator[](int idx) const; + HOSTDEVICE const int64_t* data() const { return this->Get(); } HOST std::string to_string() const; - - int64_t head; - Dim tail; -}; - -// Base case specialization -template <> -struct Dim<0> { - static constexpr int dimensions = 0; - - HOSTDEVICE - Dim(int64_t _head) {} - - HOSTDEVICE - Dim() {} - - HOSTDEVICE - Dim(int idx, const Dim<0>& size) { -#ifndef __CUDA_ARCH__ - if (idx > 0) { - throw std::invalid_argument("Index out of range."); - } -#else - PADDLE_ASSERT(idx == 0); -#endif - } - - HOSTDEVICE - bool operator==(const Dim<0>& o) const { return true; } - - HOSTDEVICE - bool operator!=(const Dim<0>& o) const { return false; } - - HOSTDEVICE - int64_t& operator[](int idx); - HOSTDEVICE - int64_t operator[](int idx) const; }; -namespace { - -// Helper for accessing Dim classes -template -struct DimGetter { - // Return a copy if Dim is const - template - HOSTDEVICE static int64_t impl(const D& d) { - return DimGetter::impl(d.tail); - } - // Return a reference if Dim is mutable - template - HOSTDEVICE static int64_t& impl(D& d) { - return DimGetter::impl(d.tail); +namespace detail { +template +struct FortranOrderIndexingConstructorFunctor { + HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx, + int64_t* out) { + out[kStart] = (*idx) % in[kStart]; + (*idx) /= in[kStart]; + FortranOrderIndexingConstructorFunctor::Run(in, idx, + out); } }; -// Eureka! We found the element! -template <> -struct DimGetter<0> { - // Return a copy if Dim is const - template - HOSTDEVICE static int64_t impl(const D& d) { - return d.head; - } - // Return a reference if Dim is mutable - template - HOSTDEVICE static int64_t& impl(D& d) { - return d.head; - } +template +struct FortranOrderIndexingConstructorFunctor { + HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx, + int64_t* out) {} }; +} // namespace detail -template -HOSTDEVICE int64_t& indexer(Dim& dim, int idx) { -#ifndef __CUDA_ARCH__ - if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); - } -#else - PADDLE_ASSERT(idx >= 0); -#endif - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); +template +HOSTDEVICE Dim::Dim(int64_t idx, const Dim& size) { + detail::FortranOrderIndexingConstructorFunctor<0, N, N == 0>::Run( + size.Get(), &idx, this->GetMutable()); } -template <> -HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. 
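// [editor's note, not part of the original patch] The deleted code above is
// the old recursive representation: a Dim<i> stored one int64_t head plus a
// Dim<i - 1> tail, so every access had to walk the tail. The replacement
// Dim<N> inherits from a flat Array<int64_t, N>, which keeps the values
// contiguous and makes operator[] a plain array access. A minimal standalone
// sketch of the two layouts, with all names invented for illustration:
#include <cstdint>
#include <iostream>

template <int N>
struct RecursiveDim {           // old style: head plus recursive tail
  int64_t head;
  RecursiveDim<N - 1> tail;
};
template <>
struct RecursiveDim<0> {};      // base case terminates the recursion

template <int N>
struct FlatDim {                // new style: one contiguous buffer
  int64_t data[N];
  int64_t operator[](int i) const { return data[i]; }
};

int main() {
  RecursiveDim<2> r{2, {3, {}}};
  FlatDim<2> f{{2, 3}};
  // both describe a 2 x 3 shape; only the flat one indexes without recursion
  std::cout << r.head * r.tail.head << " == " << f[0] * f[1] << "\n";
  return 0;
}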
- int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif +template +HOSTDEVICE inline int64_t get(const Dim& dim) { + return dim[idx]; } -template -HOSTDEVICE int64_t indexer(const Dim& dim, int idx) { -#ifndef __CUDA_ARCH__ - if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); - } -#else - PADDLE_ASSERT(idx >= 0); -#endif - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); -} - -template <> -HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. - int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif -} - -} // namespace -// Static access to constant Dim -template -HOSTDEVICE int64_t get(const Dim& d) { - return DimGetter::impl(d); -} - -// Static access to mutable Dim -template -HOSTDEVICE int64_t& get(Dim& d) { - return DimGetter::impl(d); -} - -// Dynamic access to constant Dim -template -HOSTDEVICE int64_t Dim::operator[](int i) const { - return indexer(*this, i); +template +HOSTDEVICE inline int64_t& get(Dim& dim) { // NOLINT + return dim[idx]; } -// Dynamic access to mutable Dim -template -HOSTDEVICE int64_t& Dim::operator[](int i) { - return indexer(*this, i); +template +HOSTDEVICE inline int64_t get(const Dim& dim, int idx) { + return dim[idx]; } -// Dynamic access to constant Dim -inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const { - return indexer(*this, i); -} - -// Dynamic access to mutable Dim -inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) { - return indexer(*this, i); -} - -// Dynamic access to constant Dim -// without std::enable_if will try to instantiate this on get<0>(d) -template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, - int i) { - return d[i]; -} - -// Dynamic access to mutable Dim -template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, - int i) { - return d[i]; +template +HOSTDEVICE inline int64_t& get(Dim& dim, int idx) { // NOLINT + return dim[idx]; } // Dot product of two dims -template -HOSTDEVICE int64_t linearize(const Dim& a, const Dim& b) { - return a.head * b.head + linearize(a.tail, b.tail); -} - -// Base case dot product of two Dims -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) { - return 0; +template +HOSTDEVICE inline int64_t linearize(const Dim& a, const Dim& b) { + return UnrollProduct::Run(a.Get(), b.Get()); } // Product of a Dim -template -HOSTDEVICE int64_t product(const Dim& a, int prod = 1) { - return prod * a.head * product(a.tail); -} - -// Base case product of a Dim -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) { - return prod; +template +HOSTDEVICE inline int64_t product(const Dim& a) { + return UnrollProduct::Run(a.Get()); } // Is 0 <= idx_i < size_i for all i? 
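// [editor's note, not part of the original patch] ContainedFunctor below,
// ExPrefixMulFunctor, and the Unroll* helpers added in unroll_array_ops.h all
// use the same device-friendly idiom: a template recursion over the half-open
// range [kStart, kEnd), with a trailing bool parameter that selects an empty
// terminating specialization once kStart reaches kEnd, so the loop unrolls at
// compile time. A minimal self-contained sketch of the idiom (UnrollSum is an
// invented name, not part of the patch):
#include <cstdint>

template <int kStart, int kEnd, bool kStop>
struct UnrollSum {
  static int64_t Run(const int64_t* d) {
    // consume d[kStart], then recurse on the remainder of the range
    return d[kStart] +
           UnrollSum<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
  }
};

template <int kStart, int kEnd>
struct UnrollSum<kStart, kEnd, true> {
  // kStart == kEnd: empty range, the recursion stops here
  static int64_t Run(const int64_t*) { return 0; }
};

int main() {
  int64_t dims[4] = {1, 2, 3, 4};
  // fully unrolled at compile time; evaluates to 10
  return UnrollSum<0, 4, false>::Run(dims) == 10 ? 0 : 1;
}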
-template -HOSTDEVICE bool contained(const Dim& idx, const Dim& size) { - return ((0 <= idx.head) && (idx.head < size.head) && - contained(idx.tail, size.tail)); -} +namespace detail { +template +struct ContainedFunctor { + HOSTDEVICE static inline bool Run(const int64_t* idx, const int64_t* size) { + return (idx[kStart] >= 0 && idx[kStart] < size[kStart]) && + ContainedFunctor::Run(idx, + size); + } +}; + +template +struct ContainedFunctor { + HOSTDEVICE static constexpr inline bool Run(const int64_t* idx, + const int64_t* size) { + return true; + } +}; +} // namespace detail -// Base case of is 0 <= idx_i < size_i ? -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) { - return true; +template +HOSTDEVICE inline bool contained(const Dim& idx, const Dim& size) { + return detail::ContainedFunctor<0, N, N == 0>::Run(idx.Get(), size.Get()); } /** * \brief Compute exclusive prefix-multiply of a Dim. */ -template -HOSTDEVICE Dim ex_prefix_mul(const Dim& src, int mul = 1) { - return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); -} +namespace detail { +template +struct ExPrefixMulFunctor { + HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) { + kStart == 0 ? out[kStart] = 1 : out[kStart] = + out[kStart - 1] * in[kStart - 1]; + detail::ExPrefixMulFunctor::Run(in, + out); + } +}; + +template +struct ExPrefixMulFunctor { + HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {} +}; +} // namespace detail -///\cond HIDDEN -// Base case of ex_prefix_mul -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) { - return Dim<0>(); +template +HOSTDEVICE inline Dim ex_prefix_mul(const Dim& src) { + Dim ret; + detail::ExPrefixMulFunctor<0, N, N == 0>::Run(src.Get(), ret.GetMutable()); + return ret; } -///\endcond /** * Add two dimensions together */ -template -HOSTDEVICE Dim dim_plus(const Dim& a, const Dim& b) { - return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); +template +HOSTDEVICE inline Dim dim_plus(const Dim& a, const Dim& b) { + Dim ret; + UnrollAdd::Run(a.Get(), b.Get(), ret.GetMutable()); + return ret; } -// Base case -template <> -HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) { - return Dim<0>(); -} - -template -HOSTDEVICE Dim operator+(const Dim& lhs, const Dim& rhs) { +template +HOSTDEVICE inline Dim operator+(const Dim& lhs, const Dim& rhs) { return dim_plus(lhs, rhs); } /** * Multiply two dimensions together */ -template -HOSTDEVICE Dim dim_mult(const Dim& a, const Dim& b) { - return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); -} - -// Base case -template <> -HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) { - return Dim<0>(); +template +HOSTDEVICE inline Dim dim_mult(const Dim& a, const Dim& b) { + Dim ret; + UnrollMul::Run(a.Get(), b.Get(), ret.GetMutable()); + return ret; } template @@ -354,23 +210,32 @@ HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { * \return Dim object the same size as \p size with normalized strides * */ +namespace detail { +template +struct NormalizeStridesFunctor { + HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride, + int64_t* ret) { + ret[kStart] = (size[kStart] == 1 ? 0 : stride[kStart]); + NormalizeStridesFunctor::Run( + size, stride, ret); + } +}; -template -HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { - int norm_stride = size.head == 1 ? 
0 : stride.head; - return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); -} - -///\cond HIDDEN +template +struct NormalizeStridesFunctor { + HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride, + int64_t* ret) {} +}; +} // namespace detail -template <> -HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, - const Dim<0>& stride) { - return Dim<0>(); +template +HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { + Dim ret; + detail::NormalizeStridesFunctor<0, N, N == 0>::Run(size.Get(), stride.Get(), + ret.GetMutable()); + return ret; } -///\endcond - /** * Helper function to create a Dim * @@ -379,25 +244,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, */ template -HOSTDEVICE Dim make_dim(Args... idxes) { +HOSTDEVICE inline Dim make_dim(Args... idxes) { return Dim(idxes...); } // Allows us to output a Dim -// XXX For some reason, overloading fails to resolve this correctly -template -typename std::enable_if<(i > 1), std::ostream&>::type operator<<( - std::ostream& os, const Dim& d) { - os << d.head << ", " << d.tail; - return os; -} - -// Base case that allows us to output a Dim -// XXX I wish this could be an overload instead of a template -template -typename std::enable_if<(i == 1), std::ostream&>::type operator<<( - std::ostream& os, const Dim& d) { - os << d.head; +template +inline std::ostream& operator<<(std::ostream& os, const Dim& d) { + os << d[0]; + for (int i = 1; i < N; ++i) { + os << ", " << d[i]; + } return os; } @@ -405,25 +262,23 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { return os; } -template -HOST std::string Dim::to_string() const { +template +HOST std::string Dim::to_string() const { std::stringstream stream; - stream << *this; - return stream.str(); } -template -HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { - Dim result; +template +HOSTDEVICE Dim linear_to_dimension(int linear_index, const Dim& extents) { + Dim result; - for (int i = 0; i < D - 1; ++i) { + for (int i = 0; i < N - 1; ++i) { result[i] = linear_index % extents[i]; linear_index /= extents[i]; } - result[D - 1] = linear_index; + result[N - 1] = linear_index; return result; } diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 04e3f78afe4..5014fcd06a0 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -62,7 +62,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CPUPlace &place) const { - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLCPU; ctx.device_id = 0; return ctx; @@ -70,7 +70,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CUDAPlace &place) const { #ifdef PADDLE_WITH_CUDA - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLGPU; ctx.device_id = place.device; return ctx; @@ -81,7 +81,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { #ifdef PADDLE_WITH_CUDA - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLCPUPinned; ctx.device_id = 0; return ctx; diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index 0c52bce1ef6..e48b0d5c88f 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ 
b/paddle/fluid/framework/dlpack_tensor.h @@ -38,7 +38,7 @@ class DLPackTensor { // The shape in DLTensor is defined as int64_t* // Add this member to make TVMTensor init without heap allocation - ShapeType shape_[9]; + ShapeType shape_[DDim::kMaxRank]; }; } // namespace framework diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h new file mode 100644 index 00000000000..fb0a89530f6 --- /dev/null +++ b/paddle/fluid/framework/unroll_array_ops.h @@ -0,0 +1,169 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { + +namespace detail { + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) { + data[kStart] = val; + UnrollFillConstant::Run(data, val); + } +}; + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) {} +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) { + d2[kStart] = static_cast(d1[kStart]); + UnrollAssign::Run(d1, d2); + } +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {} +}; + +template +struct UnrollVarArgsAssign { + template + HOSTDEVICE inline static void Run(T *d, T val, Args... 
args) { + static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument"); + d[kStart] = val; + UnrollVarArgsAssign::Run(d, + args...); + } +}; + +template +struct UnrollVarArgsAssign { + HOSTDEVICE inline static void Run(T *d) {} +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline static bool Run(const T *d1, const T *d2) { + return d1[kStart] == d2[kStart] && + UnrollCompare::Run(d1, d2); + } +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) { + return true; + } +}; + +template +struct UnrollAdd { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) { + d3[kStart] = d1[kStart] + d2[kStart]; + UnrollAdd::Run(d1, d2, d3); + } +}; + +template +struct UnrollAdd { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {} +}; + +template +struct UnrollMul { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) { + d3[kStart] = d1[kStart] * d2[kStart]; + UnrollMul::Run(d1, d2, d3); + } +}; + +template +struct UnrollMul { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {} +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline static T Run(const T *d) { + return d[kStart] * + UnrollProduct::Run(d); + } + + template + HOSTDEVICE inline static T Run(const T *d1, const T *d2) { + return d1[kStart] * d2[kStart] + + UnrollProduct::Run(d1, d2); + } +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline constexpr static T Run(const T *d) { + return 1; + } + + template + HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) { + return 0; + } +}; + +} // namespace detail + +template +using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>; + +template +using UnrollAssign = detail::UnrollAssign<0, N, N == 0>; + +template +using UnrollVarArgsAssign = detail::UnrollVarArgsAssign; + +template +using UnrollCompare = detail::UnrollCompare<0, N, N == 0>; + +template +using UnrollAdd = detail::UnrollAdd<0, N, N == 0>; + +template +using UnrollMul = detail::UnrollMul<0, N, N == 0>; + +template +using UnrollProduct = detail::UnrollProduct<0, N, N == 0>; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index 6446cab5ec5..2e7f3edd55c 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase { OpComment comment; PADDLE_ENFORCE(context->HasInput("X"), "Input(X) of %s operator must not be null", comment.type); - auto dim_x = context->GetInputDim("X"); - context->SetOutputDim("Out", context->GetInputDim("X")); context->ShareLoD("X", "Out"); } diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 2d7d33bd4f9..cfc2cac7beb 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) { } out->mutable_data(out_dims, context.GetPlace()); auto x_stride = framework::stride(x->dims()); - auto out_stride = framework::stride(out->dims()); auto offsets = GetOffsets(context); int64_t offset = 0; for (size_t i = 0; i < offsets.size(); ++i) { diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index dd64cc327fc..744d149714c 100644 --- 
a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -378,7 +378,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { ->GetMutable(); auto input_dims = input->dims(); - auto weight_dims = weight->dims(); auto init_h_dims = init_h->dims(); auto init_c_dims = init_c->dims(); in_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 0b7c470fe72..fc223ce5593 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -27,8 +27,8 @@ struct StridedMemcpyFunctor; template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim<0> src_stride, framework::Dim<0> dst_dim, - framework::Dim<0> dst_stride, T* dst) const { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); @@ -50,18 +50,18 @@ struct StridedMemcpyFunctor { template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim<1> src_stride, framework::Dim<1> dst_dim, - framework::Dim<1> dst_stride, T* dst) const { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); - memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); + memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { #ifdef PADDLE_WITH_CUDA auto& gpu_place = boost::get(place); auto& cuda_ctx = reinterpret_cast(dev_ctx); - memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, + memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0], cuda_ctx.stream()); #else PADDLE_THROW("Paddle is not compiled with GPU"); @@ -73,19 +73,19 @@ struct StridedMemcpyFunctor { template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim src_stride, framework::Dim dst_dim, - framework::Dim dst_stride, T* dst) const { - for (int64_t i = 0; i < dst_dim.head; ++i) { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { + for (int64_t i = 0; i < dst_dim[0]; ++i) { StridedMemcpyFunctor func; - func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); - src += src_stride.head; - dst += dst_stride.head; + func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst); + src += src_stride[0]; + dst += dst_stride[0]; } } }; template -struct StridedCopyDimVisitor : public boost::static_visitor { +struct StridedCopyDimVisitor { StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& src_stride, const framework::DDim& dst_stride, T* dst) @@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor { dst_stride_(dst_stride), dst_(dst) {} - template - void operator()(Dim dst_dim) const { - Dim src_stride = boost::get(src_stride_); - Dim dst_stride = boost::get(dst_stride_); - constexpr int dim = Dim::dimensions; - StridedMemcpyFunctor functor; - functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_); + template + void operator()(const framework::Dim& dst_dim) const { + StridedMemcpyFunctor functor; + functor(dev_ctx_, src_, 
src_stride_.data(), dst_dim.data(), + dst_stride_.data(), dst_); } const platform::DeviceContext& dev_ctx_; diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index fddd6884017..a652d4d9575 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); - auto gt_classes_dims = ctx->GetInputDim("GtClasses"); - auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); auto im_info_dims = ctx->GetInputDim("ImInfo"); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 709c2dfc4b7..f1975a9a4be 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Variances"), "Input(Variances) shouldn't be null."); - auto scores_dims = ctx->GetInputDim("Scores"); - auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - auto anchors_dims = ctx->GetInputDim("Anchors"); - auto variances_dims = ctx->GetInputDim("Variances"); - ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); } diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 46fff9d338b..fd5d75ba527 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { auto anchor_dims = ctx->GetInputDim("Anchor"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto im_info_dims = ctx->GetInputDim("ImInfo"); PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, "The rank of Input(Anchor) must be 2."); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 87bf7c6b156..775346c5524 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), "Rank of first input must >= rank of second input."); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 75dbf1d8bf5..33940824977 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel { auto& expand_times = context.Attr>("expand_times"); auto* out0 = context.Output("Out"); Eigen::DSizes bcast_dims; - auto x_dims = in0->dims(); for (size_t i = 0; i < expand_times.size(); ++i) { bcast_dims[i] = expand_times[i]; } diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index e80249fc878..7c53e5279da 100644 --- 
a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -148,7 +148,6 @@ class FCOpKernel : public framework::OpKernel { auto w = ctx.Input("W"); auto bias = ctx.Input("Bias"); auto output = ctx.Output("Out"); - auto in_dims = input->dims(); auto w_dims = w->dims(); auto out_dims = output->dims(); int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 1eb6523a2df..9344bfe65db 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -242,15 +242,15 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { bool is_reverse = ctx.Attr("is_reverse"); \ bool use_peepholes = ctx.Attr("use_peepholes"); -#define INIT_BASE_SIZES \ - auto ids_dims = ids->dims(); /* T x M*/ \ - auto ids_numel = ids->numel(); /* T x 1*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const int D3 = D * 3; \ - int64_t row_number = embeddings->dims()[0]; \ - int64_t row_width = embeddings->dims()[1]; \ +#define INIT_BASE_SIZES \ + auto ids_dims = ids->dims(); /* T x M*/ \ + auto ids_numel = framework::product(ids_dims); /* T x 1*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const int D3 = D * 3; \ + int64_t row_number = embeddings->dims()[0]; \ + int64_t row_width = embeddings->dims()[1]; \ const int D4 = wh_dims[1]; #define INIT_BASE_INPUT_DATAS \ diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 69e7fa4490b..f458ce6c83b 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel { "Input(Logits@GRAD) should not be null."); auto pred_dims = ctx->GetInputDim("Logits"); - auto lab_dims = ctx->GetInputDim("Labels"); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 9d248e03218..ef1fb83aa6e 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel { "Output(Predicted@GRAD) should not be null."); auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index 895a7019aa1..d1127ce4a24 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -37,9 +37,6 @@ void Transpose::operator()( for (int i = 0; i < Rank; i++) { permute[i] = axis[i]; } - auto in_dim = in.dims(); - auto out_dim = out->dims(); - auto eigen_in = framework::EigenTensor::From(in); auto eigen_out = framework::EigenTensor::From(*out); auto* dev = context.eigen_device(); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 9e99e44822b..1d9d98b1064 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -76,7 +76,6 @@ class 
SoftmaxFunctor> { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto in_dims = X->dims(); - auto out_dims = Y->dims(); const float* in_data = X->data(); float* out_data = Y->data(); const int kBatchDim = 0; diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index 35db4c1ad1f..9954e51083b 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { "Input(Out@Grad) must not be null."); auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); auto intermediate_dims = ctx->GetInputDim("IntermediateVal"); auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 8a111e6065b..154b5f0d08f 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -146,12 +146,6 @@ class MulGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) should not be null"); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - auto x_mat_dims = framework::flatten_to_2d( - x_dims, ctx->Attrs().Get("x_num_col_dims")); - auto y_mat_dims = framework::flatten_to_2d( - y_dims, ctx->Attrs().Get("y_num_col_dims")); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 9f97f7821dd..e58dccea131 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("Input"); auto label_dims = ctx->GetInputDim("Label"); - auto w_dims = ctx->GetInputDim("Weight"); PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); int num_true_classes = label_dims.size() == 2 ? 
label_dims[1] : 1; if (ctx->HasInput("Bias")) { diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index d0224177ecf..6c95d3f3bf3 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel { out_norm->mutable_data(ctx.GetPlace()); auto xdim = in_x->dims(); - auto ndim = out_norm->dims(); T eps = static_cast(ctx.Attr("epsilon")); int axis = ctx.Attr("axis"); if (axis < 0) axis = xdim.size() + axis; diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 1a424728f7f..5666613f6ef 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { int rois_num = rois->dims()[0]; auto in_stride = framework::stride(in_dims); - auto roi_stride = framework::stride(rois->dims()); auto out_stride = framework::stride(out->dims()); const T* input_data = in->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index 03b59d71cc0..4bded0efb96 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { set_zero(ctx.template device_context(), x_grad, static_cast(0)); - auto out_grad_stride = framework::stride(out_grad->dims()); - for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { Tensor out_grad_t = out_grad->Slice(static_cast(out_lod[0][i]), diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c3d83a06f23..6a99ad9a90f 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& dst_stride, T* dst) { paddle::operators::detail::StridedCopyDimVisitor func( dev_ctx, src, src_stride, dst_stride, dst); - boost::apply_visitor(func, dst_dim); + dst_dim.apply_visitor(func); } // Strided numel memory copy from src to dst by the specified axis -- GitLab From 19ebd8b4cfffa2ba42c68fa4c761c54e857c6566 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 20:20:19 +0800 Subject: [PATCH 0329/2367] add ctc support for windows --- CMakeLists.txt | 4 ++-- cmake/external/warpctc.cmake | 30 ++++++++++++++++++++++----- cmake/operators.cmake | 2 +- paddle/fluid/operators/CMakeLists.txt | 4 +--- paddle/fluid/platform/port.h | 1 - python/paddle/fluid/__init__.py | 10 +++++++-- python/paddle/fluid/framework.py | 18 +++++++++++----- python/setup.py.in | 9 ++++---- 8 files changed, 55 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb646d3ce5d..c31f51a3f73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,10 +208,10 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream +include(external/warpctc) # download, build, install warpctc if (NOT WIN32) -# there is no official support of warpctc, nccl, cupti in windows -include(external/warpctc) # download, build, install warpctc +# there is no official support of nccl, cupti in windows include(cupti) include(external/gzstream) endif (NOT WIN32) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 
07e1137e16a..7b937c93feb 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
 # Used in unit test test_WarpCTCLayer
 SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
     CACHE PATH "Warp-ctc Library Directory" FORCE)
-SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-    CACHE FILEPATH "Warp-ctc Library" FORCE)
-IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
+IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32)
   SET(USE_OMP OFF)
 ELSE()
   SET(USE_OMP ON)
 ENDIF()
+IF(WIN32)
+  SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git")
+ELSE()
+  SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git")
+ENDIF()
+
 ExternalProject_Add(
   extern_warpctc
   ${EXTERNAL_PROJECT_LOG_ARGS}
-  GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git"
+  GIT_REPOSITORY ${WARPCTC_REPOSITORY}
   PREFIX ${WARPCTC_SOURCES_DIR}
   UPDATE_COMMAND ""
   CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
              -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-             -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
              -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+             -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+             -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
              -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
              -DWITH_GPU=${WITH_GPU}
              -DWITH_OMP=${USE_OMP}
@@ -59,6 +67,18 @@ ExternalProject_Add(
     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
+IF(WIN32)
+  IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    add_custom_command(TARGET extern_warpctc POST_BUILD
+      COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}
+    )
+  ENDIF()
+  SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "Warp-ctc Library" FORCE)
+else(WIN32)
+  SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "Warp-ctc Library" FORCE)
+ENDIF(WIN32)
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
 INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})  # For warpctc code to include its headers.
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 2ced43f9e6c..70d159b4f35 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,7 +84,7 @@ function(op_library TARGET)
     endif()
     if (WIN32)
       # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
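# [editor's note, not part of the original patch] The warpctc rules above
# encode two platform differences: the MSVC build produces warpctc.dll under
# bin/, while the Unix builds produce libwarpctc.so (or .dylib) under lib/,
# so the Windows branch copies the DLL into lib/ before recording
# WARPCTC_LIBRARIES. A minimal sketch of the same pattern for a hypothetical
# dependency "foo"; every name below is invented for illustration:
IF(WIN32)
  ADD_CUSTOM_COMMAND(TARGET extern_foo POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy
            ${FOO_INSTALL_DIR}/bin/foo${CMAKE_SHARED_LIBRARY_SUFFIX}
            ${FOO_INSTALL_DIR}/lib/foo${CMAKE_SHARED_LIBRARY_SUFFIX})
  SET(FOO_LIBRARIES
      "${FOO_INSTALL_DIR}/lib/foo${CMAKE_SHARED_LIBRARY_SUFFIX}")
ELSE()
  SET(FOO_LIBRARIES
      "${FOO_INSTALL_DIR}/lib/libfoo${CMAKE_SHARED_LIBRARY_SUFFIX}")
ENDIF()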
- foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 257bfc0a3f9..d9b0c66e572 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -64,9 +64,7 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) -if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) -endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index ad070171df3..c1b81159aca 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) { static void *dlopen(const char *filename, int flag) { std::string file_name(filename); - file_name.replace(0, file_name.size() - 1, '/', '\\'); HMODULE hModule = LoadLibrary(file_name.c_str()); if (!hModule) { throw std::runtime_error(file_name + " not found."); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index b00510d4438..8f3660ca387 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -102,6 +102,13 @@ def __bootstrap__(): import sys import os import platform + + if os.name == 'nt': + third_lib_path = os.path.abspath(os.path.dirname( + __file__)) + os.sep + '..' + os.sep + 'libs' + os.environ['path'] += ';' + third_lib_path + sys.path.append(third_lib_path) + from . import core in_test = 'unittest' in sys.modules @@ -128,13 +135,12 @@ def __bootstrap__(): 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname' + 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') if os.name != 'nt': - read_env_flags.append('warpctc_dir') read_env_flags.append('cpu_deterministic') if core.is_compiled_with_dist(): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d0bd78454db..b5d603d4781 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -16,6 +16,7 @@ from __future__ import print_function import collections import contextlib +import os import re import six import sys @@ -27,11 +28,18 @@ from .proto import framework_pb2 try: from . import core except ImportError as e: - raise ImportError( - """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" - if you encounters \"libmkldnn.so not found\" errors. If you have python - installed in other directory, replace \"/usr/local/lib\" with your own - directory. 
The original error is: \n""" + cpt.get_exception_message(e))
+    if os.name == 'nt':
+        raise ImportError(
+            """NOTE: You may need to run \"set PATH=c:\python27\lib;%PATH%\"
+        if you encounter \"mkldnn.dll not found\" errors. If you have python
+        installed in another directory, replace \"c:\python27\lib\" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
+    else:
+        raise ImportError(
+            """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
+        if you encounter \"libmkldnn.so not found\" errors. If you have python
+        installed in another directory, replace \"/usr/local/lib\" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
 except Exception as e:
     raise e
 from . import unique_name
diff --git a/python/setup.py.in b/python/setup.py.in
index cf8f28bd250..fefe8fbaa7d 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -160,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
 # put all thirdparty libraries in paddle.libs
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
-if os.name != 'nt':
-    package_data['paddle.libs']= []
-    package_data['paddle.libs']=['libwarpctc' + ext_name]
-    shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+
+package_data['paddle.libs']= []
+package_data['paddle.libs']=['libwarpctc' + ext_name]
+shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+
 if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_LIB}', libs_path)
     shutil.copy('${MKLML_IOMP_LIB}', libs_path)
--
GitLab


From ed5bd5e58639bfe8e584f4acdce2398701b12853 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Tue, 18 Dec 2018 20:23:24 +0800
Subject: [PATCH 0330/2367] test=develop

---
 paddle/fluid/platform/dynload/CMakeLists.txt    | 2 --
 paddle/fluid/platform/dynload/cudnn.h           | 2 +-
 paddle/fluid/platform/dynload/dynamic_loader.cc | 2 ++
 paddle/fluid/platform/dynload/dynamic_loader.h  | 6 ++++++
 paddle/fluid/platform/dynload/mklml.h           | 2 +-
 paddle/fluid/platform/dynload/tensorrt.h        | 2 +-
 paddle/fluid/platform/dynload/warpctc.h         | 2 +-
 7 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 5939c500c94..07159d4a12e 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -16,9 +16,7 @@ if (CUPTI_FOUND)
   list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
-if (NOT WIN32)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
-endif(NOT WIN32)
 if (WITH_MKLML)
   cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 550fe2edee1..2f4f8101e4b 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
-    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...)
{ \ using cudnn_func = decltype(&::__name); \ std::call_once(cudnn_dso_flag, []() { \ cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index cc5cda6106c..eddebfe92ae 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -201,6 +201,8 @@ void* GetCurandDsoHandle() { void* GetWarpCTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so"); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 84fd2ce9987..edb4c649add 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -18,6 +18,12 @@ namespace paddle { namespace platform { namespace dynload { +#ifndef _WIN32 +#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__)) +#else +#define DECLARE_TYPE(__name, ...) decltype(auto) +#endif + void* GetCublasDsoHandle(); void* GetCUDNNDsoHandle(); void* GetCUPTIDsoHandle(); diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index c3f9433503a..d0619293acf 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -34,7 +34,7 @@ extern void* mklml_dso_handle; #define DYNAMIC_LOAD_MKLML_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using mklmlFunc = decltype(&::__name); \ std::call_once(mklml_dso_flag, []() { \ mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index 5d67658b94a..751aa54b1ad 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle; #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using tensorrt_func = decltype(__name(args...)) (*)(Args...); \ std::call_once(tensorrt_dso_flag, []() { \ tensorrt_dso_handle = \ diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index 18ed9956f18..bc1977b05de 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -34,7 +34,7 @@ extern void* warpctc_dso_handle; #define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ using warpctcFunc = decltype(&::__name); \ std::call_once(warpctc_dso_flag, []() { \ warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \ -- GitLab From b73d7d2f21a4010d10b1a2456e5991d77ed5e01e Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 20:27:14 +0800 Subject: [PATCH 0331/2367] test=develop --- python/setup.py.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py.in b/python/setup.py.in index fefe8fbaa7d..22b9537a90e 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -162,7 +162,7 @@ if '${WITH_FLUID_ONLY}'== 'OFF': libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' package_data['paddle.libs']= [] -package_data['paddle.libs']=['libwarpctc' + ext_name] +package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name] shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': -- GitLab From 63240326027b4b7469c386d16c30273ef014c09a Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 13 Dec 2018 15:41:41 +0800 Subject: [PATCH 0332/2367] MLP forward backward test=develop --- paddle/fluid/imperative/layer.cc | 6 +- paddle/fluid/imperative/tracer.h | 25 ++++-- paddle/fluid/operators/mul_op.cc | 3 +- paddle/fluid/pybind/imperative.cc | 5 +- python/paddle/fluid/backward.py | 7 +- python/paddle/fluid/framework.py | 3 + python/paddle/fluid/imperative/base.py | 3 +- python/paddle/fluid/imperative/layers.py | 11 ++- python/paddle/fluid/layers/nn.py | 46 +++++++++++ .../fluid/tests/unittests/test_imperative.py | 79 ++++++++++++++++++- 10 files changed, 167 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 61250376807..342cb68ab2b 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -188,11 +188,13 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { std::vector ret; for (size_t i = 0; i < input_vars_->size(); ++i) { bool found = false; + VarBase* origin_var = (*input_vars_)[i]; for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { Variable* var = scope->FindVar(outvar); - VarBase* origin_var = (*input_vars_)[i]; std::string orig_var = grad_to_var_->at(outvar); - PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); + if (origin_var->var_desc_->Name() != orig_var) { + continue; + } VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; origin_var->ApplyGrad(scope, var); found = true; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 433d07c0e5a..97772dc1101 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -43,9 +43,12 @@ void CreateGradOp(const framework::OpDesc& op_desc, class Tracer { public: - explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + explicit Tracer(framework::BlockDesc* root_block, + framework::BlockDesc* startup_block) + : root_block_(root_block), startup_block_(startup_block) { root_scope_ = new framework::Scope(); scopes_[root_block_] = root_scope_; + scopes_[startup_block_] = root_scope_; } virtual ~Tracer() { delete root_scope_; } @@ -80,6 +83,8 @@ class Tracer { } else { op->pre_ops_->push_back(nullptr); } + VLOG(3) << "input vname " << vname << " " + << var->Get().dims().size(); } *op->output_vars_ = outputs; @@ -98,12 +103,19 @@ class Tracer { outputs[i]->pre_op_ = op; outputs[i]->pre_op_out_idx_ = i; } + + VLOG(3) << "tracer running " << op_desc->Type(); op_base->Run(*scope, platform::CPUPlace()); - 
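// [editor's note, not part of the original patch] The branch directly below
// is what keeps parameter-initialization ops out of the backward pass: an op
// traced while the startup block is active records a null grad_op_desc_, so
// ApplyGrad later has nothing to run for it. A minimal sketch of the same
// bookkeeping, with every name invented for illustration:
#include <string>
#include <vector>

struct TracedOp {
  std::string type;
  bool needs_grad;  // false when the op was traced from the startup block
};

// Forward ops from the main block get gradients; startup-block ops do not.
inline void RecordOp(std::vector<TracedOp>* tape, const std::string& type,
                     bool in_startup_block) {
  tape->push_back({type, !in_startup_block});
}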
framework::OpDesc* grad_op_desc;
-    auto grad_to_var = new std::unordered_map<std::string, std::string>();
-    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
-    op->grad_op_desc_ = grad_op_desc;
-    op->grad_to_var_ = grad_to_var;
+    if (block == startup_block_) {
+      op->grad_op_desc_ = nullptr;
+      op->grad_to_var_ = nullptr;
+    } else {
+      framework::OpDesc* grad_op_desc;
+      auto grad_to_var = new std::unordered_map<std::string, std::string>();
+      CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
+      op->grad_op_desc_ = grad_op_desc;
+      op->grad_to_var_ = grad_to_var;
+    }
     op->block_ = block;
   }
@@ -121,6 +133,7 @@ class Tracer {
  private:
   std::map<framework::BlockDesc*, framework::Scope*> scopes_;
   framework::BlockDesc* root_block_;
+  framework::BlockDesc* startup_block_;
   framework::Scope* root_scope_;
 };
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 8a111e6065b..271428408cb 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -49,7 +49,8 @@ class MulOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_GT(
         y_dims.size(), y_num_col_dims,
         "The input tensor Y's rank of MulOp should be larger than "
-        "y_num_col_dims.");
+        "y_num_col_dims: %ld vs %ld",
+        y_dims.size(), y_num_col_dims);
 
     auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
     auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 34e9c897d9e..be63fb87786 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -24,8 +24,9 @@ namespace pybind {
 void BindTracer(pybind11::module *m) {
   pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
       .def("__init__",
-           [](imperative::Tracer &self, framework::BlockDesc *root_block) {
-             new (&self) imperative::Tracer(root_block);
+           [](imperative::Tracer &self, framework::BlockDesc *root_block,
+              framework::BlockDesc *startup_block) {
+             new (&self) imperative::Tracer(root_block, startup_block);
            })
       .def("trace", &imperative::Tracer::Trace)
       .def("get_scope", &imperative::Tracer::GetScope,
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index b2c3e7c989c..6303be003a7 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -489,8 +489,11 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
     grad_to_var = dict()
 
     op_desc = _create_op_desc_(
-        "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, {
-            "shape": [1],
+        "fill_constant",
+        {},
+        {"Out": [_append_grad_suffix_(loss.name)]},
+        {
+            "shape": [1],  # TODO(panyx0718): This can be loss.shape.
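# [editor's note, not part of the original patch] This fill_constant op seeds
# backpropagation: since d(loss)/d(loss) == 1, the gradient variable of the
# loss is materialized as a tensor of ones before any real gradient op runs.
# Roughly equivalent to the following, where loss_grad is an invented name:
#     loss_grad = numpy.ones([1], dtype=loss.dtype)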
"value": 1.0, "dtype": loss.dtype, "force_cpu": False, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d0bd78454db..bcffa9fe9c5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1316,6 +1316,9 @@ class Block(object): def _prepend_op(self, *args, **kwargs): op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) + if _in_imperative_mode(): + _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], + [v._ivar for v in op.outputs], self.desc) self.ops.insert(0, op) return op diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index 15d38ddb56c..aa48ef71aa6 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -28,7 +28,8 @@ def enabled(): def guard(): train = framework.Program() startup = framework.Program() - tracer = core.Tracer(train.current_block().desc) + tracer = core.Tracer(train.current_block().desc, + startup.current_block().desc) with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 1a28f7f4ae3..044717c3197 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -25,11 +25,9 @@ __all__ = ['PyLayer'] class PyLayer(core.Layer): def __init__(self): - pass + self._built = False def __call__(self, inputs): - # TODO(panyx0718): Support declarative mode as well. - assert base.enabled() if not isinstance(inputs, list) and not isinstance(inputs, tuple): inputs = [inputs] @@ -37,8 +35,15 @@ class PyLayer(core.Layer): for x in inputs: py_var = base.to_variable(x) var_inputs.append(py_var) + if not self._built: + self._build_once(inputs) + self._built = True + outputs = self.forward(var_inputs) return outputs + def _build_once(self, inputs): + pass + def forward(self, inputs): return [] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4d8311a0d3a..771d41a5c16 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -29,6 +29,7 @@ from . import utils from .. import unique_name from functools import reduce from .. 
import core +from ..imperative import layers __all__ = [ 'fc', @@ -9426,3 +9427,48 @@ def huber_loss(input, label, delta): 'Residual': residual}, attrs={'delta': delta}) return out + + +class FC(layers.PyLayer): + def __init__(self, + size, + param_attr=None, + num_flatten_dims=1, + dtype=core.VarDesc.VarType.FP32): + super(FC, self).__init__() + self._size = size + self._num_flatten_dims = num_flatten_dims + self._dtype = dtype + self._helper = LayerHelper('FC', param_attr=param_attr) + + def _build_once(self, inputs): + input_shape = inputs[0].shape + param_shape = [ + reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) + ] + [self._size] + self._w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, inputs): + tmp = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="mul", + inputs={"X": inputs[0], + "Y": self._w}, + outputs={"Out": tmp}, + attrs={ + "x_num_col_dims": self._num_flatten_dims, + "y_num_col_dims": 1 + }) + + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="sum", + inputs={"X": [tmp]}, + outputs={"Out": out}, + attrs={"use_mkldnn": False}) + return out + diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index b5b6305155d..0fe69d1bd4b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -12,12 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import unittest -import sys import numpy as np import paddle.fluid as fluid from paddle.fluid import core +from paddle.fluid.layers.nn import FC + + +@contextlib.contextmanager +def new_program_scope(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield class MyLayer(fluid.imperative.PyLayer): @@ -30,6 +41,23 @@ class MyLayer(fluid.imperative.PyLayer): return [fluid.layers.elementwise_mul(x, x)] +class MLP(fluid.imperative.PyLayer): + def __init__(self): + super(MLP, self).__init__() + self._fc1 = FC(3, + fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + self._fc2 = FC(4, + fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + + def forward(self, inputs): + x = self._fc1(inputs[0]) + x = self._fc2(x) + x = fluid.layers.reduce_sum(x) + return x + + class TestImperative(unittest.TestCase): def test_layer(self): with fluid.imperative.guard(): @@ -39,13 +67,56 @@ class TestImperative(unittest.TestCase): l.forward([]) def test_layer_in_out(self): + np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): l = MyLayer() - x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] + x = l(np_inp)[0] self.assertIsNotNone(x) - sys.stderr.write("%s output: %s\n" % (x, x._numpy())) + dy_out = x._numpy() x._backward() - sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) + dy_grad = l._x_for_debug._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[3], append_batch_size=False) + l = MyLayer() + x = l(inp)[0] + param_grads = fluid.backward.append_backward( + x, parameter_list=[l._x_for_debug.name])[0] + exe = fluid.Executor(fluid.CPUPlace()) + + static_out, static_grad = exe.run( + 
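                # NOTE(editor): this test uses a parity pattern that repeats
                # throughout the imperative work: run the model eagerly, then
                # rebuild it as a static graph and assert both agree. A sketch
                # of the pattern (illustrative; `l` is a layer, `np_inp` a
                # numpy array, and `new_program_scope` the helper defined
                # above):
                #
                #   with fluid.imperative.guard():
                #       dy = l(np_inp)[0]._numpy()       # define-by-run
                #   with new_program_scope():
                #       st = exe.run(feed=..., fetch_list=[...])[0]  # graph mode
                #   assert np.allclose(dy, st)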
feed={inp.name: np_inp}, + fetch_list=[x.name, param_grads[1].name]) + + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad, static_grad)) + + def test_mlp(self): + np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + with fluid.imperative.guard(): + mlp = MLP() + out = mlp(np_inp) + dy_out = out._numpy() + out._backward() + dy_grad = mlp._fc1._w._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[2, 2], append_batch_size=False) + mlp = MLP() + out = mlp(inp) + param_grads = fluid.backward.append_backward( + out, parameter_list=[mlp._fc1._w.name])[0] + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + + static_out, static_grad = exe.run( + feed={inp.name: np_inp}, + fetch_list=[out.name, param_grads[1].name]) + + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad, static_grad)) if __name__ == '__main__': -- GitLab From e0c3c56b0664ee92e5eb86dca810c029e5cd1d67 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 20:29:49 +0800 Subject: [PATCH 0333/2367] add nce remote ut, test=develop --- .../unittests/test_nce_remote_table_op.py | 68 ++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py index 5e440bf35d2..b5f93f93a1b 100644 --- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -27,6 +27,45 @@ from paddle.fluid.op import Operator from paddle.fluid.framework import Program, program_guard +def nce(input, weight, bias, sample_weight, labels, num_classes, + num_sample_class): + samples = [] + sample_labels = [] + batch_size = input.shape[0] + num_true_class = labels.shape[1] + for i in range(batch_size): + w = 1 if sample_weight is None else sample_weight[i] + for label in labels[i]: + samples.append((i, label, True, w)) + sample_labels.append(label) + for num in range(num_sample_class): + samples.append((i, num, False, w)) + sample_labels.append(num) + # forward bias + sample_out = np.zeros(len(samples)).astype(np.float32) + if bias is not None: + for i in range(len(samples)): + sample_out[i] = bias[samples[i][1]] + # forward weight + for i in range(len(samples)): + sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]]) + + # forward activation + sample_out = 1.0 / (1.0 + np.exp(-sample_out)) + # forward cost + out = np.zeros(batch_size).astype(np.float32) + b = 1.0 / num_classes * num_sample_class + + for i in range(len(samples)): + o = sample_out[i] + cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b)) + out[samples[i][0]] += cost * samples[i][3] + return (out[:, np.newaxis], np.array(sample_out).reshape( + batch_size, num_sample_class + num_true_class), + np.array(sample_labels).reshape(batch_size, + num_sample_class + num_true_class)) + + def run_pserver(pserver_id, use_cuda, sync_mode): scope = fluid.core.Scope() program = Program() @@ -94,11 +133,11 @@ class TestListenAndServOp(unittest.TestCase): with fluid.scope_guard(scope): with program_guard(program, startup_program=Program()): x = scope.var('Input').get_tensor() - x_array = np.random.random((4, 8)).astype("float32") * 2 + x_array = np.random.random((4, 8)).astype("float32") x.set(x_array, place) # create and initialize Param Variable param = scope.var('Weight').get_tensor() - param_array = 
np.zeros((5, 8)).astype("float32") * 2 + param_array = np.zeros((5, 8)).astype("float32") param.set(param_array, place) bias = scope.var('Bias').get_tensor() @@ -110,7 +149,7 @@ class TestListenAndServOp(unittest.TestCase): sample_w.set(sample_weight, place) label = scope.var('Label').get_tensor() - label_array = np.array([0, 1, 4, 5]) + label_array = np.array([[0], [1], [4], [3]]) label.set(label_array, place) cost = scope.var('Cost').get_tensor() @@ -122,7 +161,7 @@ class TestListenAndServOp(unittest.TestCase): sample_l.set(sample_l_w, place) sample_la = scope.var('SampleLabels').get_tensor() - sample_la_w = np.zeros((4, 3)).astype("float32") + sample_la_w = np.zeros((4, 3)).astype("int") sample_la.set(sample_la_w, place) emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] @@ -139,11 +178,12 @@ class TestListenAndServOp(unittest.TestCase): Cost='Cost', SampleLogits='SampleLogits', SampleLabels='SampleLabels', + SampleWeight='SampleWeight', num_total_classes=5, num_neg_samples=2, custom_neg_classes=list(range(2)), sampler=0, - seed=1, + seed=0, is_sparse=True, remote_prefetch=True, epmap=emaps, @@ -153,9 +193,21 @@ class TestListenAndServOp(unittest.TestCase): nce_op.run(scope, place) # get and compare result - o_cost = np.array(cost_w) - o_logits = np.array(sample_l) - o_labels = np.array(sample_la) + o_cost = np.array(scope.var('Cost').get_tensor()) + o_logits = np.array(scope.var('SampleLogits').get_tensor()) + o_labels = np.array(scope.var('SampleLabels').get_tensor()) + + param_array = np.ones((5, 8)).astype("float32") + for i in range(2): + param_array[i] *= param_array[i] * i + 0 * 10 + 1 + for i in range(2, 5): + param_array[i] *= param_array[i] * i + 1 * 10 + 1 + out = nce(x_array, param_array, bias_array, sample_weight, + label_array, 5, 2) + + self.assertAlmostEqual(o_cost.all(), out[0].all(), delta=1e-6) + self.assertAlmostEqual(o_logits.all(), out[1].all(), delta=1e-6) + self.assertAlmostEqual(o_labels.all(), out[2].all(), delta=1e-6) def test_nce_op_remote(self): os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" -- GitLab From 3f9c429ea0b41292db0b80f4a150b62d2e78b192 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 20:31:23 +0800 Subject: [PATCH 0334/2367] fix lint test=develop --- python/paddle/fluid/layers/nn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 771d41a5c16..d8bc919784b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9471,4 +9471,3 @@ class FC(layers.PyLayer): outputs={"Out": out}, attrs={"use_mkldnn": False}) return out - -- GitLab From b2f789c66dc847d9fbc030a2db218be670e7752f Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 18 Dec 2018 12:47:58 +0000 Subject: [PATCH 0335/2367] add test transpiler dist test, test=develop --- .../tests/unittests/test_dist_transpiler.py | 43 +++++++++++++++---- .../fluid/transpiler/distribute_transpiler.py | 2 +- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 27575897b54..f572d692778 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -879,29 +879,36 @@ class TestRemoteNce(TestDistLookupTableBase): class TestRemoteHsigmoid(TestDistLookupTableBase): def network_with_table(self, is_sparse, is_distributed): - num_total_classes = 10 + num_total_classes = 3 - 
input = fluid.layers.data(name="input", shape=[10], dtype="float32") + input = fluid.layers.data(name="input", shape=[1], dtype="float32") label = fluid.layers.data(name="label", shape=[1], dtype="int64") path_table = fluid.layers.data( - name='path_table', shape=[10], dtype='int64') + name='path_table', shape=[3], dtype='int64') path_code = fluid.layers.data( - name='path_code', shape=[10], dtype='int64') + name='path_code', shape=[3], dtype='int64') w_param = fluid.default_main_program().global_block().create_parameter( shape=[num_total_classes, 10], dtype='float32', name='hs_w', initializer=fluid.initializer.ConstantInitializer()) b_param = fluid.default_main_program().global_block().create_parameter( - shape=[num_total_classes, 1], + shape=[3, 1], dtype='float32', name='hs_b', initializer=fluid.initializer.ConstantInitializer()) - cost = fluid.layers.hsigmoid( + emb = fluid.layers.embedding( input=input, + is_sparse=is_sparse, + size=[3, 3], + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(num_total_classes)))) + + cost = fluid.layers.hsigmoid( + input=emb, label=label, - num_classes=non_leaf_num, + num_classes=num_total_classes, path_table=path_table, path_code=path_code, is_custom=True, @@ -918,9 +925,29 @@ class TestRemoteHsigmoid(TestDistLookupTableBase): def transpiler_test_impl(self): trainer, _ = self.get_trainer() + params_to_check = list() for op in trainer.blocks[0].ops: - if op.type == "recv": + if op.type == "hierarchical_sigmoid": + params_to_check = [op.input("W")[0], op.input("Bias")[0]] + for name in ["epmap", "table_names", "epmap"]: + assert op.has_attr(name) + if name == "epmap": + assert op.attr(name)[0] == u'127.0.0.1:6174' + elif name == "table_names": + assert op.attr(name)[0] == u'hierarchical_sigmoid_0.w_0' + else: + assert op.attr(name) == 3 + elif op.type == "lookup_table": + params_to_check.append(op.input("W")[0]) + else: pass + op_count = 0 + for op in trainer.blocks[0].ops: + if op.type == "recv": + assert len(op.output("Out")) == 1 + assert op.output("Out")[0] == u'hierarchical_sigmoid_0.b_0' + op_count += 1 + assert op_count == 1 if __name__ == "__main__": diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 378654ab5b1..f5ca3dffb73 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -242,7 +242,7 @@ class DistributeTranspiler(object): def _get_all_remote_sparse_update_op(self, main_program): sparse_update_ops = [] - sparse_update_op_types = ["lookup_table", "nce"] + sparse_update_op_types = ["lookup_table", "nce", "hierarchical_sigmoid"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( 'remote_prefetch') is True: -- GitLab From 19a8d965858173789376248b076fc0339422d313 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 18 Dec 2018 13:18:11 +0000 Subject: [PATCH 0336/2367] fix nce in test_dist_transpiler, test=develop --- python/paddle/fluid/tests/unittests/test_dist_transpiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 73795a21549..0555db4cba4 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -871,8 +871,8 @@ class TestRemoteNce(TestDistLookupTableBase): def 
transpiler_test_impl(self): trainer, _ = self.get_trainer() - out_vars = ["nce_w.block0", "nce_w.block1"] - in_vars = ["nce_b.block0", "nce_b.block1"] + out_vars = ["nce_w"] + in_vars = ["nce_b"] recv_var_names = [] -- GitLab From f7fb937bfe64a1017f0b4c87706e6655764c775d Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 21:29:47 +0800 Subject: [PATCH 0337/2367] fix in cmake, test=develop --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6d6fe245d8a..950029ed94f 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -21,6 +21,8 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) + LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op) + LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op) endif(NOT WITH_DISTRIBUTE) if (NOT ${WITH_GPU}) @@ -32,7 +34,6 @@ endif() list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 -list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 -- GitLab From 6648995f53d69ae1a1b9f0cf4b9fef000fadd54b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 17 Dec 2018 15:32:55 +0000 Subject: [PATCH 0338/2367] fix build --- paddle/fluid/operators/crf_decoding_op.h | 4 +- .../elementwise/elementwise_mul_mkldnn_op.cc | 2 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 46 ++++++++-------- .../fluid/operators/fused/fusion_lstm_op.cc | 55 +++++++++---------- paddle/fluid/operators/jit/helper.h | 8 +-- paddle/fluid/operators/layer_norm_op.h | 2 +- paddle/fluid/operators/math/fc_compute.h | 5 +- 7 files changed, 61 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 860d71e1fe6..9b90ba749bd 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -82,8 +82,8 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - auto ker = jit::Get( - tag_num); + auto ker = jit::Get, + platform::CPUPlace>(tag_num); ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index 71f4b71330a..2ade5818a99 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -108,7 +108,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - auto multiply 
= jit::Get, platform::CPUPlace>(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index d44a7ad83e8..3f27ccb2ef4 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -183,29 +183,29 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const jit::gru_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("activation"))); \ - jit::gru_t one_step; \ - auto ComputeH1 = \ - jit::Get(attr); \ - auto ComputeHtPart1 = \ - jit::Get(attr); \ - auto ComputeHtPart2 = \ - jit::Get(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = \ + jit::Get, platform::CPUPlace>(attr); \ + auto ComputeHtPart1 = \ + jit::Get, platform::CPUPlace>(attr); \ + auto ComputeHtPart2 = \ + jit::Get, platform::CPUPlace>(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index a62f4d18c2b..a3f021ed9d0 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -236,33 +236,32 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const jit \ - : lstm_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("candidate_activation")), \ - jit::to_kerneltype(ctx.Attr("cell_activation")), \ - use_peepholes); \ - math::jitkernel::lstm_t one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - auto ComputeC1H1 = \ - jit::Get(attr); \ - auto ComputeCtHt = \ - jit::Get(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = 
wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit::lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + jit::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = \ + jit::Get, platform::CPUPlace>(attr); \ + auto ComputeCtHt = \ + jit::Get, platform::CPUPlace>(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ @@ -434,7 +433,7 @@ class FuisonLSTMKernel : public framework::OpKernel { one_step.ct_1 = cur_prev_c_data; one_step.ct = cur_c_out_data; one_step.ht = cur_h_out_data; - ComputeC1H1(&one_step, &attr); + ComputeCtHt(&one_step, &attr); // move one batch cur_in_data += D4; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 275170ca2b5..38bc7cd8e89 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -32,7 +32,7 @@ inline typename std::enable_if< std::is_same::value && std::is_same::value, typename KernelTuples::func_type>::type -GetJitCode(typename KernelTuples::attr_type attr) { +GetJitCode(const typename KernelTuples::attr_type& attr) { using Func = typename KernelTuples::func_type; using Attr = typename KernelTuples::attr_type; size_t key = JitCodeKey(attr); @@ -68,7 +68,7 @@ inline typename std::enable_if< !std::is_same::value || !std::is_same::value, typename KernelTuples::func_type>::type -GetJitCode(typename KernelTuples::attr_type attr) { +GetJitCode(const typename KernelTuples::attr_type& attr) { return nullptr; } @@ -93,8 +93,8 @@ inline typename KernelTuples::func_type GetRefer() { template -// TODO(TJ): const & attr -typename KernelTuples::func_type Get(typename KernelTuples::attr_type attr) { +typename KernelTuples::func_type Get( + const typename KernelTuples::attr_type& attr) { auto jitfunc = GetJitCode(attr); if (jitfunc) { return jitfunc; diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index bb00ed47293..0651dbf6a93 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -230,7 +230,7 @@ class LayerNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(bias->numel(), right); auto ker = - jit::Get( + jit::Get, platform::CPUPlace>( right); ker(x.data(), out.data(), mean->data(), var->data(), scale->data(), bias->data(), static_cast(left), diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 5e3093c69d3..bdb0d8511c9 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -31,13 +31,14 @@ inline void FCCompute(const BlasT& blas, const int M, } if (relu) { auto compute = - jit::Get(N); + jit::Get, platform::CPUPlace>(N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; compute(B, dst, dst, N); } } else { - auto compute = jit::Get(N); + auto compute = + jit::Get, platform::CPUPlace>(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif -- GitLab From 10c340c9a39e3ca54268cb4d7134bef514ff689c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 18 Dec 2018 
14:40:34 +0000
Subject: [PATCH 0339/2367] fix conflicts

---
 paddle/fluid/operators/jit/gen/act.cc | 41 ++++++++++++++-------------
 paddle/fluid/operators/jit/gen/act.h  |  1 -
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc
index f3332cbefa7..391cf57d8a7 100644
--- a/paddle/fluid/operators/jit/gen/act.cc
+++ b/paddle/fluid/operators/jit/gen/act.cc
@@ -21,26 +21,27 @@ namespace operators {
 namespace jit {
 namespace gen {

-const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
-                                          REPEAT_8TIMES(2.f),
-                                          REPEAT_8TIMES(0.5f),
-                                          REPEAT_8TIMES(EXP_HIG),
-                                          REPEAT_8TIMES(EXP_LOW),
-                                          REPEAT_8TIMES(CEPHES_LOG2EF),
-                                          REPEAT_8TIMES(CEPHES_EXP_C1),
-                                          REPEAT_8TIMES(CEPHES_EXP_C2),
-                                          REPEAT_8TIMES(CEPHES_EXP_P0),
-                                          REPEAT_8TIMES(CEPHES_EXP_P1),
-                                          REPEAT_8TIMES(CEPHES_EXP_P2),
-                                          REPEAT_8TIMES(CEPHES_EXP_P3),
-                                          REPEAT_8TIMES(CEPHES_EXP_P4),
-                                          REPEAT_8TIMES(CEPHES_EXP_P5),
-                                          REPEAT_8TIMES(EXP_MAX_INPUT),
-                                          REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
-                                          REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
-
-const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
-int g_tmp_mem[16] ALIGN32 = {0};
+const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
+    REPEAT_8TIMES(1.f),
+    REPEAT_8TIMES(2.f),
+    REPEAT_8TIMES(0.5f),
+    REPEAT_8TIMES(EXP_HIG),
+    REPEAT_8TIMES(EXP_LOW),
+    REPEAT_8TIMES(CEPHES_LOG2EF),
+    REPEAT_8TIMES(CEPHES_EXP_C1),
+    REPEAT_8TIMES(CEPHES_EXP_C2),
+    REPEAT_8TIMES(CEPHES_EXP_P0),
+    REPEAT_8TIMES(CEPHES_EXP_P1),
+    REPEAT_8TIMES(CEPHES_EXP_P2),
+    REPEAT_8TIMES(CEPHES_EXP_P3),
+    REPEAT_8TIMES(CEPHES_EXP_P4),
+    REPEAT_8TIMES(CEPHES_EXP_P5),
+    REPEAT_8TIMES(EXP_MAX_INPUT),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
+
+const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
+int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};

 void VActJitCode::genCode() {
   int offset = 0;
diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h
index 63dee7bc0dd..c35579c3adf 100644
--- a/paddle/fluid/operators/jit/gen/act.h
+++ b/paddle/fluid/operators/jit/gen/act.h
@@ -27,7 +27,6 @@ extern const float exp_float_consts[];
 extern const int exp_int_0x7f[];
 extern int g_tmp_mem[];

-#define ALIGN32 __attribute__((aligned(32)))
 #define EXP_HIG 88.3762626647949f
 #define EXP_LOW -88.3762626647949f
 #define CEPHES_LOG2EF 1.44269504088896341
-- 
GitLab


From 4cc7707d281931d254a377e29c5f9fe37a6a993a Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Tue, 18 Dec 2018 14:05:19 +0000
Subject: [PATCH 0340/2367] add crf_decoding and layer norm intrinsic code

---
 .../fluid/operators/jit/more/CMakeLists.txt   |   4 +
 .../jit/more/intrinsic/CMakeLists.txt         |   9 ++
 .../jit/more/intrinsic/crf_decoding.cc        | 138 ++++++++++++++++++
 .../jit/more/intrinsic/crf_decoding.h         |  89 +++++++++++
 paddle/fluid/operators/jit/more/more.h        |  15 --
 5 files changed, 240 insertions(+), 15 deletions(-)
 create mode 100644 paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
 create mode 100644 paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
 create mode 100644 paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
 delete mode 100644 paddle/fluid/operators/jit/more/more.h

diff --git a/paddle/fluid/operators/jit/more/CMakeLists.txt b/paddle/fluid/operators/jit/more/CMakeLists.txt
index 5bb78b93045..a740d1a840f 100644
--- a/paddle/fluid/operators/jit/more/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/CMakeLists.txt
@@ -7,4 +7,8 @@ if(WITH_MKLML)
   add_subdirectory(mkl)
 endif()

+if(WITH_AVX)
+
add_subdirectory(intrinsic) +endif() + set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt new file mode 100644 index 00000000000..c4a50138636 --- /dev/null +++ b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt @@ -0,0 +1,9 @@ + +file(GLOB jit_kernel_cc_intrinsic RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") +cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_base) + +set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE) + +# use mkl kernels by name and type +USE_JITKERNEL_MORE(crfdecoding, intrinsic) +USE_JITKERNEL_MORE(layernorm, intrinsic) diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc new file mode 100644 index 00000000000..016fca38686 --- /dev/null +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h" +#include "paddle/fluid/operators/jit/refer/refer.h" +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace more { +namespace mkl { + +template <> +void VMul(const float* x, const float* y, float* z, int n) { + platform::dynload::vsMul(n, x, y, z); +} + +template <> +void VMul(const double* x, const double* y, double* z, int n) { + platform::dynload::vdMul(n, x, y, z); +} + +template <> +void VAdd(const float* x, const float* y, float* z, int n) { + platform::dynload::vsAdd(n, x, y, z); +} + +template <> +void VAdd(const double* x, const double* y, double* z, int n) { + platform::dynload::vdAdd(n, x, y, z); +} + +template <> +void VScal(const float* a, const float* x, float* y, int n) { + if (x == y) { + platform::dynload::cblas_sscal(n, *a, y, 1); + } else { + refer::VScal(a, x, y, n); + } +} + +template <> +void VScal(const double* a, const double* x, double* y, int n) { + if (x == y) { + platform::dynload::cblas_dscal(n, *a, y, 1); + } else { + refer::VScal(a, x, y, n); + } +} + +template <> +void VExp(const float* x, float* y, int n) { + platform::dynload::vsExp(n, x, y); +} + +template <> +void VExp(const double* x, double* y, int n) { + platform::dynload::vdExp(n, x, y); +} + +// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 +template <> +bool VMulKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +template <> +bool VAddKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +template <> +bool VScalKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +template <> +bool VExpKernel::UseMe(int d) const { + return d > 7; +} + +template <> +bool 
VSigmoidKernel::UseMe(int d) const { + return d > 7; +} + +template <> +bool VTanhKernel::UseMe(int d) const { + return d > 7; +} + +#define AWALYS_USE_ME_WITH_DOUBLE(func) \ + template <> \ + bool func##Kernel::UseMe(int d) const { \ + return true; \ + } + +AWALYS_USE_ME_WITH_DOUBLE(VMul); +AWALYS_USE_ME_WITH_DOUBLE(VAdd); +AWALYS_USE_ME_WITH_DOUBLE(VScal); +AWALYS_USE_ME_WITH_DOUBLE(VExp); +AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); +AWALYS_USE_ME_WITH_DOUBLE(VTanh); + +#undef AWALYS_USE_ME_WITH_DOUBLE +} // namespace mkl +} // namespace more +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace mkl = paddle::operators::jit::more::mkl; + +#define REGISTER_MKL_KERNEL(key, func) \ + REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel, \ + mkl::func##Kernel) + +REGISTER_MKL_KERNEL(vmul, VMul); +REGISTER_MKL_KERNEL(vadd, VAdd); +REGISTER_MKL_KERNEL(vscal, VScal); +REGISTER_MKL_KERNEL(vexp, VExp); +REGISTER_MKL_KERNEL(vsigmoid, VSigmoid); +REGISTER_MKL_KERNEL(vtanh, VTanh); + +#undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h new file mode 100644 index 00000000000..bf209d2f9d2 --- /dev/null +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/operators/jit/kernel_base.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace more { +namespace mkl { + +template +void VMul(const T* x, const T* y, T* z, int n); + +template +void VAdd(const T* x, const T* y, T* z, int n); + +template +void VScal(const T* a, const T* x, T* y, int n); + +template +void VExp(const T* x, T* y, int n); + +template +void VSigmoid(const T* x, T* y, int n) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
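// NOTE(editor): together with the exp/normalize steps below, this clamp
// evaluates sigmoid(x) = 1 / (1 + exp(-clip(x, min, max))). Clipping to
// [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX] keeps exp() from
// overflowing in float; outside that range sigmoid is already saturated
// near 0 or 1, so the result is unaffected. A scalar sketch of the same
// computation (illustrative, not part of the patch):
//
//   float sigmoid_ref(float x) {
//     x = std::max(SIGMOID_THRESHOLD_MIN, std::min(SIGMOID_THRESHOLD_MAX, x));
//     return 1.f / (1.f + std::exp(-x));
//   }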
max : x[i]); + y[i] = static_cast(0) - y[i]; + } + VExp(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } +} + +template +void VTanh(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + +#define DECLARE_MKL_KERNEL(name, tuples) \ + template \ + class name##Kernel : public KernelImpl> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool UseMe(typename tuples::attr_type) const override; \ + } + +// XYZN +DECLARE_MKL_KERNEL(VMul, XYZNTuples); +DECLARE_MKL_KERNEL(VAdd, XYZNTuples); + +// AXYN +DECLARE_MKL_KERNEL(VScal, AXYNTuples); + +// XYN +DECLARE_MKL_KERNEL(VExp, XYNTuples); +DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); +DECLARE_MKL_KERNEL(VTanh, XYNTuples); + +#undef DECLARE_MKL_KERNEL + +} // namespace mkl +} // namespace more +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/more/more.h b/paddle/fluid/operators/jit/more/more.h deleted file mode 100644 index ab99fdc05f9..00000000000 --- a/paddle/fluid/operators/jit/more/more.h +++ /dev/null @@ -1,15 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once -- GitLab From b1516783ea98c9770761be59b1de515a5bbfc521 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 18 Dec 2018 15:29:25 +0000 Subject: [PATCH 0341/2367] enable crf decoding intrinsic code --- paddle/fluid/operators/jit/README.md | 4 + .../jit/more/intrinsic/CMakeLists.txt | 1 - .../jit/more/intrinsic/crf_decoding.cc | 241 ++++++++++-------- .../jit/more/intrinsic/crf_decoding.h | 68 +---- 4 files changed, 150 insertions(+), 164 deletions(-) diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index ce31f18b63c..1264bc96ee6 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -19,6 +19,10 @@ PaddlePaddle/Paddle/paddle/fluid/ │ ├── ... │ ├── mkl/ │ │ └── ... + │ ├── mkldnn/ + │ │ └── ... + │ ├── intrinsic/ + │ │ └── ... │ └── openblas/ │ └── ... 
└── refer/ diff --git a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt index c4a50138636..de83d80e775 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt @@ -6,4 +6,3 @@ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE) # use mkl kernels by name and type USE_JITKERNEL_MORE(crfdecoding, intrinsic) -USE_JITKERNEL_MORE(layernorm, intrinsic) diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc index 016fca38686..17b5eaf13df 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc @@ -13,7 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h" -#include "paddle/fluid/operators/jit/refer/refer.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -21,118 +21,151 @@ namespace paddle { namespace operators { namespace jit { namespace more { -namespace mkl { - -template <> -void VMul(const float* x, const float* y, float* z, int n) { - platform::dynload::vsMul(n, x, y, z); -} - -template <> -void VMul(const double* x, const double* y, double* z, int n) { - platform::dynload::vdMul(n, x, y, z); -} - -template <> -void VAdd(const float* x, const float* y, float* z, int n) { - platform::dynload::vsAdd(n, x, y, z); -} - -template <> -void VAdd(const double* x, const double* y, double* z, int n) { - platform::dynload::vdAdd(n, x, y, z); -} - -template <> -void VScal(const float* a, const float* x, float* y, int n) { - if (x == y) { - platform::dynload::cblas_sscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); +namespace intrinsic { + +void CRFDecoding(const int seq_len, const float* x, const float* w, + float* alpha, int* track, int tag_num) { + const int step_size = + platform::MayIUse(platform::avx512f) ? ZMM_FLOAT_BLOCK : YMM_FLOAT_BLOCK; + const int end = tag_num / step_size; + const int rest = tag_num % step_size; + /* Setup the alpha initial value.*/ + int i_offset = 0; + int last_offset = rest - step_size; + for (int i = 0; i <= end; ++i) { +#ifdef __AVX512F__ + // Declare the variable for the content of weights, input and alpha values. + __m512 w_content, x_content, alpha_content; + // Load the relevant data into the variables from un-aligned address. + w_content = _mm512_loadu_ps(w + i_offset); + x_content = _mm512_loadu_ps(x + i_offset); + alpha_content = _mm512_add_ps(w_content, x_content); + // Save the alpha value. + _mm512_storeu_ps(alpha_value + i_offset, alpha_content); +#else + // AVX or AVX2 + // weights, input and alpha values. + __m256 w_content, x_content, alpha_content; + // Load the relevant data into the variables from un-aligned address. + w_content = _mm256_loadu_ps(w + i_offset); + x_content = _mm256_loadu_ps(x + i_offset); + alpha_content = _mm256_add_ps(w_content, x_content); + _mm256_storeu_ps(alpha + i_offset, alpha_content); +#endif + i_offset += step_size; + if (i == end - 1) { + if (rest > 0) { + i_offset += last_offset; + } else { + break; + } + } } -} - -template <> -void VScal(const double* a, const double* x, double* y, int n) { - if (x == y) { - platform::dynload::cblas_dscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); + // Use the column-major strategy to get the location of maximum score. 
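  // NOTE(editor): the loop below is a vectorized Viterbi forward pass. In
  // scalar form the recurrence is roughly (a sketch only; w is laid out so
  // that row (i + state_trans_base_idx) holds the transition scores out of
  // tag i):
  //
  //   for (int j = 0; j < tag_num; ++j) {           // current tag
  //     float best = -FLT_MAX; int best_i = 0;
  //     for (int i = 0; i < tag_num; ++i) {         // previous tag
  //       float s = alpha[(k - 1) * tag_num + i] + w[(i + 2) * tag_num + j];
  //       if (s > best) { best = s; best_i = i; }
  //     }
  //     alpha[k * tag_num + j] = best + x[k * tag_num + j];
  //     track[k * tag_num + j] = best_i;
  //   }
  //
  // The SIMD code computes step_size values of j at once and keeps the
  // running argmax in a register via compare-and-blend.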
+ int seq_offset = 0; + constexpr int state_trans_base_idx = 2; + for (int k = 1; k < seq_len; ++k) { + int j_offset = 0; + for (int j = 0; j <= end; ++j) { +/* Initialize the variables of maximum score and location.*/ +#ifdef __AVX512F__ + __m512 max_score = _mm512_set1_ps(-std::numeric_limits::max()); + __m512i max_j = _mm512_setzero_si512(); +#else + __m256 max_score = _mm256_set1_ps(-std::numeric_limits::max()); + __m256i max_j = _mm256_set1_epi32(0); +#endif + /* Calculate the offset of transition_weights.*/ + int trans_offset = state_trans_base_idx * tag_num + j_offset; + for (int i = 0; i < tag_num; ++i) { +/* Initalize the content of alpha variable with related offset.*/ +#ifdef __AVX512F__ + __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i)); + /* Obtain the content of weights from un-aligned address.*/ + __m512 w_content = _mm512_loadu_ps(w + trans_offset); + __m512 score_v = _mm512_add_ps(alpha_content, w_content); + __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); + /* AVX512 instructions.*/ + max_j = _mm512_mask_set1_epi32(max_j, mask, i); + /* Update the max_score value.*/ + max_score = _mm512_max_ps(max_score, score_v); + +#else + __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i); + /* Obtain the content of weights from un-aligned address.*/ + __m256 w_content = _mm256_loadu_ps(w + trans_offset); + __m256 score_v = _mm256_add_ps(alpha_content, w_content); + __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); +/* According to the mask value, update the index of the max_score.*/ +#ifdef __AVX2__ + max_j = _mm256_or_si256( + _mm256_andnot_si256((__m256i)mask, max_j), + _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i))); +#else + __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); + __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); + __m128i lo_mask = + _mm256_extractf128_si256(*(__m256i*)&mask, 0); // NOLINT + __m128i hi_mask = + _mm256_extractf128_si256(*(__m256i*)&mask, 1); // NOLINT + lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); + hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); + lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); + hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i)); + lo_max_j = _mm_or_si128(lo_mask, lo_max_j); + hi_max_j = _mm_or_si128(hi_mask, hi_max_j); + max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0); + max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1); +#endif + /* Update the max_score value.*/ + max_score = _mm256_max_ps(max_score, score_v); + +#endif + + trans_offset += tag_num; + } +/* Update the alpha and track values. 
*/
+#ifdef __AVX512F__
+      __m512 x_content =
+          _mm512_loadu_ps(x + seq_offset + this->num_ + j_offset);
+      max_score = _mm512_add_ps(max_score, x_content);
+      _mm512_storeu_ps(alpha + seq_offset + this->num_ + j_offset, max_score);
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset +
+                                                     this->num_ + j_offset),
+                          max_j);
+#else
+      __m256 x_content = _mm256_loadu_ps(x + seq_offset + tag_num + j_offset);
+      max_score = _mm256_add_ps(max_score, x_content);
+      _mm256_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score);
+      _mm256_storeu_si256(
+          reinterpret_cast<__m256i*>(track + seq_offset + tag_num + j_offset),
+          max_j);
+#endif
+
+      /* Calculate the offset of next step*/
+      j_offset += step_size;
+      if (j == end - 1) {
+        if (rest > 0) {
+          j_offset += last_offset;
+        } else {
+          break;
+        }
+      }
+    }
+    seq_offset += tag_num;
+  }
+}
+
+bool CRFDecodingKernel::UseMe(int d) const {
+  return platform::MayIUse(platform::avx);
+}
+
+}  // namespace intrinsic
 }  // namespace more
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle

-namespace mkl = paddle::operators::jit::more::mkl;
-
-#define REGISTER_MKL_KERNEL(key, func)                        \
-  REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel<float>, \
-                          mkl::func##Kernel<double>)
-
-REGISTER_MKL_KERNEL(vmul, VMul);
-REGISTER_MKL_KERNEL(vadd, VAdd);
-REGISTER_MKL_KERNEL(vscal, VScal);
-REGISTER_MKL_KERNEL(vexp, VExp);
-REGISTER_MKL_KERNEL(vsigmoid, VSigmoid);
-REGISTER_MKL_KERNEL(vtanh, VTanh);
+namespace intrinsic = paddle::operators::jit::more::intrinsic;

-#undef REGISTER_MKL_KERNEL
+REGISTER_JITKERNEL_MORE(crfdecoding, intrinsic, intrinsic::CRFDecodingKernel);
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
index bf209d2f9d2..a4081cfc34b 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
@@ -21,68 +21,18 @@ namespace paddle {
 namespace operators {
 namespace jit {
 namespace more {
-namespace mkl {
+namespace intrinsic {

-template <typename T>
-void VMul(const T* x, const T* y, T* z, int n);
+void CRFDecoding(const int seq_len, const float* x, const float* w,
+                 float* alpha, int* track, int tag_num);

-template <typename T>
-void VAdd(const T* x, const T* y, T* z, int n);
+class CRFDecodingKernel : public KernelImpl<CRFDecodingTuples<float>> {
+ public:
+
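  // NOTE(editor): this is the usual shape of a "more" (hand-optimized)
  // kernel in the jit namespace: a plain function wrapped in a KernelImpl
  // subclass, gated by UseMe() and registered under a kernel key. Dispatch
  // in jit::Get then roughly proceeds as follows (a sketch of the mechanism,
  // not a verbatim copy of helper.h):
  //   1. try a generated JitCode for this attribute, if one can be built;
  //   2. otherwise scan the registered "more" kernels and take the first
  //      whose UseMe(attr) returns true (here: any AVX-capable CPU);
  //   3. otherwise fall back to the naive refer implementation.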
CRFDecodingKernel() { this->func = CRFDecoding; } + bool UseMe(typename CRFDecodingTuples::attr_type) const override; +}; -template -void VScal(const T* a, const T* x, T* y, int n); - -template -void VExp(const T* x, T* y, int n); - -template -void VSigmoid(const T* x, T* y, int n) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); - y[i] = static_cast(0) - y[i]; - } - VExp(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } -} - -template -void VTanh(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoid(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - -#define DECLARE_MKL_KERNEL(name, tuples) \ - template \ - class name##Kernel : public KernelImpl> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(typename tuples::attr_type) const override; \ - } - -// XYZN -DECLARE_MKL_KERNEL(VMul, XYZNTuples); -DECLARE_MKL_KERNEL(VAdd, XYZNTuples); - -// AXYN -DECLARE_MKL_KERNEL(VScal, AXYNTuples); - -// XYN -DECLARE_MKL_KERNEL(VExp, XYNTuples); -DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); -DECLARE_MKL_KERNEL(VTanh, XYNTuples); - -#undef DECLARE_MKL_KERNEL - -} // namespace mkl +} // namespace intrinsic } // namespace more } // namespace jit } // namespace operators -- GitLab From 62eb43ba98931f303127441b0f53f142b12f439f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 20:22:56 +0800 Subject: [PATCH 0342/2367] convert more test=develop --- paddle/fluid/framework/operator.cc | 35 ++++++++++++++---------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8c83748668e..5bee6b41bde 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -142,12 +142,14 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, const Scope& scope) { for (auto& var_name_item : innames) { std::vector& input_vars = inputs[var_name_item.first]; + input_vars.reserve(var_name_item.second.size()); for (auto& var_name : var_name_item.second) { input_vars.push_back(scope.FindVar(var_name)); } } for (auto& var_name_item : outnames) { std::vector& output_vars = outputs[var_name_item.first]; + output_vars.reserve(var_name_item.second.size()); for (auto& var_name : var_name_item.second) { output_vars.push_back(scope.FindVar(var_name)); } @@ -556,30 +558,28 @@ class RuntimeInferShapeContext : public InferShapeContext { bool HasOutput(const std::string& name) const override { // has only one output - const auto& outs = op_.Outputs(); + const auto& outs = ctx_.outputs; auto it = outs.find(name); if (it == outs.end()) { return false; } const auto& out = it->second; - if (out.size() == 0 || out[0] == kEmptyVarName) { + if (out.size() == 0) { return false; } PADDLE_ENFORCE_EQ(out.size(), 1UL, "Output %s should not have more than one outputs", name); - return scope_.FindVar(out[0]) != nullptr; + return out[0] != nullptr; } bool HasInputs(const std::string& name) const override { - if (!op_.HasInputs(name)) { - return false; - } - auto inputs = op_.Inputs(name); - if (inputs.empty()) { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end()) { return false; } - for (auto& input : inputs) { - if (scope_.FindVar(input) == nullptr) { + for (auto& input : it->second) { + if (input == nullptr) 
{ return false; } } @@ -587,15 +587,13 @@ class RuntimeInferShapeContext : public InferShapeContext { } bool HasOutputs(const std::string& name) const override { - if (!op_.HasOutputs(name)) { - return false; - } - auto outputs = op_.Outputs(name); - if (outputs.empty()) { + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end()) { return false; } - for (auto& output : outputs) { - if (scope_.FindVar(output) == nullptr) { + for (auto& output : it->second) { + if (output == nullptr) { return false; } } @@ -864,8 +862,7 @@ Scope* OperatorWithKernel::PrepareData( for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; - auto* var = scope.FindVar(var_name); - input_vars[i] = var; + auto* var = input_vars[i]; // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { -- GitLab From 0e0983cc1d9a607ba8a339bbbe9e495e304cd11f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 21:27:04 +0800 Subject: [PATCH 0343/2367] convert more infer shape --- paddle/fluid/framework/operator.cc | 34 ++++++++++++++++++------------ 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 5bee6b41bde..a7bee3344df 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -614,16 +614,19 @@ class RuntimeInferShapeContext : public InferShapeContext { void ShareDim(const std::string& in, const std::string& out, size_t i = 0, size_t j = 0) override { - PADDLE_ENFORCE_LT(i, Inputs(in).size()); - PADDLE_ENFORCE_LT(j, Outputs(out).size()); - const std::string& input_n = Inputs(in)[i]; - const std::string& output_n = Outputs(out)[j]; + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, + "Inputs %s should have %llu argument", in, i); + PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, + "Outputs %s should have %llu argument", out, j); + + Variable* in_var = in_it->second[i]; + Variable* out_var = out_it->second[j]; - Variable* in_var = scope_.FindVar(input_n); - Variable* out_var = scope_.FindVar(output_n); PADDLE_ENFORCE(in_var->Type() == out_var->Type(), - "The type of %s and %s is not the same.", output_n, - GetDim(input_n)); + "The type of %s and %s is not the same.", in_var->Type(), + out_var->Type()); if (in_var->IsType()) { auto& in_sele_rows = in_var->Get(); @@ -644,13 +647,16 @@ class RuntimeInferShapeContext : public InferShapeContext { void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, size_t j = 0) const override { - const std::vector& inputs = Inputs(in); - const std::vector& outputs = Outputs(out); - PADDLE_ENFORCE_LT(i, inputs.size()); - PADDLE_ENFORCE_LT(j, outputs.size()); - Variable* in_var = scope_.FindVar(inputs.at(i)); + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, + "Inputs %s should have %llu argument", in, i); + PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, + "Outputs %s should have %llu argument", out, j); + + Variable* in_var = in_it->second.at(i); if (!in_var->IsType()) return; - Variable* out_var = scope_.FindVar(outputs.at(j)); + Variable* out_var = out_it->second.at(j); PADDLE_ENFORCE(out_var->IsType(), "The %d-th output of Output(%s) must be LoDTensor.", j, out); auto in_tensor = 
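    // NOTE(editor): the pattern in this hunk repeats across the patch
    // series: Has*/ShareDim/ShareLoD no longer call scope.FindVar(name) per
    // query but read Variable* pointers cached once per Run() in
    // RuntimeContext. A simplified sketch of that cache (the real type lives
    // in operator.h):
    //
    //   struct RuntimeContext {
    //     std::map<std::string, std::vector<Variable*>> inputs;
    //     std::map<std::string, std::vector<Variable*>> outputs;
    //   };
    //
    // so repeated shape-inference lookups become pointer reads instead of
    // string-keyed scope walks.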
in_var->Get<LoDTensor>();
-- 
GitLab


From 52d3903a1208747c2e3c97b90bb0f48e08f7a85b Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Wed, 19 Dec 2018 10:03:36 +0800
Subject: [PATCH 0344/2367] fix test=develop

---
 paddle/fluid/framework/operator.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index a7bee3344df..e023d165b03 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -575,7 +575,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
   bool HasInputs(const std::string& name) const override {
     const auto& ins = ctx_.inputs;
     auto it = ins.find(name);
-    if (it == ins.end()) {
+    if (it == ins.end() || it->second.empty()) {
       return false;
     }
     for (auto& input : it->second) {
@@ -589,7 +589,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
   bool HasOutputs(const std::string& name) const override {
     const auto& outs = ctx_.outputs;
     auto it = outs.find(name);
-    if (it == outs.end()) {
+    if (it == outs.end() || it->second.empty()) {
       return false;
     }
     for (auto& output : it->second) {
-- 
GitLab


From 2f3b5054ad9a4fb0f62450c6dca912e0c1306471 Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Wed, 19 Dec 2018 10:31:40 +0800
Subject: [PATCH 0345/2367] fix build script

---
 python/setup.py.in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index bfbaa1d0157..521d108b2c0 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -156,8 +156,8 @@ if '${WITH_FLUID_ONLY}'== 'OFF':

 # put all thirdparty libraries in paddle.libs
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
+package_data['paddle.libs']= []
 if os.name != 'nt':
-    package_data['paddle.libs']= []
     package_data['paddle.libs']=['libwarpctc' + ext_name]
     shutil.copy('${WARPCTC_LIBRARIES}', libs_path)

@@ -169,7 +169,7 @@ else:
 if os.name == 'nt':
     # copy the openblas.dll
     shutil.copy(os.path.dirname('${CBLAS_LIBRARIES}') + '/openblas' + ext_name, libs_path)
-    package_data['paddle.fluid'] += ['openblas' + ext_name]
+    package_data['paddle.libs'] += ['openblas' + ext_name]

 if '${WITH_MKLDNN}' == 'ON':
     if '${CMAKE_BUILD_TYPE}' == 'Release':
-- 
GitLab


From 3f9c429ea0b41292db0b80f4a150b62d2e78b192 Mon Sep 17 00:00:00 2001
From: heqiaozhi
Date: Wed, 19 Dec 2018 10:48:55 +0800
Subject: [PATCH 0346/2367] teacher student sigmoid loss

---
 .../teacher_student_sigmoid_loss_op.cc        | 256 ++++++++++++++++++
 .../teacher_student_sigmoid_loss_op.h         |  25 ++
 python/paddle/fluid/layers/nn.py              |  42 +++
 .../test_teacher_student_sigmoid_loss_op.py   |  70 +++++
 4 files changed, 393 insertions(+)
 create mode 100644 paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
 create mode 100644 paddle/fluid/operators/teacher_student_sigmoid_loss_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py

diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
new file mode 100644
index 00000000000..98eafb9f84e
--- /dev/null
+++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
@@ -0,0 +1,256 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
+                      "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+                      "The 1st dimension of Input(X) and Input(Label) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
+                      "The 2nd dimension of "
+                      "Input(Label) should be 1.");
+    ctx->SetOutputDim("Y", {x_dims[0], 1});
+    ctx->ShareLoD("X", /*->*/ "Y");
+  }
+
+ protected:
+  // Explicitly set that the data type of the computation kernel of
+  // teacher_student_sigmoid_loss
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
+  }
+};
+
+class TeacherStudentSigmoidLossGradientOp
+    : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+                      "The 1st dimension of Input(X) and Input(Label) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
+                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(dy_dims[1], 1,
+                      "The 2nd dimension of Input(Y@Grad) should be 1.");
+    PADDLE_ENFORCE_EQ(label_dims[1], 1,
+                      "The 2nd dimension of Input(Label) should be 1.");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+
+ protected:
+  // Explicitly set that the data type of the computation kernel of
+  // teacher_student_sigmoid_loss
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
+  }
+};
+
+class TeacherStudentSigmoidLossOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape "
+             "[N x 1], where N is the batch size. "
+             "This input is the unscaled prediction (a logit) computed "
+             "by the previous operator.");
+    AddInput("Label",
+             "(Tensor), the ground truth, which is a 2-D tensor "
+             "with shape [N x 1]. ");
+    AddOutput("Y",
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
+              "[N x 1]. The teacher student sigmoid loss.");
+    AddAttr<float>("soft_max_up_bound", "fp32, default 15.0").SetDefault(15.0);
+    AddAttr<float>("soft_max_lower_bound", "fp32, default -15.0")
+        .SetDefault(-15.0);
+    AddComment(R"DOC(
+TeacherStudentSigmoidLoss Operator.
+
+It is similar to the SigmoidCrossEntropyWithLogits operator. The difference is
+that we add another label (z') to the original one.
+    loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))
+    z is whether a click happened or not
+    z' is the q value from feed_fine
+    label = {-2, -1, [0, 2]}
+    when z' does not exist, clk = 0 : label = -2;
+    when z' does not exist, clk = 1 : label = -1;
+    when z' exists        , clk = 0 : label = 0 + z';
+    when z' exists        , clk = 1 : label = 1 + z';
+
+)DOC");
+  }
+};
+
+// template <typename DeviceContext, typename T>
+template <typename T>
+class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
+                   "This kernel only runs on CPU.");
+
+    Tensor* y = context.Output<Tensor>("Y");
+    const Tensor* x = context.Input<Tensor>("X");
+    const Tensor* labels = context.Input<Tensor>("Label");
+    T* y_data = y->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    const T* label_data = labels->data<T>();
+    int64_t batch_size = x->dims()[0];
+    // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' +
+    // log(1 + exp(-abs(x)))
+    // z is whether a click happened or not
+    // z' is the q value from feed_fine
+    // label = {-2, -1, [0, 2]}
+    // when z' does not exist, clk = 0 : label = -2;
+    // when z' does not exist, clk = 1 : label = -1;
+    // when z' exists        , clk = 0 : label = 0 + z';
+    // when z' exists        , clk = 1 : label = 1 + z';
+    for (int i = 0; i < batch_size; ++i) {
+      if (label_data[i] < -1.0) {
+        y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) +
+                    log(1.0 + exp(-fabs(x_data[i])));
+      } else if (label_data[i] < 0.0) {
+        y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] +
+                    log(1.0 + exp(-fabs(x_data[i])));
+      } else if (label_data[i] < 1.0) {
+        y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) +
+                    log(1.0 + exp(-fabs(x_data[i]))) +
+                    (x_data[i] > 0 ? x_data[i] : 0.0) -
+                    x_data[i] * label_data[i] +
+                    log(1.0 + exp(-fabs(x_data[i])));
+      } else {
+        y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] +
+                    log(1.0 + exp(-fabs(x_data[i]))) +
+                    (x_data[i] > 0 ?
x_data[i] : 0.0) - + x_data[i] * (label_data[i] - 1.0) + + log(1.0 + exp(-fabs(x_data[i]))); + } + } + } +}; + +template +class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + const T* x_data = x->data(); + + Tensor* dx = context.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(context.GetPlace()); + + const Tensor* labels = context.Input("Label"); + const T* label_data = labels->data(); + + T soft_max_up_bound = + static_cast(context.Attr("soft_max_up_bound")); + T soft_max_lower_bound = + static_cast(context.Attr("soft_max_lower_bound")); + + int64_t batch_size = x->dims()[0]; + + const framework::Tensor* dOut = + context.Input(framework::GradVarName("Y")); + + const T* dout_data = dOut->data(); + + for (int i = 0; i < batch_size; ++i) { + T sum_val = x_data[i]; + if (sum_val > soft_max_up_bound) { + sum_val = soft_max_up_bound; + } else { + if (sum_val < soft_max_lower_bound) { + sum_val = soft_max_lower_bound; + } + } + + T pred = 1.0 / (1.0 + exp(-sum_val)); + if (label_data[i] < -1.0) { + dx_data[i] = 0.0 - pred; + } else if (label_data[i] < 0.0) { + dx_data[i] = 1.0 - pred; + } else { + dx_data[i] = label_data[i] - 2.0 * pred; + } + if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) { + dx_data[i] = 0; + } + dx_data[i] *= dout_data[i] * -1; + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(teacher_student_sigmoid_loss, + ops::TeacherStudentSigmoidLossOp, + ops::TeacherStudentSigmoidLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad, + ops::TeacherStudentSigmoidLossGradientOp); + +REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss, + ops::TeacherStudentSigmoidLossOpKernel, + ops::TeacherStudentSigmoidLossOpKernel); + +REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss_grad, + ops::TeacherStudentSigmoidLossGradOpKernel, + ops::TeacherStudentSigmoidLossGradOpKernel); diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h new file mode 100644 index 00000000000..77b2760e9cc --- /dev/null +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 9e6cd1a0ab5..68243cf744f 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -176,6 +176,7 @@ __all__ = [
     'get_tensor_from_selected_rows',
     'lstm',
     'psroi_pool',
+    'teacher_student_sigmoid_loss',
 ]
 
 kIgnoreIndex = -100
@@ -9184,6 +9185,47 @@ def log_loss(input, label, epsilon=1e-4, name=None):
     return loss
 
 
+def teacher_student_sigmoid_loss(input,
+                                 label,
+                                 soft_max_up_bound=15.0,
+                                 soft_max_lower_bound=-15.0):
+    """
+    **Teacher Student Sigmoid Loss Layer**
+
+    This layer accepts predicted logits and target labels and returns the
+    teacher-student sigmoid loss.
+
+    .. math::
+
+        loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))
+
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x 1], where N is the
+                                batch size. This input is the prediction (logit)
+                                computed by the previous operator.
+        label (Variable|list):  the ground truth, a 2-D tensor with
+                                shape [N x 1], where N is the batch size.
+        soft_max_up_bound (float): inputs larger than this value are clipped
+                                   to it when the gradient is computed.
+        soft_max_lower_bound (float): inputs smaller than this value are
+                                      clipped to it when the gradient is computed.
+
+    Returns:
+        Variable: A 2-D tensor with shape [N x 1], the teacher_student_sigmoid_loss.
+
+    Examples:
+        .. code-block:: python
+
+            cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label)
+    """
+    helper = LayerHelper('teacher_student_sigmoid_loss', **locals())
+    out = helper.create_variable(dtype=input.dtype)
+    helper.append_op(
+        type='teacher_student_sigmoid_loss',
+        inputs={'X': [input],
+                'Label': [label]},
+        outputs={'Y': [out]},
+        attrs={"soft_max_lower_bound": float(soft_max_lower_bound), \
+               "soft_max_up_bound": float(soft_max_up_bound)})
+    return out
+
+
 def add_position_encoding(input, alpha, beta, name=None):
     """
     **Add Position Encoding Layer**
diff --git a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py
new file mode 100644
index 00000000000..faa5163b320
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from math import log
+from math import exp
+from op_test import OpTest
+from scipy.special import logit
+from scipy.special import expit
+import unittest
+
+
+class TestTeacherStudentSigmoidLossOp(OpTest):
+    """
+    Test teacher_student_sigmoid_loss with discrete labels.
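+
+    For reference, the expected per-row loss can be sketched as follows
+    (x is the logit, label encodes the click z and the optional soft
+    score z', mirroring the C++ kernel):
+
+        def expected_loss(x, label):
+            # cross entropy for target z: max(x, 0) - x * z + log(1 + exp(-|x|))
+            ce = lambda z: max(x, 0.0) - x * z + log(1.0 + exp(-abs(x)))
+            if label < -1.0:    # z' absent, clk = 0
+                return ce(0.0)
+            elif label < 0.0:   # z' absent, clk = 1
+                return ce(1.0)
+            elif label < 1.0:   # label = 0 + z', clk = 0
+                return ce(0.0) + ce(label)
+            else:               # label = 1 + z', clk = 1
+                return ce(1.0) + ce(label - 1.0)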
+ """ + + def setUp(self): + """ + ut + """ + self.op_type = "teacher_student_sigmoid_loss" + batch_size = 16 + num_classes = 1 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype("float32")), + 'Label': np.random.uniform(0, 2, (batch_size, num_classes)) + .astype("float32") + } + outs = [] + for index, label in enumerate(self.inputs["Label"]): + x = self.inputs["X"][index] + if label < -1.0: + outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x)))) + elif label < 0.0: + outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x)))) + elif label < 1.0: + outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))) + \ + max(x, 0.0) - x * label + log(1.0 + exp(-abs(x)))) + #print "33 python x:", x, "python label:", label, "term1:", max(x, 0.0) + log(1.0 + exp(-abs(x))), "term2:", max(x, 0.0) - x * label + log(1.0 + exp(-abs(x))) + else: + outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))) + \ + max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x)))) + #print "44 python x:", x, "python label:", label, "term1:", max(x, 0.0) - x + log(1.0 + exp(-abs(x))), "term2:", max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x))) + self.outputs = {'Y': np.array(outs)} + + def test_check_output(self): + """ + ut + """ + self.check_output() + + def test_check_grad(self): + """ + ut + """ + self.check_grad(["X"], "Y", numeric_grad_delta=0.005) -- GitLab From aa6e9c30becf0215870fd3633684c97a6d614263 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Wed, 19 Dec 2018 03:54:05 +0100 Subject: [PATCH 0347/2367] [MKL-DNN ]Added transpose/transpose2 Op (#14872) * - Added transpose MKLDNN Op - Few basic UT works - Added 1D transpose - implementing generic mem desc for MKLDNN transpose - Modified trnaspose op to support more dimensional data eg. 5,6..10 - Added is_test attribute to transpose op test=develop * - Added support for MKLDNN::memory::format::any for Transpose MKLDNN op test=develop * - Additional transpose mkldnn op correction to mkldnn layout test=develop * Cosmetic fixes test=develop * - Removed const_cast to obey coding standard test=develop --- paddle/fluid/operators/transpose_mkldnn_op.cc | 124 ++++++++++++++++++ paddle/fluid/operators/transpose_op.cc | 49 ++++++- .../unittests/test_transpose_mkldnn_op.py | 76 +++++++++++ .../tests/unittests/test_transpose_op.py | 13 +- 4 files changed, 258 insertions(+), 4 deletions(-) create mode 100644 paddle/fluid/operators/transpose_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/transpose_mkldnn_op.cc new file mode 100644 index 00000000000..37f1cadc7d2 --- /dev/null +++ b/paddle/fluid/operators/transpose_mkldnn_op.cc @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/
+
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using framework::DataLayout;
+
+template <typename T>
+class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    PADDLE_ENFORCE(
+        is_test == true,
+        "TransposeMKLDNN works only for inference! Set is_test = True");
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    int ndims = axis.size();
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    const T* input_data = input->data<T>();
+
+    if (ndims == 1) {
+      output->ShareDataWith(*input);
+      return;
+    }
+
+    std::vector<int> nchw_axis(ndims, 0);
+    for (size_t i = 0; i < nchw_axis.size(); ++i) {
+      nchw_axis[i] = i;
+    }
+
+    std::vector<int> nchw_tz = paddle::framework::vectorize2int(input->dims());
+    std::string data_format = ctx.Attr<std::string>("data_format");
+
+    auto src_md =
+        input->format() != mkldnn::memory::format::nchw
+            ? platform::MKLDNNMemDesc(nchw_tz, platform::MKLDNNGetDataType<T>(),
+                                      input->format())
+            : Axis2MemoryDesc(nchw_tz, nchw_axis);
+
+    this->TransposeKernel(ctx.GetPlace(), Axis2MemoryDesc(nchw_tz, axis),
+                          src_md, output, input_data, nchw_tz, mkldnn_engine);
+  }
+
+ protected:
+  mkldnn::memory::desc Axis2MemoryDesc(std::vector<int>& nchw_tz,
+                                       std::vector<int>& axis) const {
+    mkldnn_memory_desc_t mem_fmt;
+
+    mem_fmt.primitive_kind = mkldnn_memory;
+    mem_fmt.ndims = axis.size();
+    for (unsigned int i = 0; i < nchw_tz.size(); ++i) {
+      mem_fmt.dims[i] = nchw_tz[i];  // logical dimensions (nchw format,
+                                     // regardless of physical layout)
+    }
+    mem_fmt.data_type = mkldnn_f32;
+    mem_fmt.format = mkldnn_blocked;
+
+    unsigned int total_stride = 1;
+    for (int i = nchw_tz.size() - 1; i >= 0; --i) {
+      mem_fmt.layout_desc.blocking.padding_dims[i] =
+          nchw_tz[i];  // logical dimensions (nchw format, regardless of
+                       // physical layout)
+      mem_fmt.layout_desc.blocking.block_dims[i] = 1;
+      mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
+      mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride;
+      mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1;
+      total_stride *= nchw_tz[axis[i]];
+    }
+    mem_fmt.layout_desc.blocking.offset_padding = 0;  // no initial offset
+    return mem_fmt;
+  }
+
+  void TransposeKernel(platform::Place place, mkldnn::memory::desc md_o,
+                       mkldnn::memory::desc md_i, Tensor* output,
+                       const T* data_i, std::vector<int>& nchw_dims,
+                       const mkldnn::engine& eng) const {
+    // Make memory primitive descriptors
+    auto mpd_o = mkldnn::memory::primitive_desc(md_o, eng);
+    auto mpd_i = mkldnn::memory::primitive_desc(md_i, eng);
+
+    auto data_o = output->mutable_data<T>(
+        place, paddle::memory::Allocator::kDefault, mpd_o.get_size());
+
+    auto src = mkldnn::memory(mpd_i, (T*)(data_i));
+    auto dst = mkldnn::memory(mpd_o, data_o);
+
+    auto r = mkldnn::reorder(src, dst);
+    mkldnn::stream(mkldnn::stream::kind::eager).submit({r}).wait();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNOpKernel<float>);
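+// transpose reuses the kernel above, just like transpose2: the permutation is
+// encoded entirely in the strides Axis2MemoryDesc writes into the destination
+// memory::desc. For example, with shape [2, 3] and axis (1, 0), input element
+// (i, j) lands at offset j * 2 + i, i.e. at position (j, i) of the row-major
+// [3, 2] result, so the single mkldnn::reorder in TransposeKernel performs
+// the whole transpose.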
+REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index bc1f59bc1a7..b3b379d16ff 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -16,6 +16,10 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -53,11 +57,32 @@ class TransposeOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim("Out", out_dims);
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.GetPlace(), layout_, library_);
+  }
 };
 
 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
     AddInput(
         "X",
         "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
@@ -67,6 +92,16 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
         "(vector<int>) A list of values, and the size of the list should be "
         "the same as the input tensor rank. This operator permutes the input "
         "tensor's axes according to the values given.");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default \"AnyLayout\") An optional string from: \"NHWC\", "
+        "\"NCHW\". Specify the data format of the output data; the input "
+        "will be transformed automatically. ")
+        .SetDefault("AnyLayout");
     AddComment(R"DOC(
 Transpose Operator.
 
@@ -144,8 +179,18 @@ class Transpose2Op : public TransposeOp {
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.GetPlace(), layout_, library_);
+  }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
new file mode 100644
index 00000000000..61ac8790112
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +from test_transpose_op import TestTransposeOp + + +class TestTransposeMKLDNN(TestTransposeOp): + def init_op_type(self): + self.op_type = "transpose2" + self.use_mkldnn = True + self.is_test = True + return + + def test_check_grad(self): + return + + def test_check_grad_no_input(self): + return + + def test_check_grad_no_filter(self): + return + + +class TestCase0MKLDNN(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (3, ) + self.axis = (0, ) + + +class TestCase1a(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (3, 4, 5) + self.axis = (0, 2, 1) + + +class TestCase1b(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (3, 4, 5) + self.axis = (2, 1, 0) + + +class TestCase2(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + + +class TestCase3(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + + +class TestCase4(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index bbcabb751f0..93be9d28da7 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -21,15 +21,24 @@ from op_test import OpTest class TestTransposeOp(OpTest): def setUp(self): + self.init_op_type() self.initTestCase() - self.op_type = "transpose2" self.inputs = {'X': np.random.random(self.shape).astype("float32")} - self.attrs = {'axis': list(self.axis)} + self.attrs = { + 'axis': list(self.axis), + 'use_mkldnn': self.use_mkldnn, + 'is_test': self.is_test, + } self.outputs = { 'XShape': np.random.random(self.shape).astype("float32"), 'Out': self.inputs['X'].transpose(self.axis) } + def init_op_type(self): + self.op_type = "transpose2" + self.use_mkldnn = False + self.is_test = False + def test_check_output(self): self.check_output(no_check_set=['XShape']) -- GitLab From b3cf476de414a3707a00f906422ddb956db7798f Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 19 Dec 2018 11:00:50 +0800 Subject: [PATCH 0348/2367] teacher student sigmoid loss test=develop --- python/paddle/fluid/layers/nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 68243cf744f..96fa503e774 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9204,7 +9204,7 @@ def teacher_student_sigmoid_loss(input, by the previous operator. label (Variable|list): the ground truth which is a 2-D tensor with shape [N x 1], where N is the batch size. 
- soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound Returns: -- GitLab From 4dd61e7260314faa4b9b8f5a4c5406af013d919e Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 19 Dec 2018 11:07:16 +0800 Subject: [PATCH 0349/2367] convert GetInputVarPtrs and GetOutputVarPtrs test=develop --- paddle/fluid/framework/op_desc.cc | 31 ++++++++++++++----- paddle/fluid/framework/operator.cc | 36 +++++++++++++++++++++-- paddle/fluid/framework/shape_inference.cc | 22 -------------- paddle/fluid/framework/shape_inference.h | 31 ++++++++++--------- 4 files changed, 74 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index dde642764fa..0a3bb586fcc 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -110,6 +110,30 @@ class CompileTimeInferShapeContext : public InferShapeContext { } } + std::vector GetInputVarPtrs( + const std::string &name) override { + const std::vector arg_names = Inputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { + return block_.FindVarRecursive(name); + }); + return res; + } + + std::vector GetOutputVarPtrs( + const std::string &name) override { + const std::vector arg_names = Outputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { + return block_.FindVarRecursive(name); + }); + return res; + } + bool IsRuntime() const override; protected: @@ -124,8 +148,6 @@ class CompileTimeInferShapeContext : public InferShapeContext { void SetRepeatedDims(const std::string &name, const std::vector &dims) override; - InferShapeVarPtr GetVarPtr(const std::string &name) override; - const OpDesc &op_; const BlockDesc &block_; }; @@ -696,10 +718,5 @@ proto::VarType::Type CompileTimeInferShapeContext::GetVarType( return block_.FindVarRecursive(name)->GetType(); } -InferShapeVarPtr CompileTimeInferShapeContext::GetVarPtr( - const std::string &name) { - return block_.FindVarRecursive(name); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e023d165b03..4ccef3105c6 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -691,6 +691,25 @@ class RuntimeInferShapeContext : public InferShapeContext { bool IsRuntime() const override { return true; } + // TODO(paddle-dev): Can this be template? 
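+  // (a sketch of what such a helper could look like:
+  //    template <typename VarMap>
+  //    std::vector<InferShapeVarPtr> VarPtrs(const VarMap& vars,
+  //                                          const std::string& name);
+  //  GetInputVarPtrs/GetOutputVarPtrs below would forward to it with
+  //  ctx_.inputs and ctx_.outputs respectively.)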
+ std::vector GetInputVarPtrs( + const std::string& name) override { + const std::vector& vars = InputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + + std::vector GetOutputVarPtrs( + const std::string& name) override { + const std::vector& vars = OutputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + protected: DDim GetDim(const std::string& name) const override { Variable* var = scope_.FindVar(name); @@ -733,11 +752,22 @@ class RuntimeInferShapeContext : public InferShapeContext { return ToVarType(var->Type()); } - InferShapeVarPtr GetVarPtr(const std::string& name) override { - return scope_.FindVar(name); + private: + const std::vector& InputVars(const std::string& name) const { + auto it = ctx_.inputs.find(name); + PADDLE_ENFORCE(it != ctx_.inputs.end(), + "Operator %s does not have the input %s.", op_.Type(), name); + return it->second; + } + + const std::vector& OutputVars(const std::string& name) const { + auto it = ctx_.outputs.find(name); + PADDLE_ENFORCE(it != ctx_.outputs.end(), + "Operator %s does not have the outputs %s.", op_.Type(), + name); + return it->second; } - private: const OperatorBase& op_; const Scope& scope_; const RuntimeContext& ctx_; diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc index ddff2c7c261..0a7cebcc5a2 100644 --- a/paddle/fluid/framework/shape_inference.cc +++ b/paddle/fluid/framework/shape_inference.cc @@ -76,28 +76,6 @@ void InferShapeContext::SetReaderDims(const std::string &name, return this->SetRepeatedDims(arg_names[0], dims); } -std::vector InferShapeContext::GetInputVarPtrs( - const std::string &name) { - const std::vector arg_names = Inputs(name); - std::vector res; - res.reserve(arg_names.size()); - std::transform( - arg_names.begin(), arg_names.end(), std::back_inserter(res), - [this](const std::string &name) { return this->GetVarPtr(name); }); - return res; -} - -std::vector InferShapeContext::GetOutputVarPtrs( - const std::string &name) { - const std::vector arg_names = Outputs(name); - std::vector res; - res.reserve(arg_names.size()); - std::transform( - arg_names.begin(), arg_names.end(), std::back_inserter(res), - [this](const std::string &name) { return this->GetVarPtr(name); }); - return res; -} - std::vector InferShapeContext::GetDims( const std::vector &names) const { std::vector ret; diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index d73cca121e4..543696d43b1 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -33,22 +33,24 @@ class InferShapeContext { virtual bool HasInput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0; - std::vector GetInputsVarType( + virtual std::vector GetInputsVarType( const std::string &name) const; - std::vector GetOutputsVarType( + virtual std::vector GetOutputsVarType( const std::string &name) const; virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0; - DDim GetInputDim(const std::string &name) const; - std::vector GetInputsDim(const std::string &name) const; - std::vector GetReaderDims(const std::string &name) const; - DDim GetInputsElementDim(const std::string &name, int idx) const; + virtual DDim GetInputDim(const std::string &name) const; + virtual std::vector 
GetInputsDim(const std::string &name) const; + virtual std::vector GetReaderDims(const std::string &name) const; + virtual DDim GetInputsElementDim(const std::string &name, int idx) const; - void SetOutputDim(const std::string &name, const DDim &dim); - void SetOutputsDim(const std::string &name, const std::vector &dims); - void SetReaderDims(const std::string &name, const std::vector &dims); + virtual void SetOutputDim(const std::string &name, const DDim &dim); + virtual void SetOutputsDim(const std::string &name, + const std::vector &dims); + virtual void SetReaderDims(const std::string &name, + const std::vector &dims); virtual AttrReader Attrs() const = 0; virtual const std::vector &Inputs( @@ -67,13 +69,14 @@ class InferShapeContext { virtual bool IsRuntime() const = 0; - std::vector GetInputVarPtrs(const std::string &name); - std::vector GetOutputVarPtrs(const std::string &name); - virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0; + virtual std::vector GetInputVarPtrs( + const std::string &name) = 0; + virtual std::vector GetOutputVarPtrs( + const std::string &name) = 0; // Note: In while op, we need this to be public - void SetDims(const std::vector &names, - const std::vector &dims); + virtual void SetDims(const std::vector &names, + const std::vector &dims); protected: virtual DDim GetDim(const std::string &name) const = 0; -- GitLab From b849157e9d3584a8d4b891340706c181c542deb0 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 19 Dec 2018 11:44:48 +0800 Subject: [PATCH 0350/2367] Add size enforce (#14919) --- .../distributed/brpc_sendrecvop_utils.cc | 23 ++++++++++++++----- .../fluid/operators/distributed/grpc_serde.cc | 8 +++++++ .../operators/distributed/sendrecvop_utils.h | 9 ++++++-- .../distributed/variable_response.cc | 2 +- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc index 6fed9ba92c1..e4604db3a38 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #endif #include +#include #include // NOLINT #include "paddle/fluid/framework/data_type.h" @@ -31,7 +32,12 @@ namespace distributed { class IOBufWriter { public: - static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) { + static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, + const char* v, int64_t vlen) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + iobuf->append(reinterpret_cast(&k), 4); iobuf->append(reinterpret_cast(&vlen), 8); iobuf->append(v, vlen); @@ -87,6 +93,10 @@ class IOBufWriter { int k, const char* v, int64_t vlen, bool in_cuda_pinned, void (*destroy)(void*), void* user_data) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + #ifdef PADDLE_WITH_BRPC_RDMA IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, destroy, user_data); @@ -134,7 +144,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, request->set_type(::sendrecv::NCCL_ID); const ncclUniqueId& uid = var->Get(); // TODO(gongwb): use append_zero to avoid data copy. 
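+    // ("append zero" refers to appending without a memcpy, e.g. handing the
+    //  buffer to the IOBuf via butil::IOBuf::append_user_data(); for the
+    //  small fixed-size NCCL id the copy below is cheap, hence only a TODO.)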
- IOBufWriter::Append(iobuf, + IOBufWriter::Append(name, iobuf, sendrecv::VariableMessage::kSerializedFieldNumber, uid.internal, NCCL_UNIQUE_ID_BYTES); return; @@ -149,7 +159,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, // FIXME(gongwb): it seems that can use zero copy. if (var_is_not_stable) { IOBufWriter::Append( - iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, static_cast(payload->ptr()), payload->memory_size()); } else { if (platform::is_gpu_place(ctx.GetPlace())) { @@ -171,10 +181,11 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, + IOBufWriter::Append(name, iobuf, + ::sendrecv::VariableMessage::kRowsFieldNumber, reinterpret_cast(slr->rows().data()), static_cast(rows_memory_size)); } diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 299dfe35438..a9dea9cfd2e 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif +#include #include // NOLINT #include "google/protobuf/io/coded_stream.h" @@ -102,6 +103,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload->memory_size()); + if (payload->memory_size() >= std::numeric_limits::max()) { + LOG(FATAL) << "AppendZeroCopy varname:" << name + << ", vlen:" << payload->memory_size(); + } // steal reference of tensor data ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows int num_slices = 2; // only SelectedRows have rows buffer @@ -115,7 +120,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); ProtoEncodeHelper e2(static_cast(buf), 128); + + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); slices[2] = ::grpc::Slice(e2.size()); memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 33eded0e6c0..6a87178be5d 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include +#include #include #include "paddle/fluid/framework/data_type.h" @@ -23,9 +24,8 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/port.h" - #include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { @@ -83,6 +83,11 @@ inline framework::proto::VarType::Type ToVarType( } } +template