From 41eb259500445639c13da6991bfea4e559c23dd5 Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Tue, 8 Feb 2022 12:21:27 +0800
Subject: [PATCH] [PTen] Support SelectedRows in execution and remove scale OpKernel and InferShape (#39351)

* adapt selectedrows in execution

* impl selected rows branch

* support selectedrow in infershape utils

* fix device compile failed

* fix new exe test failed

* revert some changes
---
 .../performance_tests/benchmark_eager_cpu.cc  |  2 +-
 .../performance_tests/benchmark_eager_cuda.cc |  2 +-
 .../performance_tests/benchmark_fluid_cpu.cc  |  2 +-
 .../performance_tests/benchmark_fluid_cuda.cc |  2 +-
 .../framework/heter_pipeline_trainer_test.cc  |  2 +-
 paddle/fluid/framework/infershape_utils.cc    | 86 +++++++++++++++----
 ...est_reference_count_pass_last_lived_ops.cc |  2 +-
 .../framework/new_executor/interpretercore.cc |  3 +-
 paddle/fluid/framework/operator.cc            | 24 +++---
 paddle/fluid/framework/pten_utils.cc          | 47 +++++++---
 paddle/fluid/framework/pten_utils.h           |  2 +-
 paddle/fluid/imperative/prepared_operator.h   | 29 +++++--
 .../pscore/heter_listen_and_server_test.cc    |  2 +-
 .../operators/pscore/heter_server_test.cc     |  2 +-
 .../pscore/send_and_recv_op_cpu_test.cc       |  2 +-
 .../pscore/send_and_recv_op_gpu_test.cc       |  2 +-
 paddle/fluid/operators/scale_op.cc            | 70 ++-------------
 paddle/fluid/operators/scale_op.h             | 80 -----------------
 paddle/fluid/operators/scale_op_mlu.cc        |  2 +-
 paddle/fluid/operators/scale_op_npu.cc        | 15 +++-
 paddle/fluid/operators/scale_op_xpu.cc        |  2 +-
 paddle/pten/api/lib/utils/tensor_utils.cc     | 23 +++--
 paddle/pten/api/lib/utils/tensor_utils.h      |  4 +-
 paddle/pten/core/kernel_context.cc            | 15 ----
 paddle/pten/core/kernel_context.h             | 25 +-----
 paddle/pten/ops/compat/scale_sig.cc           | 44 +++++++++-
 26 files changed, 243 insertions(+), 248 deletions(-)
 delete mode 100644 paddle/fluid/operators/scale_op.h

diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
index 93cd0d1338..d71d78b5d9 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
@@ -175,7 +175,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
   }
 }
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(elementwise_add);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
index 2df44bfcab..640ee0152e 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
@@ -185,7 +185,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
   }
 }
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
index b2a96468ec..c2f0479460 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
@@ -211,7 +211,7 @@ TEST(Benchmark, FluidMLPCPU) {
 }  // namespace imperative
 }  // namespace paddle
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(elementwise_add);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
index 7f8b845b07..250005e311 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
@@ -245,7 +245,7 @@ TEST(Benchmark, FluidMLPCUDA) {
 }  // namespace imperative
 }  // namespace paddle
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
diff --git a/paddle/fluid/framework/heter_pipeline_trainer_test.cc b/paddle/fluid/framework/heter_pipeline_trainer_test.cc
index 417c7685bc..a605d5d681 100644
--- a/paddle/fluid/framework/heter_pipeline_trainer_test.cc
+++ b/paddle/fluid/framework/heter_pipeline_trainer_test.cc
@@ -26,7 +26,7 @@
 #define _LINUX
 #endif
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_NO_KERNEL_OP(heter_listen_and_serv);
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc
index e1d7190a9e..652286ab26 100644
--- a/paddle/fluid/framework/infershape_utils.cc
+++ b/paddle/fluid/framework/infershape_utils.cc
@@ -78,7 +78,6 @@ class InferShapeArgumentMappingContext : public pten::ArgumentMappingContext {
   const InferShapeContext& ctx_;
 };
 
-// TODO(chenweihang): Support SelectedRows later
 // TODO(chenweihang): Support TensorArray later
 class CompatMetaTensor : public pten::MetaTensor {
  public:
@@ -104,7 +103,14 @@ class CompatMetaTensor : public pten::MetaTensor {
   DDim dims() const override {
     if (is_runtime_) {
       auto* var = BOOST_GET_CONST(Variable*, var_);
-      return var->Get<LoDTensor>().dims();
+      if (var->IsType<LoDTensor>()) {
+        return var->Get<LoDTensor>().dims();
+      } else if (var->IsType<pten::SelectedRows>()) {
+        return var->Get<pten::SelectedRows>().dims();
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Currently, only can get dims from DenseTensor or SelectedRows."));
+      }
     } else {
       auto* var = BOOST_GET_CONST(VarDesc*, var_);
       return make_ddim(var->GetShape());
@@ -114,7 +120,14 @@ class CompatMetaTensor : public pten::MetaTensor {
   pten::DataType dtype() const override {
     if (is_runtime_) {
       auto* var = BOOST_GET_CONST(Variable*, var_);
-      return var->Get<LoDTensor>().dtype();
+      if (var->IsType<LoDTensor>()) {
+        return var->Get<LoDTensor>().dtype();
+      } else if (var->IsType<pten::SelectedRows>()) {
+        return var->Get<pten::SelectedRows>().dtype();
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Currently, only can get dtype from DenseTensor or SelectedRows."));
+      }
     } else {
       auto* var = BOOST_GET_CONST(VarDesc*, var_);
       return pten::TransToPtenDataType(var->GetDataType());
@@ -135,10 +148,16 @@ class CompatMetaTensor : public pten::MetaTensor {
   void set_dims(const DDim& dims) override {
     if (is_runtime_) {
       auto* var = BOOST_GET(Variable*, var_);
-      LoDTensor* tensor = var->GetMutable<LoDTensor>();
-      pten::DenseTensorUtils::GetMutableMeta(
-          static_cast<pten::DenseTensor*>(tensor))
-          ->dims = dims;
+      if (var->IsType<LoDTensor>()) {
+        auto* tensor = var->GetMutable<LoDTensor>();
+        pten::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
+      } else if (var->IsType<pten::SelectedRows>()) {
+        auto* tensor = var->GetMutable<pten::SelectedRows>()->mutable_value();
+        pten::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Currently, only can set dims from DenseTensor or SelectedRows."));
+      }
     } else {
       auto* var = BOOST_GET(VarDesc*, var_);
       var->SetShape(vectorize(dims));
@@ -148,10 +167,16 @@ class CompatMetaTensor : public pten::MetaTensor {
   void set_dtype(pten::DataType dtype) override {
     if (is_runtime_) {
       auto* var = BOOST_GET(Variable*, var_);
-      LoDTensor* tensor = var->GetMutable<LoDTensor>();
-      pten::DenseTensorUtils::GetMutableMeta(
-          static_cast<pten::DenseTensor*>(tensor))
-          ->dtype = dtype;
+      if (var->IsType<LoDTensor>()) {
+        auto* tensor = var->GetMutable<LoDTensor>();
+        pten::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
+      } else if (var->IsType<pten::SelectedRows>()) {
+        auto* tensor = var->GetMutable<pten::SelectedRows>()->mutable_value();
+        pten::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Currently, only can set dtype from DenseTensor or SelectedRows."));
+      }
     } else {
       auto* var = BOOST_GET(VarDesc*, var_);
       var->SetDataType(pten::TransToProtoVarType(dtype));
@@ -174,11 +199,14 @@ class CompatMetaTensor : public pten::MetaTensor {
   void share_lod(const MetaTensor& meta_tensor) override {
     if (is_runtime_) {
       auto* var = BOOST_GET(Variable*, var_);
-      LoDTensor* tensor = var->GetMutable<LoDTensor>();
-      pten::DenseTensorUtils::GetMutableMeta(
-          static_cast<pten::DenseTensor*>(tensor))
-          ->lod =
-          static_cast<const CompatMetaTensor&>(meta_tensor).GetRuntimeLoD();
+      if (var->IsType<LoDTensor>()) {
+        auto* tensor = var->GetMutable<LoDTensor>();
+        pten::DenseTensorUtils::GetMutableMeta(tensor)->lod =
+            static_cast<const CompatMetaTensor&>(meta_tensor).GetRuntimeLoD();
+      } else {
+        // NOTE(chenweihang): do nothing
+        // only LoDTensor need to share lod
+      }
     } else {
       auto* var = BOOST_GET(VarDesc*, var_);
       var->SetLoDLevel(static_cast<const CompatMetaTensor&>(meta_tensor)
                            .GetCompileTimeLoD());
@@ -191,7 +219,21 @@ class CompatMetaTensor : public pten::MetaTensor {
     set_dtype(meta_tensor.dtype());
     // VarDesc doesn't contains layout, so we cannot share layout
     // set_layout(meta_tensor.layout());
+
+    // special case 1: share lod of LoDTensor
     share_lod(meta_tensor);
+
+    // special case 2: share height and rows of SelectedRows in runtime
+    if (is_runtime_) {
+      auto* var = BOOST_GET(Variable*, var_);
+      if (var->IsType<pten::SelectedRows>()) {
+        auto* selected_rows = var->GetMutable<pten::SelectedRows>();
+        auto& input_selected_rows =
+            static_cast<const CompatMetaTensor&>(meta_tensor).GetSelectedRows();
+        selected_rows->set_rows(input_selected_rows.rows());
+        selected_rows->set_height(input_selected_rows.height());
+      }
+    }
   }
 
  private:
@@ -199,11 +241,23 @@ class CompatMetaTensor : public pten::MetaTensor {
     auto* var = BOOST_GET_CONST(Variable*, var_);
     return var->Get<LoDTensor>().lod();
   }
+
   int32_t GetCompileTimeLoD() const {
     auto* var = BOOST_GET_CONST(VarDesc*, var_);
     return var->GetLoDLevel();
   }
+
+  const pten::SelectedRows& GetSelectedRows() const {
+    PADDLE_ENFORCE_EQ(is_runtime_, true,
+                      platform::errors::Unavailable(
+                          "Only can get Tensor from MetaTensor in runtime."));
+    auto* var = BOOST_GET_CONST(Variable*, var_);
+    PADDLE_ENFORCE_EQ(var->IsType<pten::SelectedRows>(), true,
+                      platform::errors::Unavailable(
+                          "The Tensor in MetaTensor is not SelectedRows."));
+    return var->Get<pten::SelectedRows>();
+  }
+
   InferShapeVarPtr var_;
   bool is_runtime_;
 };
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
index f410171f99..746d90cef9 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(elementwise_mul);
 USE_OP(elementwise_add);
 USE_OP(elementwise_add_grad);
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index ef9c5b9213..53cc741d25 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -393,7 +393,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
     platform::RecordEvent infershape_event("InferShape");
     // If it is OperatorBase, InferShape do nothing.
     if (op_with_kernel != nullptr)
-      op_with_kernel->InferShape(instr_node.InnerInferShapeContext().get());
+      op_with_kernel->Info().infer_shape_(
+          instr_node.InnerInferShapeContext().get());
   }
 
   if (op_with_kernel != nullptr &&
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 50c315bf03..5ab14a1dab 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1998,16 +1998,17 @@ void OperatorWithKernel::BuildPtenKernelContext(
     size_t end_idx = start_idx + ins_vector.size();
 
     for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
-      const framework::Tensor* tensor_in = nullptr;
+      const pten::TensorBase* tensor_in = nullptr;
       auto* var = ins_vector[offset];
-      if (var->IsType<framework::LoDTensor>()) {
-        tensor_in = &(var->Get<framework::LoDTensor>());
+      if (var->IsType<framework::LoDTensor>()) {
+        tensor_in = &(var->Get<framework::LoDTensor>());
+      } else if (var->IsType<pten::SelectedRows>()) {
+        tensor_in = &(var->Get<pten::SelectedRows>());
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported input `%s` type when call pt kernel.",
            framework::ToTypeName(var->Type())));
-      }  // TODO(zyfncg): Add support for SelectedRows
-
+      }
       pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
     }
     pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
@@ -2021,17 +2022,20 @@
     size_t end_idx = start_idx + outs_vector.size();
 
     for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
-      framework::Tensor* tensor_out = nullptr;
+      pten::TensorBase* tensor_out = nullptr;
       auto* var = outs_vector[offset];
-      if (var->template IsType<framework::LoDTensor>()) {
-        tensor_out = var->template GetMutable<framework::LoDTensor>();
+      if (var->template IsType<framework::LoDTensor>()) {
+        tensor_out = var->template GetMutable<framework::LoDTensor>();
+      } else if (var->template IsType<pten::SelectedRows>()) {
+        tensor_out = var->template GetMutable<pten::SelectedRows>();
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported output `%s` type when call pt kernel.",
            framework::ToTypeName(var->Type())));
-      }  // TODO(zyfncg): Add support for SelectedRows
+      }
 
-      experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
+      experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
+                                                      output_defs.at(i));
       SetAllocationForOutputTenosr(
           tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc
index 1a27f971fa..265bd99593 100644
--- a/paddle/fluid/framework/pten_utils.cc
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -207,21 +207,40 @@ void InitDefaultKernelSignatureMap() {
   });
 }
 
-void SetAllocationForOutputTenosr(pten::DenseTensor* tensor,
+static void SetAllocationForUninitializedDenseTensor(
+    pten::DenseTensor* dense_tensor, const platform::Place& place) {
+  int dtype_size = dense_tensor->dtype() == DataType::UNDEFINED
+                       ? 0
+                       : experimental::SizeOf(dense_tensor->dtype());
+  int64_t numels = product(dense_tensor->dims());
+  numels = numels < 0 ? 0 : numels;
+  auto tmp_allocation_ptr = memory::Alloc(place, numels * dtype_size);
+  auto& deleter = tmp_allocation_ptr.get_deleter();
+  auto* allocation_ptr = tmp_allocation_ptr.release();
+  auto shared_allocation =
+      std::shared_ptr<pten::Allocation>(allocation_ptr, deleter);
+
+  dense_tensor->ResetHolder(shared_allocation);
+}
+
+void SetAllocationForOutputTenosr(pten::TensorBase* tensor,
                                   const platform::Place& place) {
-  if (!tensor->IsInitialized() || !(tensor->place() == place)) {
-    int dtype_size = tensor->dtype() == DataType::UNDEFINED
-                         ? 0
-                         : experimental::SizeOf(tensor->dtype());
-    int64_t numels = product(tensor->dims());
-    numels = numels < 0 ? 0 : numels;
-    auto tmp_allocation_ptr = memory::Alloc(place, numels * dtype_size);
-    auto& deleter = tmp_allocation_ptr.get_deleter();
-    auto* allocation_ptr = tmp_allocation_ptr.release();
-    auto shared_allocation =
-        std::shared_ptr<pten::Allocation>(allocation_ptr, deleter);
-
-    tensor->ResetHolder(shared_allocation);
+  if (pten::DenseTensor::classof(tensor)) {
+    auto* dense_tensor = static_cast<pten::DenseTensor*>(tensor);
+    if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) {
+      SetAllocationForUninitializedDenseTensor(dense_tensor, place);
+    }
+  } else if (pten::SelectedRows::classof(tensor)) {
+    auto* selected_rows = static_cast<pten::SelectedRows*>(tensor);
+    if (!selected_rows->value().IsInitialized() ||
+        !(selected_rows->place() == place)) {
+      SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(),
+                                               place);
+    }
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported tensor type is received when setting allocation for "
+        "output tensor."));
   }
 }
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index ae0388079d..44f5ee9f9d 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -63,7 +63,7 @@ class KernelArgsNameMaker {
 
 void InitDefaultKernelSignatureMap();
 
-void SetAllocationForOutputTenosr(pten::DenseTensor* tensor,
+void SetAllocationForOutputTenosr(pten::TensorBase* tensor,
                                   const platform::Place& place);
 
 // TODO(Wilber): support others device context.
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 719036742d..8775f715bf 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -29,6 +29,9 @@
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/imperative/var_helper.h"
 
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/selected_rows.h"
+
 DECLARE_bool(use_mkldnn);
 
 namespace paddle {
@@ -262,7 +265,17 @@ void BuildDygraphPtenKernelContext(
     size_t end_idx = start_idx + ins_vector.size();
 
     for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
-      const auto* tensor_in = GetTensorFromVar(ins_vector[offset]->Var());
+      const pten::TensorBase* tensor_in = nullptr;
+      auto& var = ins_vector[offset]->Var();
+      if (var.template IsType<framework::LoDTensor>()) {
+        tensor_in = &(var.template Get<framework::LoDTensor>());
+      } else if (var.template IsType<pten::SelectedRows>()) {
+        tensor_in = &(var.template Get<pten::SelectedRows>());
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported input `%s` type when call pt kernel.",
+            framework::ToTypeName(var.Type())));
+      }
       kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
     }
     kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
@@ -287,17 +300,21 @@ void BuildDygraphPtenKernelContext(
         kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr});
         continue;
       }
+
+      pten::TensorBase* tensor_out = nullptr;
       auto* var = outs_vector[offset]->MutableVar();
-      framework::Tensor* tensor_out = nullptr;
-      if (var->template IsType<framework::LoDTensor>()) {
-        tensor_out = var->template GetMutable<framework::LoDTensor>();
+      if (var->template IsType<framework::LoDTensor>()) {
+        tensor_out = var->template GetMutable<framework::LoDTensor>();
+      } else if (var->template IsType<pten::SelectedRows>()) {
+        tensor_out = var->template GetMutable<pten::SelectedRows>();
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
            "Unsupported output `%s` type when call pt kernel.",
            framework::ToTypeName(var->Type())));
-      }  // TODO(zyfncg): Add support for SelectedRows
+      }
 
-      experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
+      experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
+                                                      output_defs.at(i));
       framework::SetAllocationForOutputTenosr(
           tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
index a195b8dee3..ddc6287011 100644
--- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
@@ -33,7 +33,7 @@ using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 
 DECLARE_double(eager_delete_tensor_gb);
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_NO_KERNEL_OP(heter_listen_and_serv);
 
 framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc
index 7914e9d9a1..f7e8ae1c09 100644
--- a/paddle/fluid/operators/pscore/heter_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_server_test.cc
@@ -29,7 +29,7 @@ namespace distributed = paddle::distributed;
 using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 
 std::shared_ptr<distributed::HeterServer> b_rpc_service;
diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc
index 07fe44601c..077eecb72a 100644
--- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc
+++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc
@@ -31,7 +31,7 @@ namespace distributed = paddle::distributed;
 using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(send_and_recv);
 
 std::shared_ptr<distributed::HeterServer> b_rpc_service;
diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc
index 2c701bdae7..b7049019bc 100644
--- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc
+++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc
@@ -35,7 +35,7 @@ namespace memory = paddle::memory;
 using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(send_and_recv);
 
 std::shared_ptr<distributed::HeterServer> b_rpc_service2;
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 912af2c85b..ccf3afe29c 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -12,49 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/scale_op.h"
 #include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace framework {
-class InferShapeContext;
-class OpDesc;
-}  // namespace framework
-namespace imperative {
-class OpBase;
-}  // namespace imperative
-namespace platform {
-class CPUDeviceContext;
-}  // namespace platform
-}  // namespace paddle
+#include "paddle/pten/core/infermeta_utils.h"
+#include "paddle/pten/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
 
 class ScaleOp : public framework::OperatorWithKernel {
  public:
-  ScaleOp(const std::string &type, const framework::VariableNameMap &inputs,
-          const framework::VariableNameMap &outputs,
-          const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "scale");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "scale");
-
-    if (ctx->IsRuntime() && ctx->HasInput("ScaleTensor")) {
-      auto scale = ctx->Inputs("ScaleTensor");
-      PADDLE_ENFORCE_EQ(scale.size(), 1,
-                        platform::errors::InvalidArgument(
-                            "Input(ScaleTensor) size must be 1, "
-                            "but received size is %d.",
-                            scale.size()));
-    }
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -150,32 +120,10 @@ DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"});
 
 namespace ops = paddle::operators;
 
+DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor,
+                            PT_INFER_META(pten::UnchangedInferMeta));
 REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker,
                   ops::ScaleGradMaker<paddle::framework::OpDesc>,
                   ops::ScaleGradMaker<paddle::imperative::OpBase>,
-                  ops::ScaleOpVarTypeInference, ops::ScaleOpInplaceInferer);
-REGISTER_OP_CPU_KERNEL(
-    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext,
-                     paddle::platform::bfloat16>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int16_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    scale,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   double>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   paddle::platform::float16>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   uint8_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int8_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int16_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>);
+                  ScaleInferShapeFunctor, ops::ScaleOpVarTypeInference,
+                  ops::ScaleOpInplaceInferer);
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
deleted file mode 100644
index 8ce0b7984c..0000000000
--- a/paddle/fluid/operators/scale_op.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/pten_utils.h"
-
-// only can include the headers in paddle/top/api dirs
-#include "paddle/pten/api/lib/utils/tensor_utils.h"
-#include "paddle/pten/kernels/scale_kernel.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
-  const auto* tensor_data = tensor->data<T>();
-  framework::Tensor cpu_tensor;
-  if (platform::is_gpu_place(tensor->place()) ||
-      platform::is_npu_place(tensor->place())) {
-    paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(),
-                                      &cpu_tensor);
-    tensor_data = cpu_tensor.data<T>();
-  }
-  return tensor_data[0];
-}
-
-// See Note [ Why still keep the original kernel implementation? ]
-template <typename DeviceContext, typename T>
-class ScaleKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in_var = ctx.InputVar("X");
-
-    auto bias = ctx.Attr<float>("bias");
-    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
-    auto scale = ctx.Attr<float>("scale");
-    auto* out_var = ctx.OutputVar("Out");
-
-    if (ctx.HasInput("ScaleTensor")) {
-      auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
-      scale = static_cast<float>(GetAttrFromTensor<T>(scale_tensor));
-    }
-
-    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
-    auto* out =
-        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
-    out->mutable_data<T>(in->place());
-    auto& dev_ctx = ctx.device_context<DeviceContext>();
-
-    // call new kernel
-    if (in_var->IsType<pten::SelectedRows>()) {
-      pten::ScaleSR<T>(
-          static_cast<const typename paddle::framework::ConvertToPtenContext<
-              DeviceContext>::TYPE&>(dev_ctx),
-          in_var->Get<pten::SelectedRows>(), scale, bias, bias_after_scale,
-          out_var->GetMutable<pten::SelectedRows>());
-    } else {
-      pten::ScaleKernel<T>(
-          static_cast<const typename paddle::framework::ConvertToPtenContext<
-              DeviceContext>::TYPE&>(dev_ctx),
-          *in, scale, bias, bias_after_scale, out);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scale_op_mlu.cc b/paddle/fluid/operators/scale_op_mlu.cc
index 1e1187845c..d027ac0d33 100644
--- a/paddle/fluid/operators/scale_op_mlu.cc
+++ b/paddle/fluid/operators/scale_op_mlu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/scale_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc
index 7d84d56c2b..807ad7509e 100644
--- a/paddle/fluid/operators/scale_op_npu.cc
+++ b/paddle/fluid/operators/scale_op_npu.cc
@@ -12,12 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/scale_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
 
+template <typename T>
+static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
+  const auto* tensor_data = tensor->data<T>();
+  framework::Tensor cpu_tensor;
+  if (platform::is_gpu_place(tensor->place()) ||
+      platform::is_npu_place(tensor->place())) {
+    paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(),
+                                      &cpu_tensor);
+    tensor_data = cpu_tensor.data<T>();
+  }
+  return tensor_data[0];
+}
+
 template <typename T>
 class ScaleNPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
index 026a5dda89..2430007de0 100644
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/scale_op.h"
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/pten/kernels/scale_kernel.h"
 
 namespace paddle {
diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc
index 8fdfc29540..e4c20aa971 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.cc
+++ b/paddle/pten/api/lib/utils/tensor_utils.cc
@@ -198,12 +198,25 @@ pten::ScalarArray MakePtenScalarArrayFromVarList(
   return {vector_data};
 }
 
-void ResetTensorByArgDef(pten::DenseTensor* dst,
-                         const pten::TensorArgDef& arg_def) {
+void ResetTensorDtypeAndLayoutByArgDef(pten::TensorBase* dst,
+                                       const pten::TensorArgDef& arg_def) {
   VLOG(5) << "ResetTensor by TensorArgDef.";
-  auto* meta = pten::DenseTensorUtils::GetMutableMeta(dst);
-  meta->dtype = arg_def.dtype;
-  meta->layout = arg_def.layout;
+  if (pten::DenseTensor::classof(dst)) {
+    auto* dense_t = static_cast<pten::DenseTensor*>(dst);
+    auto* meta = pten::DenseTensorUtils::GetMutableMeta(dense_t);
+    meta->dtype = arg_def.dtype;
+    meta->layout = arg_def.layout;
+  } else if (pten::SelectedRows::classof(dst)) {
+    auto* selected_rows = static_cast<pten::SelectedRows*>(dst);
+    auto* meta =
+        pten::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value());
+    meta->dtype = arg_def.dtype;
+    meta->layout = arg_def.layout;
+  } else {
+    PADDLE_THROW(pten::errors::Unimplemented(
+        "Unsupported tensor type is received when resetting tensor dtype and "
+        "layout by argument definition."));
+  }
 }
 
 }  // namespace experimental
diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h
index 1ffcc7d4d5..1e2d8b74db 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.h
+++ b/paddle/pten/api/lib/utils/tensor_utils.h
@@ -45,8 +45,8 @@ pten::ScalarArray MakePtenScalarArrayFromVar(
 pten::ScalarArray MakePtenScalarArrayFromVarList(
     const std::vector<framework::Variable*>& variable_list);
 
-void ResetTensorByArgDef(pten::DenseTensor* dst,
-                         const pten::TensorArgDef& arg_def);
+void ResetTensorDtypeAndLayoutByArgDef(pten::TensorBase* dst,
+                                       const pten::TensorArgDef& arg_def);
 
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/core/kernel_context.cc b/paddle/pten/core/kernel_context.cc
index 34e9fabbe6..9002003c9f 100644
--- a/paddle/pten/core/kernel_context.cc
+++ b/paddle/pten/core/kernel_context.cc
@@ -48,10 +48,6 @@ void KernelContext::EmplaceBackOutputWithoutSetRange(TensorBase* output) {
   outputs_.emplace_back(output);
 }
 
-void KernelContext::SetOutputWithoutSetRange(int index, TensorBase* output) {
-  outputs_.at(index) = output;
-}
-
 void KernelContext::EmplaceBackOutputs(
     paddle::SmallVector<TensorBase*> outputs) {
   int index = outputs_.size();
@@ -103,15 +99,4 @@ const std::pair<int, int>& KernelContext::OutputRangeAt(size_t idx) const {
   return output_range_.at(idx);
 }
 
-std::pair<int, int>& KernelContext::MutableInputRangeAt(size_t idx) {
-  return input_range_[idx];
-}
-
-std::pair<int, int>& KernelContext::MutableOutputRangeAt(size_t idx) {
-  return output_range_[idx];
-}
-
-// Temporary method: For compatible with fluid Tensor and improve performance
-// Only deal with DenseTensor now
-void KernelContext::ClearData() { attrs_.clear(); }
 }  // namespace pten
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index 876c98e3bc..25a1f2ed9b 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -26,10 +26,8 @@
 
 namespace pten {
 
-using DeviceContext = pten::DeviceContext;
-
 /**
- * Note: KernelContext doesn't manage the life if DeviceContext and Tensor
+ * Note: KernelContext doesn't manage the life of DeviceContext and Tensor
  *
  * Note: KernelContext does not couple the concept of framework,
  * its constructor can only take the members it needs as parameters,
@@ -59,17 +57,15 @@ class KernelContext {
 
   void EmplaceBackOutputs(paddle::SmallVector<TensorBase*> outputs);
 
-  void SetOutputWithoutSetRange(int index, TensorBase* output);
-
   void EmplaceBackAttr(paddle::any attr);
 
   const std::pair<int, int>& InputRangeAt(size_t idx) const;
 
   const std::pair<int, int>& OutputRangeAt(size_t idx) const;
 
-  std::pair<int, int>& MutableInputRangeAt(size_t idx);
+  void AssignInputRange(std::pair<int, int>&& range, size_t idx);
 
-  std::pair<int, int>& MutableOutputRangeAt(size_t idx);
+  void AssignOutputRange(std::pair<int, int>&& range, size_t idx);
 
   template <typename TensorType>
   const TensorType& InputAt(size_t idx) const {
@@ -90,15 +86,11 @@ class KernelContext {
     for (size_t i = start; i < end; ++i) {
       auto t = static_cast<const TensorType*>(inputs_.at(i));
       v.emplace_back(*t);
-      inputs_.at(i) = nullptr;
+      inputs_[i] = nullptr;
    }
     return v;
   }
 
-  void AssignInputRange(std::pair<int, int>&& range, size_t idx);
-
-  void AssignOutputRange(std::pair<int, int>&& range, size_t idx);
-
   template <typename TensorType>
   TensorType* MutableOutputAt(size_t idx) {
     return static_cast<TensorType*>(outputs_.at(idx));
   }
@@ -110,7 +102,6 @@ class KernelContext {
     for (size_t i = start; i < end; ++i) {
       v.emplace_back(static_cast<TensorType*>(outputs_.at(i)));
     }
-
     return v;
   }
@@ -124,25 +115,17 @@ class KernelContext {
     }
   }
 
-  // Temporary method: For compatible with fluid Tensor and improve performance
-  // Only deal with DenseTensor now
-  void ClearData();
-
   size_t InputsSize() const { return inputs_.size(); }
   size_t OutputsSize() const { return outputs_.size(); }
   size_t AttrsSize() const { return attrs_.size(); }
 
 private:
-  // DeviceContext base class
   DeviceContext* dev_ctx_;
-  // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
-  // Note: can't use API Tensor here, the inference don't use this API Tensor
   paddle::SmallVector<const TensorBase*> inputs_;
   paddle::SmallVector<TensorBase*> outputs_;
   paddle::SmallVector<paddle::any> attrs_;
-  // Only contains input like list[Tensor] need `range`
   paddle::SmallVector<std::pair<int, int>> input_range_;
   paddle::SmallVector<std::pair<int, int>> output_range_;
 };
diff --git a/paddle/pten/ops/compat/scale_sig.cc b/paddle/pten/ops/compat/scale_sig.cc
index 5ce159a5d8..279be3df54 100644
--- a/paddle/pten/ops/compat/scale_sig.cc
+++ b/paddle/pten/ops/compat/scale_sig.cc
@@ -16,9 +16,37 @@ limitations under the License. */
 
 namespace pten {
 
+/**
+ * Note [ Why does the ArgumentMapping function need to be so complicated? ]
] + * + * In order to meet the requirements of infrt, the function used to match Op + * and Kernel parameters, need to be placed in pten as a compatible component, + * and does not depend on fluid. + * + * Because infrt not only needs to dynamically call this argument mapping + * function at runtime, but also needs to statically declare all possible + * results of the function before running without any information. + * + * The infrt declare like: + * + * def PDKEL_Reshape_to_CPU : Pat< + * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), // OpMaker arguements + * (PDKEL_ReshapeKernelAttr $x, fn($shape_attr)>; // Kernel arguments + * def PDKEL_Reshape_to_CPU : Pat< + * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), + * (PDKEL_ReshapeKernelAttr $x, fn($shape_tensor)>; + * + * Therefore, we need to write out each result of the argument mapping function, + * like `KernelSignature("full", {}, {"ShapeTensor", "value"}, {"Out"})`, it + * cannot contains variable, only can contains const char* string. + * + * Infrt will parse all results before running for the generation of the above + * static declare, which leads to some functions being written in a long way, + * and the complicated ones may have hundreds of lines, which has certain side + * effects on the programming experience. + */ KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("X")) { - std::string scale_attr; if (ctx.HasInput("ScaleTensor")) { return KernelSignature( "scale", {"X"}, {"ScaleTensor", "bias", "bias_after_scale"}, {"Out"}); @@ -26,9 +54,19 @@ KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature( "scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); } + } else if (ctx.IsSelectedRowsInput("X")) { + if (ctx.HasInput("ScaleTensor")) { + return KernelSignature("scale_sr", + {"X"}, + {"ScaleTensor", "bias", "bias_after_scale"}, + {"Out"}); + } else { + return KernelSignature( + "scale_sr", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); + } + } else { + return KernelSignature("unregistered", {}, {}, {}); } - // TODO(chenweihang): support other cases after selected rows added - return KernelSignature("scale.unregistered", {}, {}, {}); } } // namespace pten -- GitLab