From abfc2fe95bbb039963b217a4990aa828a45605dd Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Fri, 28 Jan 2022 14:23:32 +0800
Subject: [PATCH] [PTen]Refactor scale kernel that has selected_rows input
 (#39278)

* refactor scale kernel that its input is selected_rows

* complement upload file
---
 cmake/pten.cmake                            | 67 ++++++++++++------
 paddle/fluid/operators/scale_op.h           | 28 ++++----
 paddle/pten/core/kernel_registry.h          |  6 ++
 paddle/pten/core/kernel_utils.h             |  7 +-
 paddle/pten/kernels/cpu/scale_kernel.cc     |  2 +-
 paddle/pten/kernels/gpu/scale_kernel.cu     |  2 +-
 paddle/pten/kernels/scale_kernel.h          |  9 +++
 .../kernels/selected_rows/scale_kernel.cc   | 68 +++++++++++++++++++
 paddle/pten/tests/api/scale_api.h           |  2 +-
 9 files changed, 151 insertions(+), 40 deletions(-)
 create mode 100644 paddle/pten/kernels/selected_rows/scale_kernel.cc

diff --git a/cmake/pten.cmake b/cmake/pten.cmake
index 8e1d2339862..be265940f3f 100644
--- a/cmake/pten.cmake
+++ b/cmake/pten.cmake
@@ -88,6 +88,7 @@ function(kernel_library TARGET)
   set(cpu_srcs)
   set(gpu_srcs)
   set(xpu_srcs)
+  set(selected_rows_srcs)
   # parse and save the deps kerenl targets
   set(all_srcs)
   set(kernel_deps)
@@ -106,6 +107,9 @@ function(kernel_library TARGET)
   if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
     list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
   endif()
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
+    list(APPEND selected_rows_srcs ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
+  endif()
   if (WITH_GPU OR WITH_ROCM)
     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
       list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
@@ -144,27 +148,30 @@ function(kernel_library TARGET)
   list(LENGTH cpu_srcs cpu_srcs_len)
   list(LENGTH gpu_srcs gpu_srcs_len)
   list(LENGTH xpu_srcs xpu_srcs_len)
+  list(LENGTH selected_rows_srcs selected_rows_srcs_len)

   # Build Target according different src organization
   if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
-      ${xpu_srcs_len} GREATER 0) AND ${common_srcs_len} GREATER 0)
-    # If the common_srcs depends on specific device srcs, build target using this rule.
+      ${xpu_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR
+      ${selected_rows_srcs_len} GREATER 0))
+    # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule.
     if (WITH_GPU)
       if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
         nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        nv_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+        nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
       endif()
     elseif (WITH_ROCM)
       if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
         hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        hip_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+        hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
      endif()
     else()
       if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
         cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        cc_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+        cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
       endif()
     endif()
+  # If there are only specific device srcs, build target using this rule.
   elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
     if (WITH_GPU)
       if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
@@ -179,25 +186,42 @@ function(kernel_library TARGET)
         cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
       endif()
     endif()
-  else()
-    if (${common_srcs_len} EQUAL 0)
-      message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
+  # If the selected_rows_srcs depends on common_srcs, build target using this rule.
+  elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0)
+    if (WITH_GPU)
+      nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+    elseif (WITH_ROCM)
+      hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
     else()
-      # If the kernel has a device independent public implementation,
-      # we will use this implementation and will not adopt the implementation
-      # under specific devices
-      if (WITH_GPU)
-        nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-      elseif (WITH_ROCM)
-        hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-      else()
-        cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-      endif()
-    endif()
+      cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+    endif()
+  # If there are only common_srcs or selected_rows_srcs, build target using below rules.
+  elseif (${common_srcs_len} GREATER 0)
+    if (WITH_GPU)
+      nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    elseif (WITH_ROCM)
+      hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    else()
+      cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    endif()
+  elseif (${selected_rows_srcs_len} GREATER 0)
+    if (WITH_GPU)
+      nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    elseif (WITH_ROCM)
+      hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    else()
+      cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    endif()
+  else()
+    message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
   endif()

   if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
-      ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+      ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR
+      ${selected_rows_srcs_len} GREATER 0)
     # append target into PTEN_KERNELS property
     get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
     set(pten_kernels ${pten_kernels} ${TARGET})
@@ -219,6 +243,9 @@ function(kernel_library TARGET)
   if (${xpu_srcs_len} GREATER 0)
     kernel_declare(${xpu_srcs})
   endif()
+  if (${selected_rows_srcs_len} GREATER 0)
+    kernel_declare(${selected_rows_srcs})
+  endif()
 endfunction()

 function(register_kernels)
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index 2a30d3f0b08..8ce0b7984cc 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -43,34 +43,36 @@ class ScaleKernel : public framework::OpKernel<T> {
  public:
  virtual void Compute(const framework::ExecutionContext& ctx) const {
    auto* in_var = ctx.InputVar("X");
-    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
    auto bias = ctx.Attr<float>("bias");
    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
    auto scale = ctx.Attr<float>("scale");
+    auto* out_var = ctx.OutputVar("Out");
+
    if (ctx.HasInput("ScaleTensor")) {
      auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
      scale = static_cast<float>(GetAttrFromTensor(scale_tensor));
    }

-    auto* out_var = ctx.OutputVar("Out");
-    if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
-      auto& in_slr = in_var->Get<framework::SelectedRows>();
-      auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
-      out_slr->set_rows(in_slr.rows());
-      out_slr->set_height(in_slr.height());
-    }
+    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
    auto* out =
        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
    out->mutable_data<T>(in->place());

    auto& dev_ctx = ctx.device_context<DeviceContext>();

    // call new kernel
-    pten::ScaleKernel<T>(
-        static_cast<const typename framework::ConvertToPtenContext<DeviceContext>::TYPE&>(dev_ctx),
-        *in, scale, bias, bias_after_scale, out);
+    if (in_var->IsType<pten::SelectedRows>()) {
+      pten::ScaleSR<T>(
+          static_cast<const typename framework::ConvertToPtenContext<DeviceContext>::TYPE&>(dev_ctx),
+          in_var->Get<pten::SelectedRows>(), scale, bias, bias_after_scale,
+          out_var->GetMutable<pten::SelectedRows>());
+    } else {
+      pten::ScaleKernel<T>(
+          static_cast<const typename framework::ConvertToPtenContext<DeviceContext>::TYPE&>(dev_ctx),
+          *in, scale, bias, bias_after_scale, out);
+    }
  }
 };
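The operator-level change above keeps the fluid kernel as a thin router: it no longer patches up SelectedRows metadata itself, but branches once on the runtime kind of the input variable and forwards to the matching pten kernel. For illustration, here is a minimal, self-contained C++ sketch of that dispatch pattern; all types and names below (ToyDense, ToySelectedRows, Compute) are toy stand-ins, not the Paddle API:

#include <cstdint>
#include <iostream>
#include <variant>
#include <vector>

// Toy stand-ins for the two tensor kinds a Variable can hold.
struct ToyDense {
  std::vector<float> data;
};

struct ToySelectedRows {
  std::vector<int64_t> rows;  // which logical rows are present
  int64_t height = 0;         // logical row count of the full tensor
  ToyDense value;             // packed values for the present rows
};

using ToyVar = std::variant<ToyDense, ToySelectedRows>;

// out = scale * x + bias, or scale * (x + bias), as in pten::ScaleKernel.
void ScaleDense(const ToyDense& x, float scale, float bias,
                bool bias_after_scale, ToyDense* out) {
  out->data.resize(x.data.size());
  for (std::size_t i = 0; i < x.data.size(); ++i) {
    out->data[i] = bias_after_scale ? scale * x.data[i] + bias
                                    : scale * (x.data[i] + bias);
  }
}

// Mirrors the new Compute(): branch once on the input kind, then call the
// kernel that matches it; the SR path owns its own metadata handling.
void Compute(const ToyVar& in, float scale, float bias, bool bias_after_scale,
             ToyVar* out) {
  if (auto* sr = std::get_if<ToySelectedRows>(&in)) {
    auto& out_sr = out->emplace<ToySelectedRows>();
    out_sr.rows = sr->rows;
    out_sr.height = sr->height;
    ScaleDense(sr->value, scale, bias, bias_after_scale, &out_sr.value);
  } else {
    ScaleDense(std::get<ToyDense>(in), scale, bias, bias_after_scale,
               &out->emplace<ToyDense>());
  }
}

int main() {
  ToyVar in = ToySelectedRows{{0, 3}, 8, {{1.f, 2.f}}};
  ToyVar out;
  Compute(in, 2.f, 1.f, /*bias_after_scale=*/true, &out);
  const auto& sr = std::get<ToySelectedRows>(out);
  std::cout << sr.value.data[0] << ' ' << sr.value.data[1] << '\n';  // 3 5
}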
diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h
index 800c01f6916..210309cad60 100644
--- a/paddle/pten/core/kernel_registry.h
+++ b/paddle/pten/core/kernel_registry.h
@@ -74,6 +74,9 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
                  std::type_index(typeid(const std::vector<DenseTensor>&))) {
        args_def->AppendInput(
            default_key.backend(), default_tensor_layout, default_key.dtype());
+      } else if (arg_type == std::type_index(typeid(const SelectedRows&))) {
+        args_def->AppendInput(
+            default_key.backend(), default_tensor_layout, default_key.dtype());
      } else if (arg_type == std::type_index(typeid(DenseTensor*))) {
        args_def->AppendOutput(
            default_key.backend(), default_tensor_layout, default_key.dtype());
@@ -81,6 +84,9 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
                  std::type_index(typeid(std::vector<DenseTensor*>))) {
        args_def->AppendOutput(
            default_key.backend(), default_tensor_layout, default_key.dtype());
+      } else if (arg_type == std::type_index(typeid(SelectedRows*))) {
+        args_def->AppendOutput(
+            default_key.backend(), default_tensor_layout, default_key.dtype());
      } else {
        // Attribute deal with
        // TODO(chenweihang): now here allow any types of attribute, maybe
diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
index d48572db5a2..5d6f5376c77 100644
--- a/paddle/pten/core/kernel_utils.h
+++ b/paddle/pten/core/kernel_utils.h
@@ -20,6 +20,7 @@
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_context.h"
 #include "paddle/pten/core/kernel_def.h"
+#include "paddle/pten/core/selected_rows.h"
 #include "paddle/pten/core/sparse_coo_tensor.h"
 #include "paddle/pten/core/sparse_csr_tensor.h"

@@ -215,6 +216,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows);

  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor);
@@ -223,8 +225,6 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCsrTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCsrTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCsrTensor);
-  // TODO(chenweihang): adapt SelectedRows
-  // PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRowsTensor);

  /* Attribute Helpers */

@@ -244,14 +244,13 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows);

  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor);

  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCsrTensor);
  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCsrTensor);
-  // TODO(chenweihang): adapt SelectedRows
-  // PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRowsTensor);

  /* End case */
  template <typename T>
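The registry change classifies each kernel parameter with std::type_index: const references register as inputs, mutable pointers as outputs, and everything else falls through to the attribute path. A minimal sketch of that classification follows; the types and the Classify function are illustrative stand-ins, not the pten code. One subtlety worth a comment: typeid strips const and reference, so typeid(const T&) compares equal to typeid(T), while pointer types keep a distinct identity, which is what makes outputs reliably distinguishable from inputs:

#include <iostream>
#include <typeindex>
#include <typeinfo>

// Toy stand-ins for the real tensor classes.
struct ToyDenseTensor {};
struct ToySelectedRows {};

// Mirrors KernelArgsParseFunctor's routing: const references become kernel
// inputs, mutable pointers become outputs, anything else is an attribute.
const char* Classify(std::type_index arg_type) {
  if (arg_type == std::type_index(typeid(const ToyDenseTensor&)) ||
      arg_type == std::type_index(typeid(const ToySelectedRows&))) {
    return "input";
  }
  if (arg_type == std::type_index(typeid(ToyDenseTensor*)) ||
      arg_type == std::type_index(typeid(ToySelectedRows*))) {
    return "output";
  }
  return "attribute";
}

int main() {
  std::cout << Classify(typeid(const ToySelectedRows&)) << '\n';  // input
  std::cout << Classify(typeid(ToySelectedRows*)) << '\n';        // output
  std::cout << Classify(typeid(float)) << '\n';                   // attribute
}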
diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc
index 4f999ac4d17..4c248e6a014 100644
--- a/paddle/pten/kernels/cpu/scale_kernel.cc
+++ b/paddle/pten/kernels/cpu/scale_kernel.cc
@@ -57,7 +57,7 @@ PT_REGISTER_KERNEL(scale,
                    pten::ScaleKernel,
                    float,
                    double,
-                   paddle::platform::bfloat16,
+                   pten::dtype::bfloat16,
                    uint8_t,
                    int8_t,
                    int16_t,
diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu
index 5aba001267a..6cf84acd9dc 100644
--- a/paddle/pten/kernels/gpu/scale_kernel.cu
+++ b/paddle/pten/kernels/gpu/scale_kernel.cu
@@ -72,7 +72,7 @@ PT_REGISTER_KERNEL(scale,
                    pten::ScaleKernel,
                    float,
                    double,
-                   paddle::platform::float16,
+                   pten::dtype::float16,
                    uint8_t,
                    int8_t,
                    int16_t,
diff --git a/paddle/pten/kernels/scale_kernel.h b/paddle/pten/kernels/scale_kernel.h
index 1cd11f0b878..7fe627b4427 100644
--- a/paddle/pten/kernels/scale_kernel.h
+++ b/paddle/pten/kernels/scale_kernel.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include "paddle/pten/common/scalar.h"
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/selected_rows.h"
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/empty_kernel.h"
 namespace pten {
@@ -28,6 +29,14 @@ void ScaleKernel(const Context& dev_ctx,
                 bool bias_after_scale,
                 DenseTensor* out);

+template <typename T, typename Context>
+void ScaleSR(const Context& dev_ctx,
+             const SelectedRows& x,
+             const Scalar& scale,
+             float bias,
+             bool bias_after_scale,
+             SelectedRows* out);
+
 template <typename T, typename Context>
 DenseTensor Scale(const Context& dev_ctx,
                   const DenseTensor& x,
diff --git a/paddle/pten/kernels/selected_rows/scale_kernel.cc b/paddle/pten/kernels/selected_rows/scale_kernel.cc
new file mode 100644
index 00000000000..8b29f1d6c53
--- /dev/null
+++ b/paddle/pten/kernels/selected_rows/scale_kernel.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/kernels/scale_kernel.h"
+
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/common/bfloat16.h"
+namespace pten {
+
+template <typename T, typename Context>
+void ScaleSR(const Context& dev_ctx,
+             const SelectedRows& x,
+             const Scalar& scale,
+             float bias,
+             bool bias_after_scale,
+             SelectedRows* out) {
+  if (x.value().data() != out->value().data()) {
+    out->set_rows(x.rows());
+    out->set_height(x.height());
+  }
+  pten::ScaleKernel<T>(
+      dev_ctx, x.value(), scale, bias, bias_after_scale, out->mutable_value());
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(scale_sr,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::ScaleSR,
+                   float,
+                   double,
+                   pten::dtype::bfloat16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_REGISTER_KERNEL(scale_sr,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::ScaleSR,
+                   float,
+                   double,
+                   pten::dtype::float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+#endif
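Two details in the new kernel are worth noting: the row metadata is copied only when the input and output value buffers differ, so an in-place scale leaves the metadata untouched, and the actual arithmetic is delegated to the existing dense ScaleKernel applied to the value tensor. A toy C++ model of that structure (toy types, not the pten API):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

struct ToySelectedRows {
  std::vector<int64_t> rows;
  int64_t height = 0;
  std::vector<float> value;
};

// Dense scale, standing in for pten::ScaleKernel on the value tensor.
void ScaleDense(const std::vector<float>& x, float scale, float bias,
                bool bias_after_scale, std::vector<float>* out) {
  out->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    (*out)[i] = bias_after_scale ? scale * x[i] + bias : scale * (x[i] + bias);
}

// Toy mirror of ScaleSR: copy rows/height only when input and output are
// physically distinct buffers, then scale the packed values.
void ScaleSR(const ToySelectedRows& x, float scale, float bias,
             bool bias_after_scale, ToySelectedRows* out) {
  if (x.value.data() != out->value.data()) {  // not an in-place call
    out->rows = x.rows;
    out->height = x.height;
  }
  ScaleDense(x.value, scale, bias, bias_after_scale, &out->value);
}

int main() {
  ToySelectedRows x{{1, 4}, 6, {10.f, 20.f}};
  ScaleSR(x, 0.5f, 0.f, true, &x);  // in-place: metadata left as-is
  assert(x.rows.size() == 2 && x.height == 6);
  std::cout << x.value[0] << ' ' << x.value[1] << '\n';  // 5 10
}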
diff --git a/paddle/pten/tests/api/scale_api.h b/paddle/pten/tests/api/scale_api.h
index 0ba1d6a0e3f..4b5a0a7daf4 100644
--- a/paddle/pten/tests/api/scale_api.h
+++ b/paddle/pten/tests/api/scale_api.h
@@ -101,7 +101,7 @@ static void ScaleCPU(DataType kernel_dtype,
       break;
     }
     case pten::DataType::BFLOAT16: {
-      pten::ScaleKernel<paddle::platform::bfloat16>(
+      pten::ScaleKernel<pten::dtype::bfloat16>(
           dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
       break;
     }
--
GitLab
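For context, the ScaleCPU test helper touched above maps a runtime DataType to a compile-time template instantiation through a switch, which is why each supported dtype (now spelled pten::dtype::bfloat16) needs its own case. A minimal, self-contained sketch of that dispatch pattern; the enum and function names here are hypothetical:

#include <cstdint>
#include <iostream>

enum class ToyDataType { FLOAT32, FLOAT64, INT32 };

// Compile-time typed kernel: y = scale * x + bias, element by element.
template <typename T>
void ScaleT(const void* in, void* out, std::size_t n, float scale, float bias) {
  auto* x = static_cast<const T*>(in);
  auto* y = static_cast<T*>(out);
  for (std::size_t i = 0; i < n; ++i) y[i] = static_cast<T>(scale * x[i] + bias);
}

// Runtime dtype -> template instantiation, one case per registered dtype.
void Scale(ToyDataType dtype, const void* in, void* out, std::size_t n,
           float scale, float bias) {
  switch (dtype) {
    case ToyDataType::FLOAT32: ScaleT<float>(in, out, n, scale, bias); break;
    case ToyDataType::FLOAT64: ScaleT<double>(in, out, n, scale, bias); break;
    case ToyDataType::INT32:   ScaleT<int32_t>(in, out, n, scale, bias); break;
  }
}

int main() {
  float in[3] = {1.f, 2.f, 3.f};
  float out[3];
  Scale(ToyDataType::FLOAT32, in, out, 3, 2.f, 0.5f);
  std::cout << out[0] << ' ' << out[1] << ' ' << out[2] << '\n';  // 2.5 4.5 6.5
}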