From b76ef0451029c1dd32d164074dad8177d9930e48 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 13 Dec 2021 11:04:26 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90PTen=E3=80=91Add=20variadic=20args=20k?= =?UTF-8?q?ernel=20for=20PTen=20API=20to=20replace=20KernelContext=20(#379?= =?UTF-8?q?42)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add variadic_args kernel in pten * merge develop code * add variadic_args kernel and benchmark * change dynamic_cast to static_cast for DeviceContext * merge the code * modify code format * refactor variadic kernel function --- paddle/pten/api/include/kernel_signature.h | 111 +++++++ paddle/pten/core/kernel_factory.h | 10 +- paddle/pten/core/kernel_registry.h | 68 +++-- paddle/pten/core/kernel_utils.h | 23 +- paddle/pten/tests/api/CMakeLists.txt | 1 + paddle/pten/tests/api/scale_api.h | 282 ++++++++++++++++++ paddle/pten/tests/api/test_scale_benchmark.cc | 62 ++++ python/paddle/utils/code_gen/api.yaml | 2 +- python/paddle/utils/code_gen/api_gen.py | 58 ++-- 9 files changed, 557 insertions(+), 60 deletions(-) create mode 100644 paddle/pten/api/include/kernel_signature.h create mode 100644 paddle/pten/tests/api/scale_api.h create mode 100644 paddle/pten/tests/api/test_scale_benchmark.cc diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h new file mode 100644 index 00000000000..1ff91f7e94a --- /dev/null +++ b/paddle/pten/api/include/kernel_signature.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/common/scalar_array.h" +#include "paddle/pten/core/dense_tensor.h" + +// This header is used to cast kernel function from void* to original form of +// function currently. +// It may be generated automatically in the future. + +namespace pten { + +using DeviceContext = paddle::platform::DeviceContext; + +using add_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + const DenseTensor&, + int, + DenseTensor*); + +using cast_kernel = void (*)( + const DeviceContext&, const DenseTensor&, DataType, DataType, DenseTensor*); + +using divide_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + const DenseTensor&, + int, + DenseTensor*); + +using dot_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + const DenseTensor&, + DenseTensor*); + +using flatten_kernel = + void (*)(const DeviceContext&, const DenseTensor&, int, int, DenseTensor*); + +using full_kernel = void (*)(const DeviceContext&, + const ScalarArray&, + const Scalar&, + DenseTensor*); + +using full_like_kernel = void (*)(const DeviceContext&, + const Scalar&, + DenseTensor*); + +using matmul_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + const DenseTensor&, + bool, + bool, + DenseTensor*); + +using mean_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + const std::vector&, + bool, + bool, + DataType, + DataType, + DenseTensor*); + +using multiply_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + const 
DenseTensor&, + int, + DenseTensor*); + +using reshape_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + const std::vector&, + DenseTensor*); + +using scale_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + const Scalar&, + float, + bool, + DenseTensor*); + +using sum_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + const std::vector&, + bool, + bool, + DataType, + DataType, + DenseTensor*); + +using subtract_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + const DenseTensor&, + int, + DenseTensor*); + +} // namespace pten diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h index 4adfb703503..e61143bf142 100644 --- a/paddle/pten/core/kernel_factory.h +++ b/paddle/pten/core/kernel_factory.h @@ -228,10 +228,17 @@ class Kernel { // for map element contruct Kernel() = default; - explicit Kernel(KernelFn fn) : fn_(fn) {} + explicit Kernel(KernelFn fn, void* variadic_fn) + : fn_(fn), variadic_fn_(variadic_fn) {} void operator()(KernelContext* ctx) const { fn_(ctx); } + template + Fn GetVariadicKernelFn() const { + auto* func = reinterpret_cast(variadic_fn_); + return func; + } + KernelArgsDef* mutable_args_def() { return &args_def_; } const KernelArgsDef& args_def() const { return args_def_; } @@ -244,6 +251,7 @@ class Kernel { private: KernelFn fn_{nullptr}; + void* variadic_fn_ = nullptr; KernelArgsDef args_def_; }; diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index 645e77fc60f..83ee8fd94b6 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -101,14 +101,16 @@ struct KernelRegistrar { DataType dtype, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, - KernelFn kernel_fn) { + KernelFn kernel_fn, + void* variadic_kernel_fn) { ConstructKernel(kernel_name_cstr, backend, layout, dtype, args_parse_fn, args_def_fn, - kernel_fn); + kernel_fn, + variadic_kernel_fn); } KernelRegistrar(const char* 
kernel_name_cstr, @@ -116,7 +118,8 @@ struct KernelRegistrar { DataLayout layout, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, - KernelFn kernel_fn) { + KernelFn kernel_fn, + void* variadic_kernel_fn) { for (size_t dtype = static_cast(DataType::BOOL); dtype != static_cast(DataType::NUM_DATA_TYPES); dtype++) { @@ -126,7 +129,8 @@ struct KernelRegistrar { static_cast(dtype), args_parse_fn, args_def_fn, - kernel_fn); + kernel_fn, + variadic_kernel_fn); } } @@ -137,10 +141,11 @@ struct KernelRegistrar { DataType dtype, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, - KernelFn kernel_fn) { + KernelFn kernel_fn, + void* variadic_kernel_fn) { KernelName kernel_name(kernel_name_cstr); KernelKey kernel_key(backend, layout, dtype); - Kernel kernel(kernel_fn); + Kernel kernel(kernel_fn, variadic_kernel_fn); args_parse_fn(kernel_key, kernel.mutable_args_def()); args_def_fn(&kernel); KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; @@ -356,7 +361,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } #define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ registrar_id, \ @@ -375,7 +381,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ PT_ID, \ backend, \ @@ -400,7 +407,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ PT_ID, \ backend, \ @@ -425,7 +433,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - 
PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ PT_ID, \ backend, \ @@ -450,7 +459,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ PT_ID, \ backend, \ @@ -475,7 +485,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ PT_ID, \ backend, \ @@ -500,7 +511,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ PT_ID, \ backend, \ @@ -525,7 +537,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ PT_ID, \ backend, \ @@ -550,7 +563,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ PT_ID, \ backend, \ @@ -575,7 +589,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ PT_ID, \ backend, \ @@ -600,7 +615,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + 
PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ PT_ID, \ backend, \ @@ -625,7 +641,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ PT_ID, \ backend, \ @@ -650,7 +667,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ PT_ID, \ backend, \ @@ -675,7 +693,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ PT_ID, \ backend, \ @@ -700,7 +719,8 @@ struct KernelRegistrar { ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ PT_ID, \ backend, \ @@ -728,7 +748,8 @@ struct KernelRegistrar { DATATYPE(dtype), \ ::pten::KernelArgsParseFunctor::Parse, \ args_def_fn, \ - PT_KERNEL(kernel_fn)); \ + PT_KERNEL(kernel_fn), \ + PT_VARIADIC_KERNEL(kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \ void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*) @@ -750,7 +771,8 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::pten::KernelArgsParseFunctor::Parse, \ &__PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name, \ - PT_KERNEL(kernel_fn)); \ + PT_KERNEL(kernel_fn), \ + PT_VARIADIC_KERNEL(kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \ void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name(::pten::Kernel* 
kernel) diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h index dcfc8c55644..ad7387e1529 100644 --- a/paddle/pten/core/kernel_utils.h +++ b/paddle/pten/core/kernel_utils.h @@ -44,6 +44,10 @@ using XPUContext = paddle::platform::XPUDeviceContext; #define PT_KERNEL(...) \ ::pten::KernelImpl::Compute +#define PT_VARIADIC_KERNEL(...) \ + reinterpret_cast(&::pten::KernelImpl::VariadicCompute) + #define PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx) \ template \ struct KernelCallHelper { \ @@ -169,10 +173,19 @@ struct TypeTag {}; template struct KernelImpl; -template -struct KernelImpl { +template +struct KernelImpl { static void Compute(KernelContext* ctx) { - KernelCallHelper>::template Compute<0, 0, 0, 0>(ctx); + KernelCallHelper>::template Compute<0, 0, 0, 0>(ctx); + } + + static void VariadicCompute(const DeviceContext& dev_ctx, Args... args) { + return kernel_fn(static_cast(dev_ctx), std::forward(args)...); } private: @@ -224,12 +237,12 @@ struct KernelImpl { template struct KernelCallHelper> { template - static void Compute(KernelContext* ctx, Args&... args) { + static void Compute(KernelContext* ctx, DevCtx dev_ctx, Args&... 
args) { static_assert(dev_ctx_idx > 0, "Kernel should pass DeviceContext as argument."); static_assert(out_idx > 0, "Kernel should have output argument."); // TODO(chenweihang): check dev_ctx, in, attr, out number - return kernel_fn(args...); + return kernel_fn(dev_ctx, args...); } }; }; diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt index 46f2ef8be7c..88faa773dfa 100644 --- a/paddle/pten/tests/api/CMakeLists.txt +++ b/paddle/pten/tests/api/CMakeLists.txt @@ -21,3 +21,4 @@ cc_test(test_to_api SRCS test_to_api.cc DEPS pten_tensor pten_api pten_api_utils cc_test(test_slice_api SRCS test_slice_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_sum_api SRCS test_sum_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_scale_api SRCS test_scale_api.cc DEPS pten_tensor pten_api pten_api_utils) +cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS pten_tensor pten_api pten_api_utils) diff --git a/paddle/pten/tests/api/scale_api.h b/paddle/pten/tests/api/scale_api.h new file mode 100644 index 00000000000..565bb0f139d --- /dev/null +++ b/paddle/pten/tests/api/scale_api.h @@ -0,0 +1,282 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "glog/logging.h" + +#include "paddle/pten/api/include/tensor.h" +#include "paddle/pten/api/lib/api_registry.h" +#include "paddle/pten/api/lib/kernel_dispatch.h" +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/common/scalar_array.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/include/core.h" +#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/kernels/cpu/math.h" +#include "paddle/pten/kernels/cuda/math.h" + +namespace paddle { +namespace experimental { + +PADDLE_API Tensor scale_kernel_context(const Tensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( + "scale", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "scale API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + VLOG(6) << "scale API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + auto kernel_context = pten::KernelContext(dev_ctx); + + auto dense_x = std::dynamic_pointer_cast(x.impl()); + kernel_context.EmplaceBackInput(dense_x); + + kernel_context.EmplaceBackAttr(pten::Scalar(scale)); + 
kernel_context.EmplaceBackAttr(bias); + kernel_context.EmplaceBackAttr(bias_after_scale); + + auto out_meta = pten::UnchangedInferMeta(dense_x->meta()); + + const auto allocator = + std::make_shared( + pten::TransToFluidPlace(kernel_backend)); + auto dense_out = std::make_shared(allocator, out_meta); + kernel_context.EmplaceBackOutput(dense_out); + + Tensor out; + out.set_impl(dense_out); + + kernel(&kernel_context); + return out; +} + +static void ScaleCPU(DataType kernel_dtype, + const pten::CPUContext& dev_ctx, + const pten::DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + pten::DenseTensor* dense_out) { + switch (kernel_dtype) { + case pten::DataType::FLOAT64: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::FLOAT32: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::BFLOAT16: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::INT64: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::INT32: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::INT16: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::INT8: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::UINT8: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + default: { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Detected unsupported data type." 
+ "Only Float64, Float32, BFloat16, Int64, Int32, Int16, Int8, UInt8 " + "are supported for now.")); + break; + } + } +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +static void ScaleCUDA(DataType kernel_dtype, + const pten::CUDAContext& dev_ctx, + const pten::DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + pten::DenseTensor* dense_out) { + switch (kernel_dtype) { + case pten::DataType::FLOAT64: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::FLOAT32: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::FLOAT16: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::INT64: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::INT32: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::INT16: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::INT8: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + case pten::DataType::UINT8: { + pten::Scale( + dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); + break; + } + default: { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Detected unsupported data type." 
+ "Only Float64, Float32, Float16, Int64, Int32, Int16, Int8, UInt8 " + "are " + "supported for now.")); + break; + } + } +} +#endif + +Tensor scale_switch_case(const Tensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( + "scale", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "scale API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + VLOG(6) << "scale API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + auto dense_x = std::dynamic_pointer_cast(x.impl()); + + auto out_meta = pten::UnchangedInferMeta(dense_x->meta()); + const auto allocator = + std::make_shared( + pten::TransToFluidPlace(kernel_backend)); + auto dense_out = std::make_shared(allocator, out_meta); + + Tensor out; + out.set_impl(dense_out); + + switch (kernel_backend) { + case Backend::CPU: + ScaleCPU(kernel_data_type, + static_cast(*dev_ctx), + *dense_x, + scale, + bias, + bias_after_scale, + dense_out.get()); + break; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + case Backend::CUDA: + ScaleCUDA(kernel_data_type, + static_cast(*dev_ctx), + *dense_x, + scale, + bias, + bias_after_scale, + dense_out.get()); + break; 
+#endif + default: + PADDLE_THROW(paddle::platform::errors::Fatal( + "Detected unsupported backend." + "Only CPU and CUDA Backend are supported for now." + "Please double check if your backend falls into the above two " + "categories.")); + } + + return out; +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/pten/tests/api/test_scale_benchmark.cc b/paddle/pten/tests/api/test_scale_benchmark.cc new file mode 100644 index 00000000000..a873fd90736 --- /dev/null +++ b/paddle/pten/tests/api/test_scale_benchmark.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/pten/api/include/api.h" + +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/tests/api/scale_api.h" +#include "paddle/pten/tests/core/timer.h" + +namespace paddle { +namespace tests { + +TEST(API, scale) { + auto x = experimental::full( + {3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::Backend::CPU); + + const size_t cycles = 300; + pten::tests::Timer timer; + double t1{}, t2{}, t3{}; + + for (size_t i = 0; i < cycles; ++i) { + timer.tic(); + for (size_t i = 0; i < cycles; ++i) { + auto out = experimental::scale_kernel_context(x, 2.0, 1.0, true); + } + t1 += timer.toc(); + + timer.tic(); + for (size_t i = 0; i < cycles; ++i) { + auto out = experimental::scale(x, 2.0, 1.0, true); + } + t2 += timer.toc(); + + timer.tic(); + for (size_t i = 0; i < cycles; ++i) { + auto out = experimental::scale_switch_case(x, 2.0, 1.0, true); + } + t3 += timer.toc(); + } + + LOG(INFO) << "The cost of kernel_context is " << t1 << "ms."; + LOG(INFO) << "The cost of variadic_args_kernel_fn is " << t2 << "ms."; + LOG(INFO) << "The cost of switch_case is " << t3 << "ms."; +} + +} // namespace tests +} // namespace paddle diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 2c47bbe4566..0625000cb88 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -65,7 +65,7 @@ param : [x, dtype, layout] kernel : func : full_like - param : [x, value] + param : [value] data_type : dtype > x backend : place > x layout : layout > x diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index ed3bb1dc5f1..c7e04301ca5 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -263,60 +263,57 @@ PADDLE_API {self.output} {self.api}({self.args['args_declare']}); auto out_meta = pten::{infer_meta['func']}({param_code}); 
""" - def gene_kernel_context(self, input_names, attrs, infer_meta, kernel_param): + def get_kernel_args(self, input_names, attrs, kernel_param): + input_tensor_code = "" + for input_name in input_names: + # set input code + input_tensor_code = input_tensor_code + f""" + auto {self.prefix_tensor_name}{input_name} = std::dynamic_pointer_cast({input_name}.impl());""" + attr_names = attrs['names'] if kernel_param is None: kernel_param = input_names + attr_names - input_code_str = "" - attr_code_str = "" + kernel_args = "*dev_ctx, " for param in kernel_param: if param in input_names: - # set input for kernel_context - input_code_str = input_code_str + f""" - auto {self.prefix_tensor_name}{param} = std::dynamic_pointer_cast({param}.impl()); - kernel_context.EmplaceBackInput({self.prefix_tensor_name}{param});""" - + kernel_args = kernel_args + "*" + self.prefix_tensor_name + param + ", " elif param in attr_names: # set attr for kernel_context if 'ScalarArray' in attrs['attr_info'][param][0]: param = 'pten::ScalarArray(' + param + ')' elif 'Scalar' in attrs['attr_info'][param][0]: param = 'pten::Scalar(' + param + ')' - attr_code_str = attr_code_str + f""" - kernel_context.EmplaceBackAttr({param});""" - + kernel_args = kernel_args + param + ", " elif isinstance(param, bool): - attr_code_str = attr_code_str + f""" - kernel_context.EmplaceBackAttr({str(param).lower()});""" - + kernel_args = kernel_args + str(param).lower() + ", " else: - attr_code_str = attr_code_str + f""" - kernel_context.EmplaceBackAttr({param});""" + kernel_args = kernel_args + str(param) + ", " + return input_tensor_code, kernel_args[:-2] + + def gene_api_code(self): + if self.is_base_api: + input_tensors, kernel_args = self.get_kernel_args( + self.args['inputs']['names'], self.args['attrs'], + self.kernel['param']) + return f""" +PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{ +{self.gene_kernel_select(self.args['inputs']['names'], self.args['attrs'], self.kernel)} - return f""" 
auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - auto kernel_context = pten::KernelContext(dev_ctx); -{input_code_str} -{attr_code_str} -{self.gene_infer_meta(input_names, attr_names, infer_meta)} +{input_tensors} +{self.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], self.infer_meta)} const auto allocator = std::make_shared( pten::TransToFluidPlace(kernel_backend)); auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); Tensor out; - out.set_impl(dense_out);""" + out.set_impl(dense_out); - def gene_api_code(self): - if self.is_base_api: - return f""" -PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{ -{self.gene_kernel_select(self.args['inputs']['names'], self.args['attrs'], self.kernel)} -{self.gene_kernel_context(self.args['inputs']['names'], self.args['attrs'], self.infer_meta, self.kernel['param'])} + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)({kernel_args}, dense_out.get()); - kernel(&kernel_context); return out; }} """ @@ -344,6 +341,7 @@ def source_include(header_file_path): #include "glog/logging.h" +#include "paddle/pten/api/include/kernel_signature.h" #include "paddle/pten/api/lib/api_registry.h" #include "paddle/pten/api/lib/kernel_declare.h" #include "paddle/pten/api/lib/kernel_dispatch.h" -- GitLab