From a8879215aa58a5c93c86ac78ac247b6d50bf31c1 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Thu, 27 Jan 2022 15:19:35 +0800 Subject: [PATCH] [PluggableDevice] Add custom kernel support based on pten kernel management (#38848) * [Demo] custom kernel based on pten kernel * merge and npu custom work well * del comments * delete other code * fix CUDAContext * fix not found small_vector.h * support NPU * fix NPUContext * fix DeviceContext support * add UT * fix call * add UT * fix * fix for comments and ut * add MACRO control * fix multi input output * support env CUSTOM_DEVICE_ROOT * deal with special cases * fix for Windows * try coverage with test_custom_kernel_dot.py * fix test_custom_kernel_dot * fix test_custom_kernel_dot * fix merge * fix merge * fix CI * update * merge and fix * remove WITH_CUSTOM_KERNEL * fix merge * merge and fix * fix ut * fix ut for mac * add more UT * add more UT * fix --- paddle/fluid/framework/CMakeLists.txt | 6 +- paddle/fluid/framework/custom_kernel.cc | 411 +++++++++++ paddle/fluid/framework/custom_kernel.h | 38 + paddle/fluid/framework/custom_kernel_test.cc | 283 ++++++++ .../fluid/framework/op_kernel_info_helper.h | 71 ++ paddle/fluid/inference/api/CMakeLists.txt | 11 +- paddle/fluid/platform/CMakeLists.txt | 5 +- paddle/fluid/platform/init.cc | 14 + paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/pten/api/all.h | 1 + paddle/pten/api/ext/op_kernel_info.h | 663 ++++++++++++++++++ paddle/pten/api/lib/CMakeLists.txt | 7 +- paddle/pten/api/lib/op_kernel_info.cc | 114 +++ paddle/pten/core/kernel_def.h | 2 +- paddle/pten/core/kernel_factory.h | 2 - paddle/testing/CMakeLists.txt | 2 +- python/paddle/fluid/core.py | 15 + python/paddle/fluid/tests/CMakeLists.txt | 1 + .../fluid/tests/custom_kernel/CMakeLists.txt | 2 + .../fluid/tests/custom_kernel/__init__.py | 13 + .../tests/custom_kernel/custom_kernel_dot.cc | 53 ++ .../custom_kernel/custom_kernel_dot_setup.py | 53 ++ .../custom_kernel/test_custom_kernel_dot.py | 62 ++ .../custom_kernel/test_custom_kernel_load.py | 80 +++ python/setup.py.in | 3 +- 25 files changed, 1897 insertions(+), 17 deletions(-) create mode 100644 paddle/fluid/framework/custom_kernel.cc create mode 100644 paddle/fluid/framework/custom_kernel.h create mode 100644 paddle/fluid/framework/custom_kernel_test.cc create mode 100644 paddle/fluid/framework/op_kernel_info_helper.h create mode 100644 paddle/pten/api/ext/op_kernel_info.h create mode 100644 paddle/pten/api/lib/op_kernel_info.cc create mode 100644 python/paddle/fluid/tests/custom_kernel/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/custom_kernel/__init__.py create mode 100644 python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc create mode 100644 python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py create mode 100644 python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py create mode 100644 python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 27ba88b56f0..de3a957df08 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -437,11 +437,12 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api) - +cc_library(custom_kernel SRCS custom_kernel.cc DEPS + tensor attribute framework_proto op_registry 
operator dynamic_loader string_helper pten_tensor op_kernel_info pten_api)
 #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
 #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
 
-set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
+set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator custom_kernel)
 
 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 
@@ -451,3 +452,4 @@ endif()
 
 cc_test(scope_guard_test SRCS scope_guard_test.cc)
 cc_test(pten_utils_test SRCS pten_utils_test.cc DEPS pten_utils)
+cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor)
diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc
new file mode 100644
index 00000000000..a5498623941
--- /dev/null
+++ b/paddle/fluid/framework/custom_kernel.cc
@@ -0,0 +1,411 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#if defined _WIN32 || defined __APPLE__
+#else
+#define _LINUX
+#endif
+
+#include "paddle/fluid/framework/custom_kernel.h"
+#include <dirent.h>
+#include <regex>
+#include "paddle/fluid/framework/op_kernel_info_helper.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/api/ext/op_kernel_info.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+DECLARE_bool(run_pten_kernel);
+
+namespace paddle {
+
+namespace framework {
+
+// Set pten::Kernel's args_def_ from OpKernelInfo, because we cannot set it
+// on the pten::Kernel directly without exposing pten::KernelArgsDef while
+// parsing the custom user function.
+static void ParseArgs(const OpKernelInfo& op_kernel_info,
+                      pten::KernelArgsDef* args_def) {
+  auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info);
+  auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info);
+  auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info);
+
+  for (auto& input : input_defs) {
+    args_def->AppendInput(input.backend, input.layout, input.dtype);
+  }
+  for (auto& output : output_defs) {
+    args_def->AppendOutput(output.backend, output.layout, output.dtype);
+  }
+  for (auto& attr : attribute_defs) {
+    args_def->AppendAttribute(attr.type_index);
+  }
+}
+
+// Definition of the call function for custom pten kernels.
+static void RunKernelFunc(pten::KernelContext* ctx,
+                          const OpKernelInfo& op_kernel_info) {
+  VLOG(3) << "[CUSTOM KERNEL] RunKernelFunc begin...";
+
+  // input and output sizes are the actual numbers of Tensors held by the
+  // context, not the numbers of kernel parameters
+  size_t input_size = ctx->InputsSize();
+  size_t output_size = ctx->OutputsSize();
+  size_t attr_size = ctx->AttrsSize();
+
+  // parameter numbers of the unified user kernel function
+  auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info);
+  auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info);
+  auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info);
+
+  PADDLE_ENFORCE_GE(input_size, input_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of ctx inputs (%d) must be no less than "
+                        "the size of kernel input_defs (%d).",
+                        input_size, input_defs.size()));
+
+  PADDLE_ENFORCE_GE(output_size, output_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of ctx outputs (%d) must be no less than "
+                        "the size of kernel output_defs (%d).",
+                        output_size, output_defs.size()));
+
+  PADDLE_ENFORCE_EQ(attr_size, attribute_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of ctx attributes (%d) must be equal to "
+                        "the size of kernel attribute_defs (%d).",
+                        attr_size, attribute_defs.size()));
+
+  VLOG(3) << "[CUSTOM KERNEL] Input num: " << input_defs.size()
+          << "[tensor size:" << input_size << "]"
+          << " Attribute num: " << attribute_defs.size()
+          << " Output num: " << output_defs.size()
+          << "[tensor size:" << output_size << "].";
+
+  // Inputs mapping
+  std::vector<paddle::experimental::Tensor> custom_ins;
+  std::vector<std::vector<paddle::experimental::Tensor>> custom_vec_ins;
+  for (size_t in_idx = 0; in_idx < input_defs.size(); ++in_idx) {
+    VLOG(3) << "Mapping Input[" << in_idx << "]";
+    const std::pair<int, int> range = ctx->InputRangeAt(in_idx);
+
+    // is_vector tells if this Input is a Tensor or a std::vector<Tensor>
+    if (!input_defs.at(in_idx).is_vector) {
+      paddle::experimental::Tensor custom_t;
+      auto& ctx_tensor = ctx->InputAt<pten::DenseTensor>(range.first);
+      custom_t.set_impl(std::make_shared<pten::DenseTensor>(ctx_tensor));
+      custom_ins.emplace_back(custom_t);
+    } else {
+      std::vector<paddle::experimental::Tensor> custom_vec_in;
+      auto ctx_tensor_vec =
+          ctx->MoveInputsBetween<pten::DenseTensor>(range.first, range.second);
+      for (auto& ctx_tensor : ctx_tensor_vec) {
+        paddle::experimental::Tensor custom_t;
+        custom_t.set_impl(std::make_shared<pten::DenseTensor>(ctx_tensor));
+        custom_vec_in.emplace_back(custom_t);
+      }
+      custom_vec_ins.emplace_back(custom_vec_in);
+    }
+    VLOG(3) << "Mapped Input[" << in_idx << "] with range[" << range.first
+            << "," << range.second << ").";
+  }
+
+  // Attributes mapping
+  std::vector<paddle::any> custom_attrs;
+  for (size_t attr_idx = 0; attr_idx < attribute_defs.size(); ++attr_idx) {
+    VLOG(3) << "Mapping Attribute[" << attr_idx << "]";
+    if (attribute_defs[attr_idx].type_index == std::type_index(typeid(bool))) {
+      bool arg = ctx->AttrAt<bool>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(int))) {
+      int arg = ctx->AttrAt<int>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(float))) {
+      float arg = ctx->AttrAt<float>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(double))) {
+      double arg = ctx->AttrAt<double>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(int64_t))) {
+      int64_t arg = ctx->AttrAt<int64_t>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(pten::dtype::float16))) {
+      pten::dtype::float16 arg = ctx->AttrAt<pten::dtype::float16>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(DataType))) {
+      DataType arg = ctx->AttrAt<DataType>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(const Scalar&))) {
+      const Scalar& arg = ctx->AttrAt<const Scalar&>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(const std::vector<int64_t>&))) {
+      const std::vector<int64_t>& arg =
+          ctx->AttrAt<const std::vector<int64_t>&>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(const ScalarArray&))) {
+      const ScalarArray& arg = ctx->AttrAt<const ScalarArray&>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(const std::vector<int>&))) {
+      const std::vector<int>& arg =
+          ctx->AttrAt<const std::vector<int>&>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else {
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported attribute attribute_defs[%d].type_index", attr_idx));
+    }
+    VLOG(3) << "Mapped Attribute[" << attr_idx << "]";
+  }
+
+  // Outputs mapping
+  std::vector<paddle::experimental::Tensor*> custom_outs;
+  std::vector<std::vector<paddle::experimental::Tensor*>> custom_vec_outs;
+  std::vector<std::shared_ptr<pten::DenseTensor>> custom_outs_ptr;
+  std::vector<std::vector<std::shared_ptr<pten::DenseTensor>>>
+      custom_vec_outs_ptr;
+
+  for (size_t out_idx = 0; out_idx < output_defs.size(); ++out_idx) {
+    VLOG(3) << "Mapping Output[" << out_idx << "]";
+    const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
+
+    // is_vector tells if this Output is a Tensor* or a std::vector<Tensor*>
+    if (!output_defs.at(out_idx).is_vector) {
+      auto* ctx_tensor = ctx->MutableOutputAt<pten::DenseTensor>(range.first);
+      auto* custom_t = new paddle::experimental::Tensor();
+      auto custom_t_ptr = std::make_shared<pten::DenseTensor>(*ctx_tensor);
+      custom_t->set_impl(custom_t_ptr);
+      custom_outs.emplace_back(custom_t);
+      custom_outs_ptr.emplace_back(custom_t_ptr);
+    } else {
+      std::vector<paddle::experimental::Tensor*> custom_vec_out;
+      std::vector<std::shared_ptr<pten::DenseTensor>> custom_vec_out_ptr;
+      auto ctx_tensor_vec = ctx->MutableOutputBetween<pten::DenseTensor>(
+          range.first, range.second);
+      for (auto ctx_tensor : ctx_tensor_vec) {
+        auto* custom_t = new paddle::experimental::Tensor();
+        auto custom_t_ptr = std::make_shared<pten::DenseTensor>(*ctx_tensor);
+        custom_t->set_impl(custom_t_ptr);
+        custom_vec_out.emplace_back(custom_t);
+        custom_vec_out_ptr.emplace_back(custom_t_ptr);
+      }
+      custom_vec_outs.emplace_back(custom_vec_out);
+      custom_vec_outs_ptr.emplace_back(custom_vec_out_ptr);
+    }
+    VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first
+            << "," << range.second << ").";
+  }
+
+  // DeviceContext
+  // In pten, the first parameter XXContext is decided at registration time
+  // through a template parameter, but custom kernel functions take a unified
+  // DeviceContext as the first parameter of user_kernel_fn, so we use the
+  // backend from OpKernelInfo to decide the XXContext. In this temporary
+  // simple DeviceContext we only set the necessary info on dev_ctx (such as
+  // the stream in NPUContext); more related work should be done once
+  // pten::DeviceContext is exposed externally.
+  DeviceContext dev_ctx;
+  auto& backend = OpKernelInfoHelper::GetBackend(op_kernel_info);
+  if (backend == pten::Backend::CPU) {
+    // do nothing
+  } else {
+    LOG(ERROR) << "[CUSTOM KERNEL] Unsupported kernel backend: " << backend
+               << " with compiled Paddle.";
+    return;
+  }
+
+  auto& user_kernel_fn = OpKernelInfoHelper::GetKernelFn(op_kernel_info);
+  // call user function
+  user_kernel_fn(dev_ctx, custom_ins, custom_vec_ins, custom_attrs,
+                 &custom_outs, &custom_vec_outs);
+
+  VLOG(3) << "[CUSTOM KERNEL] finished calling the user kernel function.";
+
+  // NOTE: Map back the output tensors with the stored shared_ptrs.
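+  // The user function wrote its results through the paddle::Tensor wrappers
+  // created above, whose impls are the shared pten::DenseTensor copies held
+  // in custom_outs_ptr and custom_vec_outs_ptr, so those results are copied
+  // back into the KernelContext's output DenseTensors here. The loops run
+  // backwards because each step pops the matching shared_ptr off the back.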
+  for (int out_idx = output_defs.size() - 1; out_idx >= 0; --out_idx) {
+    VLOG(3) << "Mapping Back Output[" << out_idx << "]";
+    const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
+
+    // is_vector tells if this Output is a Tensor* or a std::vector<Tensor*>
+    if (!output_defs.at(out_idx).is_vector) {
+      auto* ctx_tensor = ctx->MutableOutputAt<pten::DenseTensor>(range.first);
+      *ctx_tensor = *(custom_outs_ptr.back().get());
+      custom_outs_ptr.pop_back();
+    } else {
+      auto ctx_tensor_vec = ctx->MutableOutputBetween<pten::DenseTensor>(
+          range.first, range.second);
+      auto custom_vec_ptr_out = custom_vec_outs_ptr.back();
+      for (int idx = ctx_tensor_vec.size() - 1; idx >= 0; --idx) {
+        *(ctx_tensor_vec[idx]) = *(custom_vec_ptr_out.back().get());
+        custom_vec_ptr_out.pop_back();
+      }
+      custom_vec_outs_ptr.pop_back();
+    }
+    VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first
+            << "," << range.second << "].";
+  }
+
+  // delete the paddle::Tensor objects new-ed for outputs before calling the
+  // user kernel function
+  for (size_t i = 0; i < custom_outs.size(); ++i) {
+    delete custom_outs[i];
+  }
+  for (size_t i = 0; i < custom_vec_outs.size(); ++i) {
+    for (size_t j = 0; j < custom_vec_outs[i].size(); ++j) {
+      delete custom_vec_outs[i][j];
+    }
+  }
+}
+
+void RegisterKernelWithMetaInfo(
+    const std::vector<OpKernelInfo>& op_kernel_infos) {
+  PADDLE_ENFORCE_EQ(FLAGS_run_pten_kernel, true,
+                    platform::errors::Unimplemented(
+                        "Custom Kernel depends on pten kernels being "
+                        "enabled."));
+
+  for (size_t i = 0; i < op_kernel_infos.size(); ++i) {
+    auto& kernel_info = op_kernel_infos[i];
+    auto op_type = OpKernelInfoHelper::GetOpName(kernel_info);
+    auto kernel_key = OpKernelInfoHelper::GetKernelKey(kernel_info);
+
+    VLOG(3) << "[CUSTOM KERNEL] registering [" << op_type << "]" << kernel_key;
+
+    // 1.Check whether this kernel is valid for a specific operator
+    PADDLE_ENFORCE_EQ(
+        pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type), true,
+        platform::errors::InvalidArgument(
+            "[CUSTOM KERNEL] %s is not ready for custom kernel registering.",
+            op_type));
+
+    // 2.Check whether this kernel_key has already been registered
+    PADDLE_ENFORCE_EQ(
+        pten::KernelFactory::Instance().kernels()[op_type].find(kernel_key),
+        pten::KernelFactory::Instance().kernels()[op_type].end(),
+        platform::errors::InvalidArgument(
+            "[CUSTOM KERNEL] The operator <%s>'s kernel: %s already exists "
+            "in Paddle; please contribute a PR if the kernel code needs to "
+            "be optimized. Custom kernels do NOT support replacing existing "
+            "kernels in Paddle.",
+            op_type, kernel_key));
+
+    // pten::KernelFn
+    pten::KernelFn kernel_fn = [kernel_info](pten::KernelContext* ctx) {
+      VLOG(3) << "[CUSTOM KERNEL] run custom PTEN kernel func in lambda.";
+      RunKernelFunc(ctx, kernel_info);
+    };
+    // variadic_kernel_fn
+    void* variadic_kernel_fn =
+        OpKernelInfoHelper::GetVariadicKernelFn(kernel_info);
+    pten::Kernel kernel(kernel_fn, variadic_kernel_fn);
+    // args info
+    ParseArgs(kernel_info, kernel.mutable_args_def());
+    // register custom kernel to pten::KernelFactory
+    pten::KernelFactory::Instance().kernels()[op_type][kernel_key] = kernel;
+    VLOG(3) << "[CUSTOM KERNEL] Succeeded in registering operator <" << op_type
+            << ">'s kernel " << kernel_key << " to Paddle. "
+            << "It will be used like native ones.";
+  }
+}
+
+void RegisterKernelWithMetaInfoMap(
+    const paddle::OpKernelInfoMap& op_kernel_info_map) {
+  auto& kernel_info_map = op_kernel_info_map.GetMap();
+  VLOG(3) << "[CUSTOM KERNEL] size of op_kernel_info_map: "
+          << kernel_info_map.size();
+
+  // pair: {op_type, OpKernelInfo}
+  for (auto& pair : kernel_info_map) {
+    VLOG(3) << "[CUSTOM KERNEL] pair first -> op name: " << pair.first;
+    RegisterKernelWithMetaInfo(pair.second);
+  }
+}
+
+void LoadCustomKernelLib(const std::string& dso_lib_path) {
+#ifdef _LINUX
+  void* dso_handle = nullptr;
+  int dynload_flags = RTLD_NOW | RTLD_LOCAL;
+  dso_handle = dlopen(dso_lib_path.c_str(), dynload_flags);
+
+  // dso_lib_path MUST be valid
+  PADDLE_ENFORCE_NOT_NULL(
+      dso_handle,
+      platform::errors::InvalidArgument(
+          "Failed to open library: %s with error: %s", dso_lib_path,
+          dlerror()));
+
+  typedef OpKernelInfoMap& get_op_kernel_info_map_t();
+  auto* func = reinterpret_cast<get_op_kernel_info_map_t*>(
+      dlsym(dso_handle, "PD_GetOpKernelInfoMap"));
+
+  if (func == nullptr) {
+    LOG(INFO) << "Skipped lib [" << dso_lib_path << "]: failed to find the "
+              << "PD_GetOpKernelInfoMap symbol in this lib.";
+    return;
+  }
+  auto& op_kernel_info_map = func();
+  RegisterKernelWithMetaInfoMap(op_kernel_info_map);
+  LOG(INFO) << "Succeeded in loading custom kernels in lib: " << dso_lib_path;
+#else
+  VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux.";
+#endif
+  return;
+}
+
+// List all libs under the given path
+std::vector<std::string> ListAllLib(const std::string& libs_path) {
+  DIR* dir = nullptr;
+  dir = opendir(libs_path.c_str());
+
+  // libs_path MUST be valid
+  PADDLE_ENFORCE_NOT_NULL(dir, platform::errors::InvalidArgument(
+                                   "Failed to open path: %s", libs_path));
+
+  dirent* ptr = nullptr;
+  std::vector<std::string> libs;
+  std::regex express(".*\\.so");
+  std::match_results<std::string::iterator> results;
+  while ((ptr = readdir(dir)) != nullptr) {
+    std::string filename(ptr->d_name);
+    if (std::regex_match(filename.begin(), filename.end(), results, express)) {
+      libs.emplace_back(libs_path + '/' + filename);
+      LOG(INFO) << "Found lib [" << filename << "]";
+    } else {
+      VLOG(3) << "Skipped file [" << filename << "] without the .so suffix";
+    }
+  }
+  closedir(dir);
+  return libs;
+}
+
+// Load custom kernels from the given path
+void LoadCustomKernel(const std::string& libs_path) {
+  VLOG(3) << "Try loading custom libs from: [" << libs_path << "]";
+  std::vector<std::string> libs = ListAllLib(libs_path);
+  for (auto& lib_path : libs) {
+    LoadCustomKernelLib(lib_path);
+  }
+  LOG(INFO) << "Finished in LoadCustomKernel with libs_path: [" << libs_path
+            << "]";
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/custom_kernel.h b/paddle/fluid/framework/custom_kernel.h
new file mode 100644
index 00000000000..0c12bdfa8cb
--- /dev/null
+++ b/paddle/fluid/framework/custom_kernel.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. 
*/
+
+#pragma once
+
+#include "paddle/pten/api/ext/op_kernel_info.h"
+
+namespace paddle {
+namespace framework {
+
+// Load custom kernel libs from the given path
+void LoadCustomKernel(const std::string& libs_path);
+
+void LoadCustomKernelLib(const std::string& dso_lib_path);
+
+// Load custom kernel API: register kernels after the user has compiled them
+void LoadOpKernelInfoAndRegister(const std::string& dso_name);
+
+// Register custom kernel API: register kernels directly
+void RegisterKernelWithMetaInfoMap(
+    const paddle::OpKernelInfoMap& op_kernel_info_map);
+
+// Interface for selectively registering custom kernels.
+void RegisterKernelWithMetaInfo(
+    const std::vector<OpKernelInfo>& op_kernel_infos);
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/custom_kernel_test.cc b/paddle/fluid/framework/custom_kernel_test.cc
new file mode 100644
index 00000000000..708b7bbe8a5
--- /dev/null
+++ b/paddle/fluid/framework/custom_kernel_test.cc
@@ -0,0 +1,283 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#if defined _WIN32 || defined __APPLE__
+#else
+#define _LINUX
+#endif
+
+#include "paddle/fluid/framework/custom_kernel.h"
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include "paddle/extension.h"
+#include "paddle/fluid/framework/op_kernel_info_helper.h"
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_context.h"
+#include "paddle/pten/core/kernel_factory.h"
+#include "paddle/pten/infermeta/binary.h"
+#include "paddle/utils/small_vector.h"
+
+#ifdef _LINUX
+// user kernel function
+namespace custom_kernel {
+
+// Here we use dot for the test
+// This test will fail when these two kernels are supported in the framework
+// input 3: two Tensors and one std::vector<Tensor>
+// attribute 11: fake_attributes
+// output 2: one Tensor* and one std::vector<Tensor*>
+template <typename T>
+void FakeDot(const paddle::CPUContext& dev_ctx, const paddle::Tensor& x,
+             const paddle::Tensor& y,
+             const std::vector<paddle::Tensor>& fake_input_vec,
+             bool fake_attr_bool, int fake_attr_int, float fake_attr_float,
+             double fake_attr_double, int64_t fake_attr_int64,
+             pten::dtype::float16 fake_attr_f16, pten::DataType fake_attr_dtype,
+             const pten::Scalar& fake_attr_scalar,
+             const pten::ScalarArray& fake_attr_scalar_array,
+             const std::vector<int64_t>& fake_attr_int64_vec,
+             const std::vector<int>& fake_attr_int_vec, paddle::Tensor* out,
+             std::vector<paddle::Tensor*> fake_out_vec) {
+  // print param info
+  std::cout << "fake_input_vec.size: " << fake_input_vec.size() << std::endl;
+  std::cout << "fake_attr_bool: " << fake_attr_bool << std::endl;
+  std::cout << "fake_attr_int: " << fake_attr_int << std::endl;
+  std::cout << "fake_attr_float: " << fake_attr_float << std::endl;
+  std::cout << "fake_attr_double: " << fake_attr_double << std::endl;
+  std::cout << "fake_attr_int64: " << fake_attr_int64 << std::endl;
+  std::cout << "fake_attr_f16: " << fake_attr_f16 << std::endl;
+  std::cout << "fake_attr_dtype: " << fake_attr_dtype << std::endl;
+  std::cout << "fake_attr_int64_vec: " << fake_attr_int64_vec.size()
+            << std::endl;
+  std::cout << "fake_attr_int_vec: " << fake_attr_int_vec.size() << std::endl;
+  std::cout << "fake_out_vec: " << fake_out_vec.size() << std::endl;
+
+  // assert check
+  assert(fake_input_vec.size() == 2);
+  assert(fake_attr_bool == false);
+  assert(fake_attr_int == 1);
+  assert(fake_attr_float == 2);
+  assert(fake_attr_double == 3);
+  assert(fake_attr_int64 == 4);
+  assert(fake_attr_f16 == 5);
+  assert(fake_attr_dtype == pten::DataType::UINT32);
+  assert(fake_attr_int64_vec.size() == 0);
+  assert(fake_attr_int_vec.size() == 0);
+  assert(fake_out_vec.size() == 2);
+
+  auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
+  auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
+  auto* z = out->mutable_data<T>(paddle::PlaceType::kCPU);
+  auto shape = x.shape();
+  auto const N = x.numel();
+  auto const B = shape[shape.size() - 1];
+  for (int j = 0; j < N / B; j++) {
+    T ss = 0;
+    for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++);
+    z[j] = ss;
+  }
+}
+}  // namespace custom_kernel
+
+PD_REGISTER_KERNEL(dot, CPU, ALL_LAYOUT, UINT8,
+                   custom_kernel::FakeDot<uint8_t>) {
+  /* some arg definitions can be done here;
+   * the only usable parameter is OpKernelInfo* kernel */
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UINT8);
+}
+
+// The code above stores the dot kernel's info into OpKernelInfoMap
+TEST(CustomKernel, custom_kernel_dot) {
+  std::string op_name = "dot";
+  pten::Backend backend = pten::Backend::CPU;
+  pten::DataLayout layout = pten::DataLayout::ANY;
+  pten::DataType dtype = pten::DataType::UINT8;
+
+  // 1.custom kernel info parsed and stored
+  EXPECT_TRUE(paddle::OpKernelInfoMap::Instance().GetMap().find("dot") !=
+              paddle::OpKernelInfoMap::Instance().GetMap().end());
+
+  // 2.info check
+  EXPECT_EQ(
+      1, static_cast<int>(paddle::OpKernelInfoMap::Instance()["dot"].size()));
+  EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetBackend() ==
+              backend);
+  EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetDataLayout() ==
+              layout);
+  EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetDataType() ==
+              dtype);
+
+  // 3.register
+  EXPECT_TRUE(pten::KernelFactory::Instance().kernels().end() !=
+              pten::KernelFactory::Instance().kernels().find("dot"));
+
+  pten::KernelKey kernel_key(backend, layout, dtype);
+  EXPECT_TRUE(
+      pten::KernelFactory::Instance().kernels()["dot"].find(kernel_key) ==
+      pten::KernelFactory::Instance().kernels()["dot"].end());
+
+  paddle::framework::RegisterKernelWithMetaInfoMap(
+      paddle::OpKernelInfoMap::Instance());
+
+  EXPECT_TRUE(
+      pten::KernelFactory::Instance().kernels()["dot"].find(kernel_key) !=
+      pten::KernelFactory::Instance().kernels()["dot"].end());
+
+  // 4.kernel select
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      op_name, kernel_key);
+
+  // 5.prepare parameters for kernel
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc.get(), pten::DenseTensorMeta(pten::DataType::UINT8,
+                                         paddle::framework::make_ddim({2, 3}),
+                                         pten::DataLayout::NCHW));
+  auto* dense_x_data =
+      dense_x->mutable_data<uint8_t>(paddle::platform::CPUPlace());
+
+  auto dense_y = std::make_shared<pten::DenseTensor>(
+      alloc.get(), pten::DenseTensorMeta(pten::DataType::UINT8,
+                                         paddle::framework::make_ddim({2, 3}),
+                                         pten::DataLayout::NCHW));
+  auto* dense_y_data =
+      dense_y->mutable_data<uint8_t>(paddle::platform::CPUPlace());
+
+  // dot x,y and result
+  uint8_t sum[2] = {0, 0};
+  for (size_t i = 0; i < 2; ++i) {
+    for 
(size_t j = 0; j < 3; ++j) { + dense_x_data[i * 3 + j] = (i * 3 + j); + dense_y_data[i * 3 + j] = (i * 3 + j); + sum[i] += (i * 3 + j) * (i * 3 + j); + } + } + + // 6.prepare kernel_context + auto& pool = paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); + auto kernel_context = pten::KernelContext(dev_ctx); + kernel_context.EmplaceBackInput(dense_x.get()); // idx:0, index:[0,1) + kernel_context.EmplaceBackInput(dense_y.get()); // idx:1, index:[1,2) + + // fake_input_vec: idx:2, index:[2,4) + size_t fake_input_vec_idx = 2; + size_t fake_input_vec_index_start = 2; + size_t fake_input_vec_index_end = 4; + kernel_context.EmplaceBackInputWithoutSetRange(dense_x.get()); + kernel_context.EmplaceBackInputWithoutSetRange(dense_y.get()); + kernel_context.AssignInputRange( + std::make_pair(fake_input_vec_index_start, fake_input_vec_index_end), + fake_input_vec_idx); + + bool fake_attr_bool = false; + int fake_attr_int = 1; + float fake_attr_float = 2.0; + double fake_attr_double = 3.0; + int64_t fake_attr_int64 = 4; + pten::dtype::float16 fake_attr_f16 = pten::dtype::float16(5); + pten::DataType fake_attr_dtype = pten::DataType::UINT32; + paddle::framework::LoDTensor tmp_tensor; + tmp_tensor.mutable_data({1}, pten::TransToFluidPlace(backend)); + pten::Scalar fake_attr_scalar = + paddle::experimental::MakePtenScalar(tmp_tensor); + pten::ScalarArray fake_attr_scalar_array; + std::vector fake_attr_int64_vec; + std::vector fake_attr_int_vec; + + kernel_context.EmplaceBackAttr(fake_attr_bool); + kernel_context.EmplaceBackAttr(fake_attr_int); + kernel_context.EmplaceBackAttr(fake_attr_float); + kernel_context.EmplaceBackAttr(fake_attr_double); + kernel_context.EmplaceBackAttr(fake_attr_int64); + kernel_context.EmplaceBackAttr(fake_attr_f16); + kernel_context.EmplaceBackAttr(fake_attr_dtype); + kernel_context.EmplaceBackAttr(fake_attr_scalar); + kernel_context.EmplaceBackAttr(fake_attr_scalar_array); + kernel_context.EmplaceBackAttr(fake_attr_int64_vec); + kernel_context.EmplaceBackAttr(fake_attr_int_vec); + + auto out_meta = pten::DotInferMeta(dense_x->meta(), dense_y->meta()); + auto dense_out = std::make_shared( + pten::make_intrusive( + pten::TransToFluidPlace(backend)), + std::move(out_meta)); + kernel_context.EmplaceBackOutput(dense_out.get()); // idx:0 index:[0,1) + + // fake_input_vec: idx:1, index:[1,3) + size_t fake_out_vec_idx = 1; + size_t fake_out_vec_index_start = 1; + size_t fake_out_vec_index_end = 3; + kernel_context.EmplaceBackOutputWithoutSetRange(dense_out.get()); + kernel_context.EmplaceBackOutputWithoutSetRange(dense_out.get()); + kernel_context.AssignOutputRange( + std::make_pair(fake_out_vec_index_start, fake_out_vec_index_end), + fake_out_vec_idx); + + // 7.kernel call + kernel(&kernel_context); + + // 8.check result + ASSERT_EQ(dense_out->dims().size(), 2); + ASSERT_EQ(dense_out->dims()[0], 2); + ASSERT_EQ(dense_out->numel(), 2); + ASSERT_EQ(dense_out->dtype(), pten::DataType::UINT8); + ASSERT_EQ(dense_out->layout(), pten::DataLayout::NCHW); + ASSERT_EQ(dense_out->initialized(), true); + + auto expect_result = sum; + auto actual_result0 = dense_out->data()[0]; + auto actual_result1 = dense_out->data()[1]; + ASSERT_EQ(expect_result[0], actual_result0); + ASSERT_EQ(expect_result[1], actual_result1); +} + +// test OpKernelInfoHelper +TEST(OpKernelInfoHelper, op_kernel_info_help_getters) { + using OpKernelInfoHelper = paddle::framework::OpKernelInfoHelper; + std::string op_name = "dot"; + pten::Backend backend = 
pten::Backend::CPU; + pten::DataLayout layout = pten::DataLayout::ANY; + pten::DataType dtype = pten::DataType::UINT8; + + auto op_kernel_info = paddle::OpKernelInfoMap::Instance()[op_name][0]; + + EXPECT_EQ(op_name, OpKernelInfoHelper::GetOpName(op_kernel_info)); + EXPECT_EQ(backend, OpKernelInfoHelper::GetBackend(op_kernel_info)); + EXPECT_EQ(layout, OpKernelInfoHelper::GetDataLayout(op_kernel_info)); + EXPECT_EQ(dtype, OpKernelInfoHelper::GetDataType(op_kernel_info)); + + EXPECT_EQ(pten::KernelKey(backend, layout, dtype), + OpKernelInfoHelper::GetKernelKey(op_kernel_info)); + + paddle::CustomKernelFunc kernel_fn = + PD_PT_KERNEL(custom_kernel::FakeDot); + EXPECT_EQ(kernel_fn, OpKernelInfoHelper::GetKernelFn(op_kernel_info)); + + void* variadic_func = PD_PT_VARIADIC_KERNEL(custom_kernel::FakeDot); + EXPECT_EQ(variadic_func, + OpKernelInfoHelper::GetVariadicKernelFn(op_kernel_info)); + + auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); + auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); + auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); + EXPECT_EQ(3, static_cast(input_defs.size())); + EXPECT_EQ(2, static_cast(output_defs.size())); + EXPECT_EQ(11, static_cast(attribute_defs.size())); +} +#endif diff --git a/paddle/fluid/framework/op_kernel_info_helper.h b/paddle/fluid/framework/op_kernel_info_helper.h new file mode 100644 index 00000000000..271ac04bb19 --- /dev/null +++ b/paddle/fluid/framework/op_kernel_info_helper.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include "paddle/pten/api/ext/op_kernel_info.h"
+#include "paddle/pten/core/kernel_factory.h"
+
+namespace paddle {
+namespace framework {
+
+class OpKernelInfoHelper {
+ public:
+  static const std::string& GetOpName(const paddle::OpKernelInfo& info) {
+    return info.op_name_;
+  }
+
+  static const pten::Backend& GetBackend(const paddle::OpKernelInfo& info) {
+    return info.backend_;
+  }
+
+  static const pten::DataLayout& GetDataLayout(
+      const paddle::OpKernelInfo& info) {
+    return info.layout_;
+  }
+
+  static const pten::DataType& GetDataType(const paddle::OpKernelInfo& info) {
+    return info.dtype_;
+  }
+
+  static pten::KernelKey GetKernelKey(const paddle::OpKernelInfo& info) {
+    return pten::KernelKey(info.backend_, info.layout_, info.dtype_);
+  }
+
+  static const CustomKernelFunc& GetKernelFn(const paddle::OpKernelInfo& info) {
+    return info.kernel_fn_;
+  }
+
+  static void* GetVariadicKernelFn(const paddle::OpKernelInfo& info) {
+    return info.variadic_kernel_fn_;
+  }
+
+  static const paddle::SmallVector<TensorArgDef>& GetInputDefs(
+      const paddle::OpKernelInfo& info) {
+    return info.input_defs_;
+  }
+
+  static const paddle::SmallVector<TensorArgDef>& GetOutputDefs(
+      const paddle::OpKernelInfo& info) {
+    return info.output_defs_;
+  }
+
+  static const paddle::SmallVector<AttributeArgDef>& GetAttributeDefs(
+      const paddle::OpKernelInfo& info) {
+    return info.attribute_defs_;
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 53b92c13363..6c465e62780 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -30,14 +30,15 @@ cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg}
 cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 
+set(paddle_inference_api_deps lod_tensor scope reset_tensor_array
+    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator custom_kernel)
+
 if(WITH_CRYPTO)
-  cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
-else()
-  cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
+  list(APPEND paddle_inference_api_deps paddle_crypto)
 endif()
 
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS ${paddle_inference_api_deps})
+
 if(WIN32)
   target_link_libraries(paddle_inference_api gflags)
 endif()
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index eb7057bcd50..a151c824a22 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -116,9 +116,12 @@ endif()
 
 cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
 
+# separate init from device_context to avoid cycle dependencies
+cc_library(init SRCS init.cc DEPS device_context custom_kernel)
+
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
+cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
    place 
pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context) if(WITH_XPU) diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index e9d2f8e901e..f7a86e5aac7 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -53,6 +53,8 @@ limitations under the License. */ #include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif +#include "paddle/fluid/framework/custom_kernel.h" + DECLARE_int32(paddle_num_threads); PADDLE_DEFINE_EXPORTED_int32( multiple_of_cupti_buffer_size, 1, @@ -224,6 +226,18 @@ void InitDevices(const std::vector devices) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) places.emplace_back(platform::CUDAPinnedPlace()); #endif + const char *custom_kernel_root_p = std::getenv("CUSTOM_DEVICE_ROOT"); + if (!custom_kernel_root_p) { + VLOG(3) << "Env [CUSTOM_DEVICE_ROOT] is not set."; + } else { + std::string custom_kernel_root(custom_kernel_root_p); + if (!custom_kernel_root.empty()) { + LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root; + framework::LoadCustomKernel(custom_kernel_root); + } else { + VLOG(3) << "ENV [CUSTOM_DEVICE_ROOT] is empty."; + } + } platform::DeviceContextPool::Init(places); #ifndef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 08ca575c2b9..a6e155f70e6 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,4 +1,4 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune +set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_wrapper prune feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator diff --git a/paddle/pten/api/all.h b/paddle/pten/api/all.h index 4451a5c372b..5744b18c4d2 100644 --- a/paddle/pten/api/all.h +++ b/paddle/pten/api/all.h @@ -40,6 +40,7 @@ limitations under the License. */ #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/api/ext/dll_decl.h" #include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/api/ext/op_kernel_info.h" #include "paddle/pten/api/ext/op_meta_info.h" #include "paddle/pten/api/ext/place.h" #include "paddle/pten/api/ext/tensor_compat.h" diff --git a/paddle/pten/api/ext/op_kernel_info.h b/paddle/pten/api/ext/op_kernel_info.h new file mode 100644 index 00000000000..bcfff61bc6f --- /dev/null +++ b/paddle/pten/api/ext/op_kernel_info.h @@ -0,0 +1,663 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/pten/api/ext/dll_decl.h" +#include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/api/ext/op_meta_info.h" +#include "paddle/pten/api/include/tensor.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/common/scalar_array.h" +#include "paddle/utils/any.h" +#include "paddle/utils/small_vector.h" + +/** + * Custom Kernel Info Define. + * + * Used to maintain custom kernel core information before registering. + * Pten is working on exposing headers, custom kernel depends on them, and + * we prefer outer users following pten-kernel-function-style and registering + * macro. So, we have to re-implement some structs or class and functions to + * make sure users' custom kernel functions can be registered to pten. + * + * TODO(Aganlengzi): We should upgrade following pten. + */ + +namespace paddle { +namespace framework { +class PADDLE_API OpKernelInfoHelper; +} // namespace framework + +// TODO(Aganlengzi): Simple DeviceContext temporarily for stream getting +// before pten::DeviceContext is exposed. +class DeviceContext { + public: + DeviceContext() { stream_ = nullptr; } + void set_stream(void* stream) { stream_ = stream; } + void* stream() const { return stream_; } + + private: + void* stream_; +}; +class CPUContext : public DeviceContext {}; + +// TODO(Aganlengzi): Use paddle::Tensor before DenseTensor is exposed +using Tensor = paddle::experimental::Tensor; +using Scalar = pten::Scalar; +using ScalarArray = pten::ScalarArray; + +// Record custom kernel core information +// We can not use pten::KernelFn directly, so users' custom kernel function +// is signatured to `CustomKernelFunc', notice that the first parameter is +// fixed to `const DeviceContext&'. +using CustomKernelFunc = + void (*)(const DeviceContext& dev_ctx, + const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + std::vector* outputs, + std::vector>* vec_outputs); + +////////////////////// Kernel Function (PD_PT_KERNEL) //////////////////////// +#define PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(device_ctx) \ + template \ + struct CustomComputeCallHelper { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... pargs) { \ + static_assert(in_idx == 0, \ + "Kernel's DeviceContext should appear before Inputs."); \ + static_assert(vec_in_idx == 0, \ + "Kernel's DeviceContext should appear before Inputs."); \ + static_assert( \ + attr_idx == 0, \ + "Kernel's DeviceContext should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's DeviceContext should appear before Outputs."); \ + static_assert(vec_out_idx == 0, \ + "Kernel's DeviceContext should appear before Outputs."); \ + const device_ctx& arg = static_cast(dev_ctx); \ + CustomComputeCallHelper::template Compute( \ + dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } \ + } + +#define PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(tensor_type) \ + template \ + struct CustomComputeCallHelper { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... 
pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + static_assert(vec_out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const Tensor& arg = inputs[in_idx]; \ + CustomComputeCallHelper::template Compute( \ + dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } \ + } + +#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ + template \ + struct CustomComputeCallHelper&, Tail...> { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + static_assert(vec_out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const std::vector& arg = vec_inputs[vec_in_idx]; \ + CustomComputeCallHelper::template Compute( \ + dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } \ + } + +#define PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ + template \ + struct CustomComputeCallHelper { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... pargs) { \ + static_assert(out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + static_assert(vec_out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + try { \ + attr_type arg = paddle::any_cast(attrs[attr_idx]); \ + return CustomComputeCallHelper::template Compute< \ + dev_ctx_idx, \ + in_idx, \ + vec_in_idx, \ + attr_idx + 1, \ + out_idx, \ + vec_out_idx>(dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } catch (paddle::bad_any_cast&) { \ + PD_THROW( \ + "Attribute cast error in custom operator. Expected " #attr_type \ + " value."); \ + } \ + } \ + } + +#define PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type) \ + template \ + struct CustomComputeCallHelper { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... pargs) { \ + tensor_type* arg = (*outputs)[out_idx]; \ + CustomComputeCallHelper::template Compute( \ + dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } \ + } + +#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ + template \ + struct CustomComputeCallHelper, Tail...> { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... 
pargs) { \ + std::vector arg = (*vec_outputs)[vec_out_idx]; \ + CustomComputeCallHelper::template Compute( \ + dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } \ + } + +template +struct PtenTypeTag {}; + +template +struct CustomKernelFuncImpl; + +template +struct CustomKernelFuncImpl { + static void Compute(const DeviceContext& dev_ctx, + const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + std::vector* outputs, + std::vector>* vec_outputs) { + CustomComputeCallHelper>:: + template Compute<0, 0, 0, 0, 0, 0>( + dev_ctx, inputs, vec_inputs, attrs, outputs, vec_outputs); + } + + // NOTE: Tensor in args is paddle::Tensor but not DenseTensor + static void VariadicCompute(const DeviceContext& dev_ctx, Args... args) { + return impl_fn(static_cast(dev_ctx), std::forward(args)...); + } + + private: + template + struct CustomComputeCallHelper; + + /* DeviceContext Helpers */ + PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(CPUContext); + + /* Input Helpers */ + PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(Tensor); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(Tensor); + + /* Attribute Helpers */ + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(pten::dtype::float16); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + + /* Output Helpers */ + PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(Tensor); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(Tensor); + + // End: base template + template + struct CustomComputeCallHelper> { + template + static void Compute(const DeviceContext& dev_ctx, + const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + std::vector* outputs, + std::vector>* vec_outputs, + DevCtx device_ctx, + Args... args) { + return impl_fn(device_ctx, args...); + } + }; +}; + +#define PD_PT_KERNEL(...) \ + ::paddle::CustomKernelFuncImpl::Compute + +#define PD_PT_VARIADIC_KERNEL(...) \ + reinterpret_cast( \ + &::paddle::CustomKernelFuncImpl::VariadicCompute) + +////////////////////// Op Kernel Info depended structs ////////////////////// +// TODO(Aganlengzi): Re-define TensorArgDef and AttributeArgDef temporarily. +// TensorArgDef follows pten::TensorArgDef in kernel_factory.h, the +// difference is that custom_kernel needs extra `is_vector' to ensure we can +// deal with case like vector with only one element. 
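+// For example, a kernel parameter of type `const std::vector<Tensor>&` is
+// parsed into a single TensorArgDef with is_vector=true (see FakeDot in
+// custom_kernel_test.cc), while a plain `const Tensor&` keeps the default
+// is_vector=false.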
+struct TensorArgDef { + pten::Backend backend; + pten::DataLayout layout; + pten::DataType dtype; + bool is_vector{false}; + + TensorArgDef(pten::Backend in_backend, + pten::DataLayout in_layout, + pten::DataType in_dtype, + bool is_vector = false) + : backend(in_backend), + layout(in_layout), + dtype(in_dtype), + is_vector(is_vector) {} + + TensorArgDef& SetBackend(pten::Backend in_backend) { + backend = in_backend; + return *this; + } + + TensorArgDef& SetDataLayout(pten::DataLayout in_layout) { + layout = in_layout; + return *this; + } + + TensorArgDef& SetDataType(pten::DataType in_dtype) { + dtype = in_dtype; + return *this; + } +}; + +// AttributeArgDef follows pten::AttributeArgDef in kernel_factory.h +struct AttributeArgDef { + std::type_index type_index; + + explicit AttributeArgDef(std::type_index type_index) + : type_index(type_index) {} +}; + +////////////////////// Op Kernel Info ////////////////////// +// OpKernelInfo stores all info parsed from user kernel function, includes: +// 0. op_name and kernel key(backend, data_layout and data_type) +// 1. unified custom kernel function +// 2. variadic kernel function(use paddle::Tensor) +// 3. args info and user defined change for specific arg +class PADDLE_API OpKernelInfo { + public: + explicit OpKernelInfo(const std::string& op_name, + pten::Backend backend, + pten::DataLayout data_layout, + pten::DataType data_type) + : op_name_(op_name), + backend_(backend), + layout_(data_layout), + dtype_(data_type) {} + + // format: PD_PT_KERNEL(...) + OpKernelInfo& SetKernelFn(CustomKernelFunc&& func); + // format: PD_PT_VARIADIC_KERNEL(...) + OpKernelInfo& SetVariadicKernelFn(void* func); + + // for Args parsing and storing + void AppendInput(pten::Backend backend, + pten::DataLayout layout, + pten::DataType dtype, + bool is_vector = false) { + input_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector)); + } + + void AppendOutput(pten::Backend backend, + pten::DataLayout layout, + pten::DataType dtype, + bool is_vector = false) { + output_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector)); + } + + void AppendAttribute(std::type_index type_index) { + attribute_defs_.emplace_back(AttributeArgDef(type_index)); + } + + // for Args user-def function + TensorArgDef& InputAt(size_t idx) { return input_defs_.at(idx); } + TensorArgDef& OutputAt(size_t idx) { return output_defs_.at(idx); } + + const pten::Backend& GetBackend() const { return backend_; } + const pten::DataLayout& GetDataLayout() const { return layout_; } + const pten::DataType& GetDataType() const { return dtype_; } + + private: + friend class framework::OpKernelInfoHelper; + + // 1. op info + std::string op_name_; + + // 2. kernel key info + pten::Backend backend_{pten::Backend::UNDEFINED}; + pten::DataLayout layout_{pten::DataLayout::UNDEFINED}; + pten::DataType dtype_{pten::DataType::UNDEFINED}; + + // 3. args info + paddle::SmallVector input_defs_{{}}; + paddle::SmallVector output_defs_{{}}; + paddle::SmallVector attribute_defs_{{}}; + + // 4. 
func info + CustomKernelFunc kernel_fn_{nullptr}; + void* variadic_kernel_fn_{nullptr}; +}; + +////////////////////// Op Kernel Args Parser ////////////////////// +// Define CustomKernelArgsParseFunctor for args parsing +// We have to store parsed info into OpKernelInfo before +// mapping to pten::KernelArgsDef in pten::Kernel +template +struct CustomKernelArgsParseFunctor; + +template +struct CustomKernelArgsParseFunctor { + using Args = std::tuple; + enum : std::size_t { Arity = sizeof...(Args_) }; + using Indices = std::make_index_sequence; + template + using Arg = typename std::tuple_element::type; + + static void Parse(OpKernelInfo* op_kernel_info) { + const pten::Backend& backend = op_kernel_info->GetBackend(); + const pten::DataLayout& layout = op_kernel_info->GetDataLayout(); + const pten::DataType& dtype = op_kernel_info->GetDataType(); + + auto default_tensor_layout = pten::DataLayout::NCHW; + if (layout != pten::DataLayout::ANY) { + default_tensor_layout = layout; + } + auto args_type = ParseArgType(Indices{}); + for (auto arg_type : args_type) { + if (arg_type == std::type_index(typeid(const CPUContext&))) { + // do nothing, skip context arg now + } else if (arg_type == std::type_index(typeid(const Tensor&))) { + op_kernel_info->AppendInput(backend, default_tensor_layout, dtype); + } else if (arg_type == + std::type_index(typeid(const std::vector&))) { + op_kernel_info->AppendInput( + backend, default_tensor_layout, dtype, true); + } else if (arg_type == std::type_index(typeid(Tensor*))) { + op_kernel_info->AppendOutput(backend, default_tensor_layout, dtype); + } else if (arg_type == std::type_index(typeid(std::vector))) { + op_kernel_info->AppendOutput( + backend, default_tensor_layout, dtype, true); + } else { + op_kernel_info->AppendAttribute(arg_type); + } + } + } + + private: + template + static std::vector ParseArgType( + std::index_sequence) { + return {std::type_index(typeid(Arg))...}; + } +}; + +#define PD_PT_ARGS_PARSE(...) \ + ::paddle::CustomKernelArgsParseFunctor::Parse + +//////////////// Op Kernel Info Map ///////////////// +// all user custom kernels information are stored in this map +class PADDLE_API OpKernelInfoMap { + public: + static OpKernelInfoMap& Instance() { + static OpKernelInfoMap g_custom_kernel_info_map; + return g_custom_kernel_info_map; + } + + std::vector& operator[](const std::string& name); + + const std::unordered_map>& GetMap() + const; + + private: + OpKernelInfoMap() = default; + std::unordered_map> map_; + + PD_DISABLE_COPY_AND_ASSIGN(OpKernelInfoMap); +}; + +//////////////// Op Kernel Info Builder ///////////////// +// format: PD_PT_ARGS_PARSE(...) 
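+// CustomKernelArgsParseFn fills an OpKernelInfo's arg defs from the user
+// function's signature; CustomKernelArgsDefFn then lets user code adjust
+// individual args, e.g. `kernel->OutputAt(0).SetDataType(...)` inside the
+// PD_REGISTER_KERNEL body.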
+using CustomKernelArgsParseFn = void (*)(OpKernelInfo* op_kernel_info); +using CustomKernelArgsDefFn = void (*)(OpKernelInfo* kernel); + +class PADDLE_API OpKernelInfoBuilder { + public: + explicit OpKernelInfoBuilder(std::string&& op_name, + pten::Backend backend, + pten::DataLayout data_layout, + pten::DataType data_type); + + OpKernelInfoBuilder& SetKernelFn(CustomKernelFunc func); + OpKernelInfoBuilder& SetVariadicKernelFn(void* func); + OpKernelInfoBuilder& ArgsParse(CustomKernelArgsParseFn func); + OpKernelInfoBuilder& ArgsDef(CustomKernelArgsDefFn func); + + private: + // op name + std::string op_name_; + + // kernel key info + pten::Backend backend_{pten::Backend::UNDEFINED}; + pten::DataLayout layout_{pten::DataLayout::UNDEFINED}; + pten::DataType dtype_{pten::DataType::UNDEFINED}; + + // ref current info ptr + OpKernelInfo* info_ptr_; +}; +/////////////////////// Custom kernel register API ///////////////////////// +// For inference: compile directly with framework +// Call after PD_REGISTER_KERNEL(...) +void RegisterAllCustomKernel(); + +// Using this api to load compiled custom kernel's dynamic library and +// register custom kernels +void LoadCustomKernelLib(const std::string& dso_name); + +//////////////// Custom kernel register macro ///////////////// +#define PD_BACKEND(arg__) pten::Backend::arg__ +#define PD_DATALAYOUT(arg__) pten::DataLayout::arg__ +#define PD_DATATYPE(arg__) pten::DataType::arg__ + +#define PD_REGISTER_KERNEL(name, backend, layout, dtype, func) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_kernel__##name##_##backend##_##layout##_##dtype, \ + "PD_REGISTER_KERNEL must be called in global namespace."); \ + void __PD_USER_args_def_##name##_##backend##_##layout_##dtype( \ + ::paddle::OpKernelInfo* op_kernel_info); \ + static ::paddle::OpKernelInfoBuilder \ + __op_kernel_info_##name##_##backend##_##layout##_##dtype = \ + ::paddle::OpKernelInfoBuilder(#name, \ + PD_BACKEND(backend), \ + PD_DATALAYOUT(layout), \ + PD_DATATYPE(dtype)) \ + .SetKernelFn(PD_PT_KERNEL(func)) \ + .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL(func)) \ + .ArgsParse(PD_PT_ARGS_PARSE(func)) \ + .ArgsDef( \ + &__PD_USER_args_def_##name##_##backend##_##layout_##dtype); \ + void __PD_USER_args_def_##name##_##backend##_##layout_##dtype( \ + ::paddle::OpKernelInfo* kernel) + +} // namespace paddle diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index d3088c44834..3fe4baca773 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -3,16 +3,17 @@ add_subdirectory(utils) cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) if (WITH_GPU) - nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) + nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api) elseif (WITH_ROCM) - hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) + hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api) else() - cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) + cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api) endif() cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor 
+
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt
index d3088c44834..3fe4baca773 100644
--- a/paddle/pten/api/lib/CMakeLists.txt
+++ b/paddle/pten/api/lib/CMakeLists.txt
@@ -3,16 +3,17 @@ add_subdirectory(utils)
 cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place)
 
 if (WITH_GPU)
-  nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
+  nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
 elseif (WITH_ROCM)
-  hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
+  hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
 else()
-  cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
+  cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
 endif()
 
 cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor pten_context kernel_factory)
 cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor)
+cc_library(op_kernel_info SRCS op_kernel_info.cc DEPS pten_tensor)
 
 # forward api file
 set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py)
diff --git a/paddle/pten/api/lib/op_kernel_info.cc b/paddle/pten/api/lib/op_kernel_info.cc
new file mode 100644
index 00000000000..db474d457c3
--- /dev/null
+++ b/paddle/pten/api/lib/op_kernel_info.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/ext/op_kernel_info.h"
+#include "paddle/fluid/framework/custom_kernel.h"
+
+namespace paddle {
+
+////////////////////// Op Kernel Info //////////////////////
+
+OpKernelInfo& OpKernelInfo::SetKernelFn(CustomKernelFunc&& func) {
+  kernel_fn_ = std::forward<CustomKernelFunc>(func);
+  return *this;
+}
+
+OpKernelInfo& OpKernelInfo::SetVariadicKernelFn(void* func) {
+  variadic_kernel_fn_ = func;
+  return *this;
+}
+
+//////////////// Op Kernel Info Map /////////////////
+
+std::vector<OpKernelInfo>& OpKernelInfoMap::operator[](
+    const std::string& name) {
+  return map_[name];
+}
+
+const std::unordered_map<std::string, std::vector<OpKernelInfo>>&
+OpKernelInfoMap::GetMap() const {
+  return map_;
+}
+
+//////////////// Op Kernel Info Builder /////////////////
+
+OpKernelInfoBuilder::OpKernelInfoBuilder(std::string&& op_name,
+                                         pten::Backend backend,
+                                         pten::DataLayout data_layout,
+                                         pten::DataType data_type) {
+  // 1. member assign
+  op_name_ = std::forward<std::string>(op_name);
+  backend_ = backend;
+  layout_ = data_layout;
+  dtype_ = data_type;
+
+  // 2. info parse
+  auto& info_vector = OpKernelInfoMap::Instance()[op_name_];
+  auto op_kernel_info = OpKernelInfo(op_name_, backend_, layout_, dtype_);
+  info_vector.emplace_back(std::move(op_kernel_info));
+
+  // 3. get current info ptr
+  info_ptr_ = &(info_vector.back());
+}
+
+OpKernelInfoBuilder& OpKernelInfoBuilder::SetKernelFn(CustomKernelFunc func) {
+  info_ptr_->SetKernelFn(std::forward<CustomKernelFunc>(func));
+  return *this;
+}
+
+OpKernelInfoBuilder& OpKernelInfoBuilder::SetVariadicKernelFn(void* func) {
+  info_ptr_->SetVariadicKernelFn(func);
+  return *this;
+}
+
+OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsParse(
+    CustomKernelArgsParseFn func) {
+  func(this->info_ptr_);
+  return *this;
+}
+
+OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsDef(CustomKernelArgsDefFn func) {
+  func(this->info_ptr_);
+  return *this;
+}
+
+/////////////////////// Op register API /////////////////////////
+
+// For inference: compiled directly with the framework.
+// Call after PD_REGISTER_KERNEL(...)
+void RegisterAllCustomKernel() {
+  auto& op_kernel_info_map = OpKernelInfoMap::Instance();
+  framework::RegisterKernelWithMetaInfoMap(op_kernel_info_map);
+}
+
+// Use this API to load a compiled custom kernel dynamic library and
+// register its custom kernels
+void LoadCustomKernelLib(const std::string& dso_name) {
+  framework::LoadCustomKernelLib(dso_name);
+}
+
+}  // namespace paddle
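+
+// Hedged note (assumption, not stated in this file): the loader side is
+// expected to resolve the exported symbol from the plugin .so roughly as
+//   auto get_map = reinterpret_cast<paddle::OpKernelInfoMap& (*)()>(
+//       dlsym(handle, "PD_GetOpKernelInfoMap"));
+//   auto& info_map = get_map();
+// which is why the accessor below must be exported with C linkage.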
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// C-API to get the global OpKernelInfoMap.
+paddle::OpKernelInfoMap& PD_GetOpKernelInfoMap() {
+  return paddle::OpKernelInfoMap::Instance();
+}
+
+#ifdef __cplusplus
+}  // end extern "C"
+#endif
diff --git a/paddle/pten/core/kernel_def.h b/paddle/pten/core/kernel_def.h
index 3884bb55e47..9b91720d86f 100644
--- a/paddle/pten/core/kernel_def.h
+++ b/paddle/pten/core/kernel_def.h
@@ -26,7 +26,7 @@ class KernelSignature;
 class ArgumentMappingContext;
 class InferMetaContext;
 
-using KernelFn = void (*)(KernelContext* ctx);
+using KernelFn = std::function<void(KernelContext* ctx)>;
 using KernelArgsDefFn = void (*)(Kernel* kernel);
 using KernelArgsParseFn = void (*)(const KernelKey& default_key,
                                    KernelArgsDef* args_def);
diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h
index 25e3439a640..b21c71f3fa1 100644
--- a/paddle/pten/core/kernel_factory.h
+++ b/paddle/pten/core/kernel_factory.h
@@ -49,8 +49,6 @@ using DataLayout = paddle::experimental::DataLayout;
 
 class KernelContext;
 
-using KernelFn = void (*)(KernelContext* ctx);
-
 class KernelKey {
  public:
   KernelKey() = default;
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index cd8ce07f800..4208132b980 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -1,6 +1,6 @@
 # for paddle test case
 if(WITH_TESTING)
-  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
+  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags)
 endif()
 
 cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags)
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index 03b9ad7fc2d..5e023e9248c 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -371,6 +371,17 @@ if load_noavx:
         raise e
 
 
+def set_paddle_custom_device_lib_path(lib_path):
+    if os.environ.get('CUSTOM_DEVICE_ROOT', None) is not None:
+        # use the environment value that is already set
+        return
+    if os.path.exists(lib_path):
+        # set the CUSTOM_DEVICE_ROOT default path
+        os.environ['CUSTOM_DEVICE_ROOT'] = os.path.normpath(lib_path)
+    else:
+        os.environ['CUSTOM_DEVICE_ROOT'] = ''
+
+
 # set paddle lib path
 def set_paddle_lib_path():
     site_dirs = site.getsitepackages() if hasattr(
@@ -380,11 +391,15 @@ def set_paddle_lib_path():
             lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
             if os.path.exists(lib_dir):
                 _set_paddle_lib_path(lib_dir)
+                set_paddle_custom_device_lib_path(
+                    os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
                 return
     if hasattr(site, 'USER_SITE'):
         lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
         if os.path.exists(lib_dir):
             _set_paddle_lib_path(lib_dir)
+            set_paddle_custom_device_lib_path(
+                os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
 
 set_paddle_lib_path()
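+
+# Hedged usage note (assumption, not part of this patch): the net effect is
+# that <site-packages>/paddle-plugins is probed by default, and a user can
+# still redirect plugin discovery before importing paddle:
+#   os.environ['CUSTOM_DEVICE_ROOT'] = '/path/to/plugin/dir'
+#   import paddle  # .so files found there are loaded and their kernels registered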
diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
index d73c4e3acb9..587d4aee34c 100644
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -9,5 +9,6 @@ endforeach()
 add_subdirectory(unittests)
 add_subdirectory(book)
 add_subdirectory(custom_op)
+add_subdirectory(custom_kernel)
 
 set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt b/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt
new file mode 100644
index 00000000000..b2bdfac9080
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt
@@ -0,0 +1,2 @@
+py_test(test_custom_kernel_dot SRCS test_custom_kernel_dot.py)
+py_test(test_custom_kernel_load SRCS test_custom_kernel_load.py)
diff --git a/python/paddle/fluid/tests/custom_kernel/__init__.py b/python/paddle/fluid/tests/custom_kernel/__init__.py
new file mode 100644
index 00000000000..97043fd7ba6
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc
new file mode 100644
index 00000000000..e61b7314ef6
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/extension.h"
+
+namespace paddle {
+
+namespace custom_kernel {
+
+// Here we use dot as the test case.
+// This test will fail once an int8 dot kernel is supported in the framework.
+template <typename T>
+void Dot(const paddle::CPUContext& dev_ctx,
+         const paddle::Tensor& x,
+         const paddle::Tensor& y,
+         paddle::Tensor* out) {
+  auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
+  auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
+  auto* z = out->mutable_data<T>(paddle::PlaceType::kCPU);
+
+  // Loop over the total N elements of both operands while sum-reducing every
+  // B pairs along the way where B is the dimension of the least ordered axis
+  auto shape = x.shape();
+  auto const N = x.numel();
+  auto const B = shape[shape.size() - 1];
+
+  for (int j = 0; j < N / B; j++) {
+    T ss = 0;
+    for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++);
+    z[j] = ss;
+  }
+}
+
+}  // namespace custom_kernel
+}  // namespace paddle
+
+PD_REGISTER_KERNEL(
+    dot, CPU, ALL_LAYOUT, INT8, paddle::custom_kernel::Dot<int8_t>) {
+  /* do some args define here
+   * the only param that can be used is OpKernelInfo* kernel */
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::INT8);
+}
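+
+// Worked example (comment only): for inputs of shape [2, 10], B = 10 and
+// N = 20, so the loop writes z[0] = sum of the first 10 products and z[1] =
+// sum of the next 10, i.e. the row-wise inner product that the Python test
+// checks against np.sum(x_data * y_data, axis=1).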
diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py
new file mode 100644
index 00000000000..5e3bd2f8ed9
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from paddle.fluid import core
+from distutils.sysconfig import get_python_lib
+from distutils.core import setup, Extension
+
+# cc flags
+paddle_extra_compile_args = ['-std=c++14', '-shared', '-fPIC']
+if core.is_compiled_with_npu():
+    paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
+
+# include path
+site_packages_path = get_python_lib()
+paddle_custom_kernel_include = [
+    os.path.join(site_packages_path, 'paddle', 'include'),
+]
+
+# libs path
+paddle_custom_kernel_library_dir = [
+    os.path.join(site_packages_path, 'paddle', 'fluid'),
+]
+
+# libs
+libs = [':core_avx.so']
+if not core.has_avx_core and core.has_noavx_core:
+    libs = [':core_noavx.so']
+
+custom_kernel_dot_module = Extension(
+    'custom_kernel_dot',
+    sources=['custom_kernel_dot.cc'],
+    include_dirs=paddle_custom_kernel_include,
+    library_dirs=paddle_custom_kernel_library_dir,
+    libraries=libs,
+    extra_compile_args=paddle_extra_compile_args)
+
+setup(
+    name='custom_kernel_dot',
+    version='1.0',
+    description='custom kernel for compiling',
+    ext_modules=[custom_kernel_dot_module])
diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py
new file mode 100644
index 00000000000..13d8a29e71b
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import site
+import unittest
+import numpy as np
+
+
+# use dot as the test case
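+# dot computes, for the 2-D inputs used here, the row-wise inner product
+#   out[i] = sum_j x[i, j] * y[i, j]
+# which the numpy reference below reproduces via np.sum(x_data * y_data, axis=1)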
+class TestCustomKernelDot(unittest.TestCase):
+    def setUp(self):
+        # compile the .so and place it under the current path
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+        # --inplace places the output .so file in the current dir
+        cmd = 'cd {} && {} custom_kernel_dot_setup.py build_ext --inplace'.format(
+            cur_dir, sys.executable)
+        os.system(cmd)
+
+        # set environment for loading and registering compiled custom kernels,
+        # only valid in the current process
+        os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir
+
+    def test_custom_kernel_dot_run(self):
+        # test dot run
+        x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
+        y_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
+        result = np.sum(x_data * y_data, axis=1).reshape([2, 1])
+
+        import paddle
+        paddle.set_device('cpu')
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        out = paddle.dot(x, y)
+
+        self.assertTrue(
+            np.array_equal(out.numpy(), result),
+            "custom kernel dot out: {},\n numpy dot out: {}".format(out.numpy(),
+                                                                    result))
+
+    def tearDown(self):
+        del os.environ['CUSTOM_DEVICE_ROOT']
+
+
+if __name__ == '__main__':
+    if os.name == 'nt' or sys.platform.startswith('darwin'):
+        # only Linux is supported for now
+        exit()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py
new file mode 100644
index 00000000000..1d7b29e8511
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
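+
+# Hedged path example (assumption, derived from the setUp code below): on a
+# typical install the default plugin dir resolves to
+#   <site-packages>/paddle/libs/../../paddle-plugins == <site-packages>/paddle-plugins
+# which matches the default probed by set_paddle_custom_device_lib_path()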
+
+import os
+import sys
+import site
+import unittest
+import numpy as np
+
+
+class TestCustomKernelLoad(unittest.TestCase):
+    def setUp(self):
+        # compile the .so and place it under the current path
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+        # --inplace places the output .so file in the current dir
+        cmd = 'cd {} && {} custom_kernel_dot_setup.py build_ext --inplace'.format(
+            cur_dir, sys.executable)
+        os.system(cmd)
+
+        # get the paddle lib path for placing the .so
+        paddle_lib_path = ''
+        site_dirs = site.getsitepackages() if hasattr(
+            site, 'getsitepackages'
+        ) else [x for x in sys.path if 'site-packages' in x]
+        for site_dir in site_dirs:
+            lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
+            if os.path.exists(lib_dir):
+                paddle_lib_path = lib_dir
+                break
+        if paddle_lib_path == '':
+            if hasattr(site, 'USER_SITE'):
+                lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
+                if os.path.exists(lib_dir):
+                    paddle_lib_path = lib_dir
+        self.default_path = os.path.sep.join(
+            [paddle_lib_path, '..', '..', 'paddle-plugins'])
+        # copy the .so to the default path
+        cmd = 'mkdir -p {} && cp ./*.so {}'.format(self.default_path,
+                                                   self.default_path)
+        os.system(cmd)  # os.system waits for the copy to finish
+
+    def test_custom_kernel_dot_load(self):
+        # test dot load
+        x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
+        y_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
+        result = np.sum(x_data * y_data, axis=1).reshape([2, 1])
+
+        import paddle
+        paddle.set_device('cpu')
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        out = paddle.dot(x, y)
+
+        self.assertTrue(
+            np.array_equal(out.numpy(), result),
+            "custom kernel dot out: {},\n numpy dot out: {}".format(out.numpy(),
+                                                                    result))
+
+    def tearDown(self):
+        cmd = 'rm -rf {}'.format(self.default_path)
+        os.system(cmd)
+
+
+if __name__ == '__main__':
+    if os.name == 'nt' or sys.platform.startswith('darwin'):
+        # only Linux is supported for now
+        exit()
+    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index e8cc2914521..d1c0157c2b3 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -573,7 +573,8 @@ headers = (
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/common')) +  # pten common headers
     # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform`
     # to `paddle/pten/api/ext`,
-    ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'])
+    ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] +
+    ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'])
 
 if '${WITH_MKLDNN}' == 'ON':
     headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include'))    # mkldnn
-- 
GitLab