未验证 提交 a8879215 编写于 作者: A Aganlengzi 提交者: GitHub

[PluggableDevice] Add custom kernel support based on pten kernel management (#38848)

* [Demo] custom kernel based on pten kernel

* merge and npu custom work well

* del comments

* delete other code

* fix CUDAContext

* fix not found small_vector.h

* support NPU

* fix NPUContext

* fix DeviceContext support

* add UT

* fix call

* add UT

* fix

* fix for comments and ut

* add MACRO control

* fix multi input output

* support env CUSTOM_DEVICE_ROOT

* deal with special cases

* fix for Windows

* try coverage with test_custom_kernel_dot.py

* fix test_custom_kernel_dot

* fix test_custom_kernel_dot

* fix merge

* fix merge

* fix CI

* update

* merge and fix

* remove WITH_CUSTOM_KERNEL

* fix merge

* merge and fix

* fix ut

* fix ut for mac

* add more UT

* add more UT

* fix
上级 7e6a2190
......@@ -437,11 +437,12 @@ message(STATUS "branch: ${PADDLE_BRANCH}")
configure_file(commit.h.in commit.h)
cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api)
cc_library(custom_kernel SRCS custom_kernel.cc DEPS
tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_kernel_info pten_api)
#cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
#cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator custom_kernel)
cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
......@@ -451,3 +452,4 @@ endif()
cc_test(scope_guard_test SRCS scope_guard_test.cc)
cc_test(pten_utils_test SRCS pten_utils_test.cc DEPS pten_utils)
cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined _WIN32 || defined __APPLE__
#else
#define _LINUX
#endif
#include "paddle/fluid/framework/custom_kernel.h"

#include <dirent.h>

#include <algorithm>
#include <memory>
#include <regex>

#include "paddle/fluid/framework/op_kernel_info_helper.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/kernel_context.h"
#include "paddle/pten/core/kernel_registry.h"
DECLARE_bool(run_pten_kernel);
namespace paddle {
namespace framework {
// set pten::Kernel args_def_ from op_kernel_info
// because we can not set directly to pten::Kernel without exposing
// pten::KernelArgsDef when parsing custom user function
static void ParseArgs(const OpKernelInfo& op_kernel_info,
                      pten::KernelArgsDef* args_def) {
  // Copy the argument definitions recorded in op_kernel_info into args_def:
  // every input, then every output, then every attribute, each in its
  // declared order.
  for (const auto& in_def : OpKernelInfoHelper::GetInputDefs(op_kernel_info)) {
    args_def->AppendInput(in_def.backend, in_def.layout, in_def.dtype);
  }
  for (const auto& out_def :
       OpKernelInfoHelper::GetOutputDefs(op_kernel_info)) {
    args_def->AppendOutput(out_def.backend, out_def.layout, out_def.dtype);
  }
  for (const auto& attr_def :
       OpKernelInfoHelper::GetAttributeDefs(op_kernel_info)) {
    args_def->AppendAttribute(attr_def.type_index);
  }
}
// custom pten kernel call function define
// Adapter that runs a user-defined custom kernel through the pten kernel
// calling convention.
//
// The tensors and attributes held by `ctx` are repackaged into the containers
// taken by the unified user kernel function (inputs, vector inputs,
// attributes, outputs, vector outputs), the user function is called, and the
// output tensors are then copied back into `ctx`.
//
// Params:
//   ctx:            kernel context holding the actual inputs, attributes and
//                   outputs of this call.
//   op_kernel_info: registration record of the custom kernel (argument
//                   definitions, backend and the user function).
//
// Errors:
//   - InvalidArgument (PADDLE_ENFORCE) when ctx holds fewer input/output
//     tensors than there are input/output definitions, or when the number of
//     attributes differs from the number of attribute definitions.
//   - Unimplemented (PADDLE_THROW) for an attribute type that is not handled
//     below.
//   - For any backend other than CPU an error is logged and the function
//     returns without calling the user function.
static void RunKernelFunc(pten::KernelContext* ctx,
                          const OpKernelInfo& op_kernel_info) {
  VLOG(3) << "[CUSTOM KERNEL] RunKernelFunc begin...";

  // input and output size is not params' num
  // but actual Tensors' size
  size_t input_size = ctx->InputsSize();
  size_t output_size = ctx->OutputsSize();
  size_t attr_size = ctx->AttrsSize();

  // parameters' num of unified user kernel function
  auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info);
  auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info);
  auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info);

  // One definition may cover several tensors (std::vector<Tensor>), hence
  // ">=" for tensors and "==" for attributes.
  PADDLE_ENFORCE_GE(input_size, input_defs.size(),
                    platform::errors::InvalidArgument(
                        "the size of ctx inputs size (%d) must be no less than "
                        "the size of kernel input_defs (%d).",
                        input_size, input_defs.size()));
  PADDLE_ENFORCE_GE(output_size, output_defs.size(),
                    platform::errors::InvalidArgument(
                        "the size of ctx outputs size (%d) must be no less "
                        "than the size of kernel output_defs (%d).",
                        output_size, output_defs.size()));
  PADDLE_ENFORCE_EQ(attr_size, attribute_defs.size(),
                    platform::errors::InvalidArgument(
                        "the size of ctx attribute size (%d) must be equal to "
                        "the size of kernel attribute_defs (%d).",
                        attr_size, attribute_defs.size()));

  VLOG(3) << "[CUSTOM KERNEL] Input num: " << input_defs.size()
          << "[tensor size:" << input_size << "]"
          << " Attribute num: " << attribute_defs.size()
          << " Output num: " << output_defs.size()
          << "[tensor size:" << output_size << "].";

  // Inputs mapping
  std::vector<paddle::experimental::Tensor> custom_ins;
  std::vector<std::vector<paddle::experimental::Tensor>> custom_vec_ins;
  for (size_t in_idx = 0; in_idx < input_defs.size(); ++in_idx) {
    VLOG(3) << "Mapping Input[" << in_idx << "]";
    const std::pair<int, int> range = ctx->InputRangeAt(in_idx);

    // is_vector tells if this Input is Tensor or std::vector<Tensor>
    if (!input_defs.at(in_idx).is_vector) {
      paddle::experimental::Tensor custom_t;
      auto& ctx_tensor = ctx->InputAt<pten::DenseTensor>(range.first);
      custom_t.set_impl(std::make_shared<pten::DenseTensor>(ctx_tensor));
      custom_ins.emplace_back(custom_t);
    } else {
      std::vector<paddle::experimental::Tensor> custom_vec_in;
      auto ctx_tensor_vec =
          ctx->MoveInputsBetween<pten::DenseTensor>(range.first, range.second);
      for (auto& ctx_tensor : ctx_tensor_vec) {
        paddle::experimental::Tensor custom_t;
        custom_t.set_impl(std::make_shared<pten::DenseTensor>(ctx_tensor));
        custom_vec_in.emplace_back(custom_t);
      }
      custom_vec_ins.emplace_back(custom_vec_in);
    }
    VLOG(3) << "Mapped Input[" << in_idx << "] with range[" << range.first
            << "," << range.second << ").";
  }

  // Attributes mapping: each attribute is read from ctx with the type
  // recorded in its definition and stored type-erased in a paddle::any.
  std::vector<paddle::any> custom_attrs;
  for (size_t attr_idx = 0; attr_idx < attribute_defs.size(); ++attr_idx) {
    VLOG(3) << "Mapping Attribute[" << attr_idx << "]";
    const auto& attr_type = attribute_defs[attr_idx].type_index;
    if (attr_type == std::type_index(typeid(bool))) {
      bool arg = ctx->AttrAt<bool>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else if (attr_type == std::type_index(typeid(int))) {
      int arg = ctx->AttrAt<int>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else if (attr_type == std::type_index(typeid(float))) {
      float arg = ctx->AttrAt<float>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else if (attr_type == std::type_index(typeid(double))) {
      double arg = ctx->AttrAt<double>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else if (attr_type == std::type_index(typeid(int64_t))) {
      int64_t arg = ctx->AttrAt<int64_t>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else if (attr_type == std::type_index(typeid(pten::dtype::float16))) {
      pten::dtype::float16 arg = ctx->AttrAt<pten::dtype::float16>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else if (attr_type == std::type_index(typeid(DataType))) {
      DataType arg = ctx->AttrAt<DataType>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else if (attr_type == std::type_index(typeid(const Scalar&))) {
      const Scalar& arg = ctx->AttrAt<const Scalar&>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else if (attr_type ==
               std::type_index(typeid(const std::vector<int64_t>&))) {
      const std::vector<int64_t>& arg =
          ctx->AttrAt<const std::vector<int64_t>&>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else if (attr_type == std::type_index(typeid(const ScalarArray&))) {
      const ScalarArray& arg = ctx->AttrAt<const ScalarArray&>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else if (attr_type == std::type_index(typeid(const std::vector<int>&))) {
      const std::vector<int>& arg =
          ctx->AttrAt<const std::vector<int>&>(attr_idx);
      custom_attrs.emplace_back(arg);
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Unsupported attribute attribute_defs[%d].type_index", attr_idx));
    }
    VLOG(3) << "Mapped Attribute[" << attr_idx << "]";
  }

  // Outputs mapping
  // The user function takes raw Tensor pointers. The wrapper tensors they
  // point to are owned by owned_out_tensors, so they are released on every
  // exit path (early return below, exception from the user function, normal
  // completion) instead of relying on a manual delete at the end.
  std::vector<std::unique_ptr<paddle::experimental::Tensor>> owned_out_tensors;
  std::vector<paddle::experimental::Tensor*> custom_outs;
  std::vector<std::vector<paddle::experimental::Tensor*>> custom_vec_outs;
  // Shared DenseTensor copies backing the wrappers; used to copy the results
  // back into ctx after the call.
  std::vector<std::shared_ptr<pten::DenseTensor>> custom_outs_ptr;
  std::vector<std::vector<std::shared_ptr<pten::DenseTensor>>>
      custom_vec_outs_ptr;

  for (size_t out_idx = 0; out_idx < output_defs.size(); ++out_idx) {
    VLOG(3) << "Mapping Output[" << out_idx << "]";
    const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);

    // is_vector tells if this Output is Tensor or std::vector<Tensor>
    if (!output_defs.at(out_idx).is_vector) {
      auto* ctx_tensor = ctx->MutableOutputAt<pten::DenseTensor>(range.first);
      owned_out_tensors.emplace_back(
          std::make_unique<paddle::experimental::Tensor>());
      auto* custom_t = owned_out_tensors.back().get();
      auto custom_t_ptr = std::make_shared<pten::DenseTensor>(*ctx_tensor);
      custom_t->set_impl(custom_t_ptr);
      custom_outs.emplace_back(custom_t);
      custom_outs_ptr.emplace_back(custom_t_ptr);
    } else {
      std::vector<paddle::experimental::Tensor*> custom_vec_out;
      std::vector<std::shared_ptr<pten::DenseTensor>> custom_vec_out_ptr;
      auto ctx_tensor_vec = ctx->MutableOutputBetween<pten::DenseTensor>(
          range.first, range.second);
      for (auto ctx_tensor : ctx_tensor_vec) {
        owned_out_tensors.emplace_back(
            std::make_unique<paddle::experimental::Tensor>());
        auto* custom_t = owned_out_tensors.back().get();
        auto custom_t_ptr = std::make_shared<pten::DenseTensor>(*ctx_tensor);
        custom_t->set_impl(custom_t_ptr);
        custom_vec_out.emplace_back(custom_t);
        custom_vec_out_ptr.emplace_back(custom_t_ptr);
      }
      custom_vec_outs.emplace_back(custom_vec_out);
      custom_vec_outs_ptr.emplace_back(custom_vec_out_ptr);
    }
    VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first
            << "," << range.second << ").";
  }

  // DeviceContext
  // In pten, the first parameter XXContext is decided when registering
  // through template param, but custom kernel function use unified
  // DeviceContext as first parameter of user_kernel_fn, we use backend
  // from OpKernelInfo to decide XXContext. In temporary simple
  // DeviceContext, we just set necessary info to dev_ctx(such as stream
  // in NPUContext), more related work should be done when
  // pten::DeviceContext is exposed to outer.
  DeviceContext dev_ctx;
  auto& backend = OpKernelInfoHelper::GetBackend(op_kernel_info);
  if (backend == pten::Backend::CPU) {
    // do nothing
  } else {
    LOG(ERROR) << "[CUSTOM KERNEL] Unsupported kernel backend: " << backend
               << " with compiled Paddle.";
    return;
  }

  auto& user_kernel_fn = OpKernelInfoHelper::GetKernelFn(op_kernel_info);
  // call user function
  user_kernel_fn(dev_ctx, custom_ins, custom_vec_ins, custom_attrs,
                 &custom_outs, &custom_vec_outs);

  VLOG(3) << "[CUSTOM KERNEL] finished call user kernel function.";

  // NOTE: Map back the output tensors with stored shared_ptrs.
  // The stored pointers were appended in definition order, so walking the
  // definitions backwards lets each one be taken from the back of its vector.
  for (int out_idx = output_defs.size() - 1; out_idx >= 0; --out_idx) {
    VLOG(3) << "Mapping Back Output[" << out_idx << "]";
    const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);

    // is_vector tells if this Output is Tensor or std::vector<Tensor>
    if (!output_defs.at(out_idx).is_vector) {
      auto* ctx_tensor = ctx->MutableOutputAt<pten::DenseTensor>(range.first);
      *ctx_tensor = *(custom_outs_ptr.back().get());
      custom_outs_ptr.pop_back();
    } else {
      auto ctx_tensor_vec = ctx->MutableOutputBetween<pten::DenseTensor>(
          range.first, range.second);
      auto custom_vec_ptr_out = custom_vec_outs_ptr.back();
      for (int idx = ctx_tensor_vec.size() - 1; idx >= 0; --idx) {
        *(ctx_tensor_vec[idx]) = *(custom_vec_ptr_out.back().get());
        custom_vec_ptr_out.pop_back();
      }
      custom_vec_outs_ptr.pop_back();
    }
    VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first
            << "," << range.second << "].";
  }
  // The wrapper tensors handed to the user function are destroyed here by
  // owned_out_tensors.
}
void RegisterKernelWithMetaInfo(
const std::vector<OpKernelInfo>& op_kernel_infos) {
PADDLE_ENFORCE_EQ(FLAGS_run_pten_kernel, true,
platform::errors::Unimplemented(
"Custom Kernel depends on pten kernel enabled,"));
for (size_t i = 0; i < op_kernel_infos.size(); ++i) {
auto& kernel_info = op_kernel_infos[i];
auto op_type = OpKernelInfoHelper::GetOpName(kernel_info);
auto kernel_key = OpKernelInfoHelper::GetKernelKey(kernel_info);
VLOG(3) << "[CUSTOM KERNEL] registering [" << op_type << "]" << kernel_key;
// 1.Check whether this kernel is valid for a specific operator
PADDLE_ENFORCE_EQ(
pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type), true,
platform::errors::InvalidArgument(
"[CUSTOM KERNEL] %s is not ready for custom kernel registering.",
op_type));
// 2.Check whether kernel_key has been already registed
PADDLE_ENFORCE_EQ(
pten::KernelFactory::Instance().kernels()[op_type].find(kernel_key),
pten::KernelFactory::Instance().kernels()[op_type].end(),
platform::errors::InvalidArgument(
"[CUSTOM KERNEL] The operator <%s>'s kernel: %s has been "
"already existed in Paddle, please contribute PR if need "
"to optimize the kernel code. Custom kernel do NOT support "
"to replace existing kernel in Paddle.",
op_type, kernel_key));
// pten::KernelFn
pten::KernelFn kernel_fn = [kernel_info](pten::KernelContext* ctx) {
VLOG(3) << "[CUSTOM KERNEL] run custom PTEN kernel func in lambda.";
RunKernelFunc(ctx, kernel_info);
};
// variadic_kernel_fn
void* variadic_kernel_fn =
OpKernelInfoHelper::GetVariadicKernelFn(kernel_info);
pten::Kernel kernel(kernel_fn, variadic_kernel_fn);
// args info
ParseArgs(kernel_info, kernel.mutable_args_def());
// register custom kernel to pten::KernelFactory
pten::KernelFactory::Instance().kernels()[op_type][kernel_key] = kernel;
VLOG(3) << "[CUSTOM KERNEL] Successed in registering operator <" << op_type
<< ">'s kernel " << kernel_key << " to Paddle. "
<< "It will be used like native ones.";
}
}
// Register all custom kernels stored in op_kernel_info_map, one operator
// entry at a time.
void RegisterKernelWithMetaInfoMap(
    const paddle::OpKernelInfoMap& op_kernel_info_map) {
  auto& infos_by_op = op_kernel_info_map.GetMap();
  VLOG(3) << "[CUSTOM KERNEL] size of op_kernel_info_map: "
          << infos_by_op.size();

  // Each entry maps an op name (first) to the kernel infos registered for
  // that op (second).
  for (auto& entry : infos_by_op) {
    VLOG(3) << "[CUSTOM KERNEL] pair first -> op name: " << entry.first;
    RegisterKernelWithMetaInfo(entry.second);
  }
}
// Open the shared library at dso_lib_path and register the custom kernels it
// exposes through its PD_GetOpKernelInfoMap symbol.
//
// Raises InvalidArgument when the library cannot be opened. A library that
// lacks the symbol is skipped with an INFO log. Off Linux this only logs that
// custom kernels are unsupported.
void LoadCustomKernelLib(const std::string& dso_lib_path) {
#ifdef _LINUX
  void* lib_handle = dlopen(dso_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);

  // MUST valid dso_lib_path
  PADDLE_ENFORCE_NOT_NULL(
      lib_handle,
      platform::errors::InvalidArgument(
          "Fail to open library: %s with error: %s", dso_lib_path, dlerror()));

  using GetOpKernelInfoMapFn = OpKernelInfoMap&();
  auto* get_info_map = reinterpret_cast<GetOpKernelInfoMapFn*>(
      dlsym(lib_handle, "PD_GetOpKernelInfoMap"));

  if (get_info_map == nullptr) {
    LOG(INFO) << "Skipped lib [" << dso_lib_path << "]: fail to find "
              << "PD_GetOpKernelInfoMap symbol in this lib.";
    return;
  }

  RegisterKernelWithMetaInfoMap(get_info_map());
  LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path;
#else
  VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux.";
#endif
}
// List all libs with given path
// List every entry of the directory libs_path whose file name ends in ".so".
//
// Params:
//   libs_path: directory to scan (not recursive).
// Returns:
//   The matching entries as "<libs_path>/<file name>", in the order readdir
//   reports them.
// Raises:
//   InvalidArgument when the directory cannot be opened.
std::vector<std::string> ListAllLib(const std::string& libs_path) {
  DIR* raw_dir = opendir(libs_path.c_str());

  // MUST valid libs_path
  PADDLE_ENFORCE_NOT_NULL(raw_dir, platform::errors::InvalidArgument(
                                       "Fail to open path: %s", libs_path));

  // Own the handle so closedir also runs if string/vector allocation or
  // logging below throws.
  std::unique_ptr<DIR, decltype(&closedir)> dir(raw_dir, &closedir);

  std::vector<std::string> libs;
  const std::regex express(".*\\.so");

  dirent* ptr = nullptr;
  while ((ptr = readdir(dir.get())) != nullptr) {
    const std::string filename(ptr->d_name);
    // regex_match requires the whole name to match, i.e. to end in ".so".
    if (std::regex_match(filename, express)) {
      libs.emplace_back(libs_path + '/' + filename);
      LOG(INFO) << "Found lib [" << filename << "]";
    } else {
      VLOG(3) << "Skipped file [" << filename << "] without .so postfix";
    }
  }
  return libs;
}
// Load custom kernels with given path
// Load custom kernels from every ".so" library found directly under
// libs_path.
void LoadCustomKernel(const std::string& libs_path) {
  VLOG(3) << "Try loading custom libs from: [" << libs_path << "]";

  // Each discovered library is loaded and registered on its own.
  for (auto& so_path : ListAllLib(libs_path)) {
    LoadCustomKernelLib(so_path);
  }

  LOG(INFO) << "Finished in LoadCustomKernel with libs_path: [" << libs_path
            << "]";
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/api/ext/op_kernel_info.h"
namespace paddle {
namespace framework {
// Load custom kernel libs from the given directory path: every ".so" file
// found under libs_path is passed to LoadCustomKernelLib.
void LoadCustomKernel(const std::string& libs_path);
// Load a single custom kernel library from dso_lib_path and register the
// kernels it exports through its PD_GetOpKernelInfoMap symbol.
void LoadCustomKernelLib(const std::string& dso_lib_path);
// Load custom kernel api: register kernel after user compiled
// NOTE(review): no definition of this function is visible next to the other
// loaders in custom_kernel.cc — confirm it is implemented before calling it.
void LoadOpKernelInfoAndRegister(const std::string& dso_name);
// Register custom kernel api: register kernel directly.
// Registers every kernel stored in op_kernel_info_map.
void RegisterKernelWithMetaInfoMap(
const paddle::OpKernelInfoMap& op_kernel_info_map);
// Interface for selectively registering custom kernels: registers only the
// kernels described by op_kernel_infos.
void RegisterKernelWithMetaInfo(
const std::vector<OpKernelInfo>& op_kernel_infos);
} // namespace framework
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined _WIN32 || defined __APPLE__
#else
#define _LINUX
#endif
#include "paddle/fluid/framework/custom_kernel.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/extension.h"
#include "paddle/fluid/framework/op_kernel_info_helper.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_context.h"
#include "paddle/pten/core/kernel_factory.h"
#include "paddle/pten/infermeta/binary.h"
#include "paddle/utils/small_vector.h"
#ifdef _LINUX
// user kernel function
namespace custom_kernel {
// Here we use dot <CPU, ANY, UINT8> for test
// This test will fail once this kernel is supported natively in the framework
// input 3: two Tensors and one std::vector<Tensor>
// attribute 11: fake_attributes
// output 2: one Tensor* and one std::vector<Tensor*>
// Test-only "dot" kernel with a deliberately wide signature (two tensors, one
// tensor vector, eleven attributes, one output and one output vector) so that
// every argument kind is exercised. It prints and asserts the fake arguments,
// then writes into `out` the dot product of each row of x and y, where a row
// is a slice along x's last dimension.
template <typename T>
void FakeDot(const paddle::CPUContext& dev_ctx, const paddle::Tensor& x,
             const paddle::Tensor& y,
             const std::vector<paddle::Tensor>& fake_input_vec,
             bool fake_attr_bool, int fake_attr_int, float fake_attr_float,
             double fake_attr_double, int64_t fake_attr_int64,
             pten::dtype::float16 fake_attr_f16, pten::DataType fake_attr_dtype,
             const pten::Scalar& fake_attr_scalar,
             const pten::ScalarArray& fake_attr_scalar_array,
             const std::vector<int64_t>& fake_attr_int64_vec,
             const std::vector<int>& fake_attr_int_vec, paddle::Tensor* out,
             std::vector<paddle::Tensor*> fake_out_vec) {
  // Dump what was received so a failing run shows the actual values.
  std::cout << "fake_input_vec.size: " << fake_input_vec.size() << std::endl;
  std::cout << "fake_attr_bool: " << fake_attr_bool << std::endl;
  std::cout << "fake_attr_int: " << fake_attr_int << std::endl;
  std::cout << "fake_attr_float: " << fake_attr_float << std::endl;
  std::cout << "fake_attr_double: " << fake_attr_double << std::endl;
  std::cout << "fake_attr_int64: " << fake_attr_int64 << std::endl;
  std::cout << "fake_attr_f16: " << fake_attr_f16 << std::endl;
  std::cout << "fake_attr_dtype: " << fake_attr_dtype << std::endl;
  std::cout << "fake_attr_int64_vec: " << fake_attr_int64_vec.size()
            << std::endl;
  std::cout << "fake_attr_int_vec: " << fake_attr_int_vec.size() << std::endl;
  std::cout << "fake_out_vec: " << fake_out_vec.size() << std::endl;

  // The values below are the ones the accompanying test places in the
  // kernel context.
  assert(fake_input_vec.size() == 2);
  assert(fake_attr_bool == false);
  assert(fake_attr_int == 1);
  assert(fake_attr_float == 2);
  assert(fake_attr_double == 3);
  assert(fake_attr_int64 == 4);
  assert(fake_attr_f16 == 5);
  assert(fake_attr_dtype == pten::DataType::UINT32);
  assert(fake_attr_int64_vec.size() == 0);
  assert(fake_attr_int_vec.size() == 0);
  assert(fake_out_vec.size() == 2);

  // Row-wise dot product over the last dimension of x.
  const auto* lhs = x.data<T>();
  const auto* rhs = y.data<T>();
  auto* result = out->mutable_data<T>(paddle::PlaceType::kCPU);
  const auto dims = x.shape();
  const auto total = x.numel();
  const auto row_len = dims[dims.size() - 1];
  const auto row_count = total / row_len;
  for (int row = 0; row < row_count; ++row) {
    const auto* lhs_row = lhs + row * row_len;
    const auto* rhs_row = rhs + row * row_len;
    T acc = 0;
    for (int col = 0; col < row_len; ++col) {
      acc += lhs_row[col] * rhs_row[col];
    }
    result[row] = acc;
  }
}
} // namespace custom_kernel
// Register custom_kernel::FakeDot<uint8_t> as the "dot" kernel for
// <CPU, ALL_LAYOUT, UINT8>.
PD_REGISTER_KERNEL(dot, CPU, ALL_LAYOUT, UINT8,
custom_kernel::FakeDot<uint8_t>) {
/* Optional argument definitions go here.
* The only parameter available in this body is `OpKernelInfo* kernel`.
* Here output 0 is declared as UINT8. */
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UINT8);
}
// Upper code will store dot kernels info into OpKernelInfoMap
// End-to-end check of the custom kernel path: the FakeDot kernel registered
// above must be recorded in OpKernelInfoMap, be registrable into
// pten::KernelFactory, be selectable by its kernel key, and produce the
// expected row-wise dot product when called through a pten::KernelContext.
TEST(CustomKernel, custom_kernel_dot) {
std::string op_name = "dot";
pten::Backend backend = pten::Backend::CPU;
pten::DataLayout layout = pten::DataLayout::ANY;
pten::DataType dtype = pten::DataType::UINT8;
// 1.custom kernel info parsed and store
EXPECT_TRUE(paddle::OpKernelInfoMap::Instance().GetMap().find("dot") !=
paddle::OpKernelInfoMap::Instance().GetMap().end());
// 2.info check
EXPECT_EQ(
1, static_cast<int>(paddle::OpKernelInfoMap::Instance()["dot"].size()));
EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetBackend() ==
backend);
EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetDataLayout() ==
layout);
EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetDataType() ==
dtype);
// 3.register
// "dot" must already be known to the factory, but not under this kernel key
// until RegisterKernelWithMetaInfoMap has run.
EXPECT_TRUE(pten::KernelFactory::Instance().kernels().end() !=
pten::KernelFactory::Instance().kernels().find("dot"));
pten::KernelKey kernel_key(backend, layout, dtype);
EXPECT_TRUE(
pten::KernelFactory::Instance().kernels()["dot"].find(kernel_key) ==
pten::KernelFactory::Instance().kernels()["dot"].end());
paddle::framework::RegisterKernelWithMetaInfoMap(
paddle::OpKernelInfoMap::Instance());
EXPECT_TRUE(
pten::KernelFactory::Instance().kernels()["dot"].find(kernel_key) !=
pten::KernelFactory::Instance().kernels()["dot"].end());
// 4.kernel select
auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
op_name, kernel_key);
// 5.prepare parameters for kernel
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(), pten::DenseTensorMeta(pten::DataType::UINT8,
paddle::framework::make_ddim({2, 3}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<uint8_t>(paddle::platform::CPUPlace());
auto dense_y = std::make_shared<pten::DenseTensor>(
alloc.get(), pten::DenseTensorMeta(pten::DataType::UINT8,
paddle::framework::make_ddim({2, 3}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y->mutable_data<uint8_t>(paddle::platform::CPUPlace());
// dot x,y and result
// x and y both hold 0..5 laid out as 2 rows of 3; sum[i] is the expected
// dot product of row i (5 and 50, both within uint8_t range).
uint8_t sum[2] = {0, 0};
for (size_t i = 0; i < 2; ++i) {
for (size_t j = 0; j < 3; ++j) {
dense_x_data[i * 3 + j] = (i * 3 + j);
dense_y_data[i * 3 + j] = (i * 3 + j);
sum[i] += (i * 3 + j) * (i * 3 + j);
}
}
// 6.prepare kernel_context
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
auto kernel_context = pten::KernelContext(dev_ctx);
kernel_context.EmplaceBackInput(dense_x.get()); // idx:0, index:[0,1)
kernel_context.EmplaceBackInput(dense_y.get()); // idx:1, index:[1,2)
// fake_input_vec: idx:2, index:[2,4)
// The two tensors of the vector input are appended without a range, then a
// single range covering both is assigned to input idx 2.
size_t fake_input_vec_idx = 2;
size_t fake_input_vec_index_start = 2;
size_t fake_input_vec_index_end = 4;
kernel_context.EmplaceBackInputWithoutSetRange(dense_x.get());
kernel_context.EmplaceBackInputWithoutSetRange(dense_y.get());
kernel_context.AssignInputRange(
std::make_pair(fake_input_vec_index_start, fake_input_vec_index_end),
fake_input_vec_idx);
// Attribute values; FakeDot asserts exactly these.
bool fake_attr_bool = false;
int fake_attr_int = 1;
float fake_attr_float = 2.0;
double fake_attr_double = 3.0;
int64_t fake_attr_int64 = 4;
pten::dtype::float16 fake_attr_f16 = pten::dtype::float16(5);
pten::DataType fake_attr_dtype = pten::DataType::UINT32;
paddle::framework::LoDTensor tmp_tensor;
tmp_tensor.mutable_data<uint8_t>({1}, pten::TransToFluidPlace(backend));
pten::Scalar fake_attr_scalar =
paddle::experimental::MakePtenScalar(tmp_tensor);
pten::ScalarArray fake_attr_scalar_array;
std::vector<int64_t> fake_attr_int64_vec;
std::vector<int> fake_attr_int_vec;
// Attributes are appended in the same order as FakeDot's attribute
// parameters.
kernel_context.EmplaceBackAttr(fake_attr_bool);
kernel_context.EmplaceBackAttr(fake_attr_int);
kernel_context.EmplaceBackAttr(fake_attr_float);
kernel_context.EmplaceBackAttr(fake_attr_double);
kernel_context.EmplaceBackAttr(fake_attr_int64);
kernel_context.EmplaceBackAttr(fake_attr_f16);
kernel_context.EmplaceBackAttr(fake_attr_dtype);
kernel_context.EmplaceBackAttr(fake_attr_scalar);
kernel_context.EmplaceBackAttr(fake_attr_scalar_array);
kernel_context.EmplaceBackAttr(fake_attr_int64_vec);
kernel_context.EmplaceBackAttr(fake_attr_int_vec);
auto out_meta = pten::DotInferMeta(dense_x->meta(), dense_y->meta());
auto dense_out = std::make_shared<pten::DenseTensor>(
pten::make_intrusive<paddle::experimental::SharedStorage>(
pten::TransToFluidPlace(backend)),
std::move(out_meta));
kernel_context.EmplaceBackOutput(dense_out.get()); // idx:0 index:[0,1)
// fake_out_vec: idx:1, index:[1,3)
// The same dense_out tensor is reused for both slots of the output vector.
size_t fake_out_vec_idx = 1;
size_t fake_out_vec_index_start = 1;
size_t fake_out_vec_index_end = 3;
kernel_context.EmplaceBackOutputWithoutSetRange(dense_out.get());
kernel_context.EmplaceBackOutputWithoutSetRange(dense_out.get());
kernel_context.AssignOutputRange(
std::make_pair(fake_out_vec_index_start, fake_out_vec_index_end),
fake_out_vec_idx);
// 7.kernel call
kernel(&kernel_context);
// 8.check result
ASSERT_EQ(dense_out->dims().size(), 2);
ASSERT_EQ(dense_out->dims()[0], 2);
ASSERT_EQ(dense_out->numel(), 2);
ASSERT_EQ(dense_out->dtype(), pten::DataType::UINT8);
ASSERT_EQ(dense_out->layout(), pten::DataLayout::NCHW);
ASSERT_EQ(dense_out->initialized(), true);
auto expect_result = sum;
auto actual_result0 = dense_out->data<uint8_t>()[0];
auto actual_result1 = dense_out->data<uint8_t>()[1];
ASSERT_EQ(expect_result[0], actual_result0);
ASSERT_EQ(expect_result[1], actual_result1);
}
// test OpKernelInfoHelper
// Every OpKernelInfoHelper getter must return what was recorded for the
// FakeDot<uint8_t> "dot" kernel registered in this file.
TEST(OpKernelInfoHelper, op_kernel_info_help_getters) {
  using Helper = paddle::framework::OpKernelInfoHelper;

  // Expected registration values.
  std::string expected_name = "dot";
  pten::Backend expected_backend = pten::Backend::CPU;
  pten::DataLayout expected_layout = pten::DataLayout::ANY;
  pten::DataType expected_dtype = pten::DataType::UINT8;

  auto info = paddle::OpKernelInfoMap::Instance()[expected_name][0];

  // Scalar fields and the derived kernel key.
  EXPECT_EQ(expected_name, Helper::GetOpName(info));
  EXPECT_EQ(expected_backend, Helper::GetBackend(info));
  EXPECT_EQ(expected_layout, Helper::GetDataLayout(info));
  EXPECT_EQ(expected_dtype, Helper::GetDataType(info));
  EXPECT_EQ(
      pten::KernelKey(expected_backend, expected_layout, expected_dtype),
      Helper::GetKernelKey(info));

  // Function pointers produced by the registration macros.
  paddle::CustomKernelFunc expected_fn =
      PD_PT_KERNEL(custom_kernel::FakeDot<uint8_t>);
  EXPECT_EQ(expected_fn, Helper::GetKernelFn(info));

  void* expected_variadic_fn =
      PD_PT_VARIADIC_KERNEL(custom_kernel::FakeDot<uint8_t>);
  EXPECT_EQ(expected_variadic_fn, Helper::GetVariadicKernelFn(info));

  // Argument definition counts: 3 inputs, 2 outputs, 11 attributes.
  auto& in_defs = Helper::GetInputDefs(info);
  auto& out_defs = Helper::GetOutputDefs(info);
  auto& attr_defs = Helper::GetAttributeDefs(info);
  EXPECT_EQ(3, static_cast<int>(in_defs.size()));
  EXPECT_EQ(2, static_cast<int>(out_defs.size()));
  EXPECT_EQ(11, static_cast<int>(attr_defs.size()));
}
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/pten/core/kernel_factory.h"
namespace paddle {
namespace framework {
// Read-only accessor for the private state of paddle::OpKernelInfo, which
// declares this class as a friend. Every method is static and only reads
// members of the `info` object it is given.
class OpKernelInfoHelper {
 public:
  // Op name the kernel was registered under.
  static const std::string& GetOpName(const paddle::OpKernelInfo& info) {
    return info.op_name_;
  }

  // Backend part of the kernel key.
  static const pten::Backend& GetBackend(const paddle::OpKernelInfo& info) {
    return info.backend_;
  }

  // Data layout part of the kernel key.
  static const pten::DataLayout& GetDataLayout(
      const paddle::OpKernelInfo& info) {
    return info.layout_;
  }

  // Data type part of the kernel key.
  static const pten::DataType& GetDataType(const paddle::OpKernelInfo& info) {
    return info.dtype_;
  }

  // Builds a pten::KernelKey from the stored backend, layout and data type.
  static pten::KernelKey GetKernelKey(const paddle::OpKernelInfo& info) {
    return pten::KernelKey(
        GetBackend(info), GetDataLayout(info), GetDataType(info));
  }

  // Unified-signature kernel function stored in `info`.
  static const CustomKernelFunc& GetKernelFn(const paddle::OpKernelInfo& info) {
    return info.kernel_fn_;
  }

  // Type-erased pointer to the variadic kernel function stored in `info`.
  static void* GetVariadicKernelFn(const paddle::OpKernelInfo& info) {
    return info.variadic_kernel_fn_;
  }

  // Definitions of the kernel's tensor inputs.
  static const paddle::SmallVector<TensorArgDef>& GetInputDefs(
      const paddle::OpKernelInfo& info) {
    return info.input_defs_;
  }

  // Definitions of the kernel's tensor outputs.
  static const paddle::SmallVector<TensorArgDef>& GetOutputDefs(
      const paddle::OpKernelInfo& info) {
    return info.output_defs_;
  }

  // Definitions of the kernel's attributes.
  static const paddle::SmallVector<AttributeArgDef>& GetAttributeDefs(
      const paddle::OpKernelInfo& info) {
    return info.attribute_defs_;
  }
};
} // namespace framework
} // namespace paddle
......@@ -30,14 +30,15 @@ cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg}
cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
set(paddle_inference_api_deps lod_tensor scope reset_tensor_array
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator custom_kernel)
if(WITH_CRYPTO)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
else()
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
list(APPEND paddle_inference_api_deps paddle_crypto)
endif()
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS ${paddle_inference_api_deps})
if(WIN32)
target_link_libraries(paddle_inference_api gflags)
endif()
......
......@@ -116,9 +116,12 @@ endif()
cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
# separate init from device_context to avoid cycle dependencies
cc_library(init SRCS init.cc DEPS device_context custom_kernel)
# memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context)
if(WITH_XPU)
......
......@@ -53,6 +53,8 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#include "paddle/fluid/framework/custom_kernel.h"
DECLARE_int32(paddle_num_threads);
PADDLE_DEFINE_EXPORTED_int32(
multiple_of_cupti_buffer_size, 1,
......@@ -224,6 +226,18 @@ void InitDevices(const std::vector<int> devices) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
places.emplace_back(platform::CUDAPinnedPlace());
#endif
const char *custom_kernel_root_p = std::getenv("CUSTOM_DEVICE_ROOT");
if (!custom_kernel_root_p) {
VLOG(3) << "Env [CUSTOM_DEVICE_ROOT] is not set.";
} else {
std::string custom_kernel_root(custom_kernel_root_p);
if (!custom_kernel_root.empty()) {
LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root;
framework::LoadCustomKernel(custom_kernel_root);
} else {
VLOG(3) << "ENV [CUSTOM_DEVICE_ROOT] is empty.";
}
}
platform::DeviceContextPool::Init(places);
#ifndef PADDLE_WITH_MKLDNN
......
set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool
analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator
......
......@@ -40,6 +40,7 @@ limitations under the License. */
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/api/ext/dll_decl.h"
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/pten/api/ext/op_meta_info.h"
#include "paddle/pten/api/ext/place.h"
#include "paddle/pten/api/ext/tensor_compat.h"
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <string>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include <vector>
#include "paddle/pten/api/ext/dll_decl.h"
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/api/ext/op_meta_info.h"
#include "paddle/pten/api/include/tensor.h"
#include "paddle/pten/common/scalar.h"
#include "paddle/pten/common/scalar_array.h"
#include "paddle/utils/any.h"
#include "paddle/utils/small_vector.h"
/**
* Custom Kernel Info Define.
*
* Used to maintain custom kernel core information before registering.
* Pten is working on exposing headers, custom kernel depends on them, and
* we prefer outer users following pten-kernel-function-style and registering
* macro. So, we have to re-implement some structs or class and functions to
* make sure users' custom kernel functions can be registered to pten.
*
* TODO(Aganlengzi): We should upgrade following pten.
*/
namespace paddle {
namespace framework {
class PADDLE_API OpKernelInfoHelper;
} // namespace framework
// TODO(Aganlengzi): Simple DeviceContext temporarily for stream getting
// before pten::DeviceContext is exposed.
// Minimal device context that only carries an opaque stream handle.
class DeviceContext {
 public:
  // A fresh context has no stream attached.
  DeviceContext() : stream_(nullptr) {}

  // Stores the stream handle exactly as given.
  void set_stream(void* stream) { stream_ = stream; }

  // Returns the handle last passed to set_stream(), or nullptr if none was.
  void* stream() const { return stream_; }

 private:
  void* stream_;
};
// CPU flavour of the temporary DeviceContext; it adds no members of its own.
class CPUContext : public DeviceContext {};
// TODO(Aganlengzi): Use paddle::Tensor before DenseTensor is exposed
using Tensor = paddle::experimental::Tensor;
using Scalar = pten::Scalar;
using ScalarArray = pten::ScalarArray;
// Records the core information of a custom kernel.
// pten::KernelFn cannot be used directly, so every user kernel function is
// adapted to the `CustomKernelFunc' signature; note that its first parameter
// is always `const DeviceContext&'.
using CustomKernelFunc =
void (*)(const DeviceContext& dev_ctx,
const std::vector<Tensor>& inputs,
const std::vector<std::vector<Tensor>>& vec_inputs,
const std::vector<paddle::any>& attrs,
std::vector<Tensor*>* outputs,
std::vector<std::vector<Tensor*>>* vec_outputs);
////////////////////// Kernel Function (PD_PT_KERNEL) ////////////////////////
// Defines the CustomComputeCallHelper specialization that consumes a
// `const device_ctx&` kernel parameter. It static_casts the generic
// DeviceContext to `device_ctx`, appends the result to the arguments collected
// so far (pargs) and recurses on the remaining parameter types with
// dev_ctx_idx + 1. The static_asserts reject kernels whose context parameter
// comes after any input, attribute or output.
#define PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(device_ctx) \
template <typename... Tail> \
struct CustomComputeCallHelper<const device_ctx&, Tail...> { \
template <int dev_ctx_idx, \
int in_idx, \
int vec_in_idx, \
int attr_idx, \
int out_idx, \
int vec_out_idx, \
typename... PreviousArgs> \
static void Compute(const DeviceContext& dev_ctx, \
const std::vector<Tensor>& inputs, \
const std::vector<std::vector<Tensor>>& vec_inputs, \
const std::vector<paddle::any>& attrs, \
std::vector<Tensor*>* outputs, \
std::vector<std::vector<Tensor*>>* vec_outputs, \
PreviousArgs... pargs) { \
static_assert(in_idx == 0, \
"Kernel's DeviceContext should appear before Inputs."); \
static_assert(vec_in_idx == 0, \
"Kernel's DeviceContext should appear before Inputs."); \
static_assert( \
attr_idx == 0, \
"Kernel's DeviceContext should appear before Attributes."); \
static_assert(out_idx == 0, \
"Kernel's DeviceContext should appear before Outputs."); \
static_assert(vec_out_idx == 0, \
"Kernel's DeviceContext should appear before Outputs."); \
const device_ctx& arg = static_cast<const device_ctx&>(dev_ctx); \
CustomComputeCallHelper<Tail...>::template Compute<dev_ctx_idx + 1, \
in_idx, \
vec_in_idx, \
attr_idx, \
out_idx, \
vec_out_idx>( \
dev_ctx, \
inputs, \
vec_inputs, \
attrs, \
outputs, \
vec_outputs, \
pargs..., \
arg); \
} \
}
// Defines the CustomComputeCallHelper specialization that consumes a
// `const tensor_type&` kernel parameter. It binds inputs[in_idx], appends it to
// the arguments collected so far and recurses with in_idx + 1. The
// static_asserts require single inputs to precede attributes and outputs.
#define PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(tensor_type) \
template <typename... Tail> \
struct CustomComputeCallHelper<const tensor_type&, Tail...> { \
template <int dev_ctx_idx, \
int in_idx, \
int vec_in_idx, \
int attr_idx, \
int out_idx, \
int vec_out_idx, \
typename... PreviousArgs> \
static void Compute(const DeviceContext& dev_ctx, \
const std::vector<Tensor>& inputs, \
const std::vector<std::vector<Tensor>>& vec_inputs, \
const std::vector<paddle::any>& attrs, \
std::vector<Tensor*>* outputs, \
std::vector<std::vector<Tensor*>>* vec_outputs, \
PreviousArgs... pargs) { \
static_assert(attr_idx == 0, \
"Kernel's Input should appear before Attributes."); \
static_assert(out_idx == 0, \
"Kernel's Input should appear before Outputs."); \
static_assert(vec_out_idx == 0, \
"Kernel's Input should appear before Outputs."); \
const Tensor& arg = inputs[in_idx]; \
CustomComputeCallHelper<Tail...>::template Compute<dev_ctx_idx, \
in_idx + 1, \
vec_in_idx, \
attr_idx, \
out_idx, \
vec_out_idx>( \
dev_ctx, \
inputs, \
vec_inputs, \
attrs, \
outputs, \
vec_outputs, \
pargs..., \
arg); \
} \
}
// Defines the CustomComputeCallHelper specialization that consumes a
// `const std::vector<tensor_type>&` kernel parameter. It binds
// vec_inputs[vec_in_idx], appends it to the arguments collected so far and
// recurses with vec_in_idx + 1. The static_asserts require vector inputs to
// precede attributes and outputs.
#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \
template <typename... Tail> \
struct CustomComputeCallHelper<const std::vector<tensor_type>&, Tail...> { \
template <int dev_ctx_idx, \
int in_idx, \
int vec_in_idx, \
int attr_idx, \
int out_idx, \
int vec_out_idx, \
typename... PreviousArgs> \
static void Compute(const DeviceContext& dev_ctx, \
const std::vector<Tensor>& inputs, \
const std::vector<std::vector<Tensor>>& vec_inputs, \
const std::vector<paddle::any>& attrs, \
std::vector<Tensor*>* outputs, \
std::vector<std::vector<Tensor*>>* vec_outputs, \
PreviousArgs... pargs) { \
static_assert(attr_idx == 0, \
"Kernel's Input should appear before Attributes."); \
static_assert(out_idx == 0, \
"Kernel's Input should appear before Outputs."); \
static_assert(vec_out_idx == 0, \
"Kernel's Input should appear before Outputs."); \
const std::vector<Tensor>& arg = vec_inputs[vec_in_idx]; \
CustomComputeCallHelper<Tail...>::template Compute<dev_ctx_idx, \
in_idx, \
vec_in_idx + 1, \
attr_idx, \
out_idx, \
vec_out_idx>( \
dev_ctx, \
inputs, \
vec_inputs, \
attrs, \
outputs, \
vec_outputs, \
pargs..., \
arg); \
} \
}
// Defines the CustomComputeCallHelper specialization that consumes an
// attribute parameter of type `attr_type`. It any_casts attrs[attr_idx] to
// `attr_type`, appends the value to the arguments collected so far and
// recurses with attr_idx + 1. The static_asserts require attributes to precede
// outputs. A failed cast is reported through PD_THROW; the message names the
// custom *kernel* path (it previously said "custom operator", which pointed
// users at the wrong registration mechanism).
#define PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \
  template <typename... Tail> \
  struct CustomComputeCallHelper<attr_type, Tail...> { \
    template <int dev_ctx_idx, \
              int in_idx, \
              int vec_in_idx, \
              int attr_idx, \
              int out_idx, \
              int vec_out_idx, \
              typename... PreviousArgs> \
    static void Compute(const DeviceContext& dev_ctx, \
                        const std::vector<Tensor>& inputs, \
                        const std::vector<std::vector<Tensor>>& vec_inputs, \
                        const std::vector<paddle::any>& attrs, \
                        std::vector<Tensor*>* outputs, \
                        std::vector<std::vector<Tensor*>>* vec_outputs, \
                        PreviousArgs... pargs) { \
      static_assert(out_idx == 0, \
                    "Kernel's Attributes should appear before Outputs."); \
      static_assert(vec_out_idx == 0, \
                    "Kernel's Attributes should appear before Outputs."); \
      try { \
        attr_type arg = paddle::any_cast<attr_type>(attrs[attr_idx]); \
        return CustomComputeCallHelper<Tail...>::template Compute< \
            dev_ctx_idx, \
            in_idx, \
            vec_in_idx, \
            attr_idx + 1, \
            out_idx, \
            vec_out_idx>(dev_ctx, \
                         inputs, \
                         vec_inputs, \
                         attrs, \
                         outputs, \
                         vec_outputs, \
                         pargs..., \
                         arg); \
      } catch (paddle::bad_any_cast&) { \
        PD_THROW( \
            "Attribute cast error in custom kernel. Expected " #attr_type \
            " value."); \
      } \
    } \
  }
// Defines the CustomComputeCallHelper specialization that consumes a
// `tensor_type*` kernel parameter. It takes the pointer stored at
// (*outputs)[out_idx], appends it to the arguments collected so far and
// recurses with out_idx + 1.
#define PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type) \
template <typename... Tail> \
struct CustomComputeCallHelper<tensor_type*, Tail...> { \
template <int dev_ctx_idx, \
int in_idx, \
int vec_in_idx, \
int attr_idx, \
int out_idx, \
int vec_out_idx, \
typename... PreviousArgs> \
static void Compute(const DeviceContext& dev_ctx, \
const std::vector<Tensor>& inputs, \
const std::vector<std::vector<Tensor>>& vec_inputs, \
const std::vector<paddle::any>& attrs, \
std::vector<Tensor*>* outputs, \
std::vector<std::vector<Tensor*>>* vec_outputs, \
PreviousArgs... pargs) { \
tensor_type* arg = (*outputs)[out_idx]; \
CustomComputeCallHelper<Tail...>::template Compute<dev_ctx_idx, \
in_idx, \
vec_in_idx, \
attr_idx, \
out_idx + 1, \
vec_out_idx>( \
dev_ctx, \
inputs, \
vec_inputs, \
attrs, \
outputs, \
vec_outputs, \
pargs..., \
arg); \
} \
}
// Defines the CustomComputeCallHelper specialization that consumes a
// `std::vector<tensor_type*>` kernel parameter (passed by value). It copies
// (*vec_outputs)[vec_out_idx], appends the copy to the arguments collected so
// far and recurses with vec_out_idx + 1.
#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \
template <typename... Tail> \
struct CustomComputeCallHelper<std::vector<tensor_type*>, Tail...> { \
template <int dev_ctx_idx, \
int in_idx, \
int vec_in_idx, \
int attr_idx, \
int out_idx, \
int vec_out_idx, \
typename... PreviousArgs> \
static void Compute(const DeviceContext& dev_ctx, \
const std::vector<Tensor>& inputs, \
const std::vector<std::vector<Tensor>>& vec_inputs, \
const std::vector<paddle::any>& attrs, \
std::vector<Tensor*>* outputs, \
std::vector<std::vector<Tensor*>>* vec_outputs, \
PreviousArgs... pargs) { \
std::vector<tensor_type*> arg = (*vec_outputs)[vec_out_idx]; \
CustomComputeCallHelper<Tail...>::template Compute<dev_ctx_idx, \
in_idx, \
vec_in_idx, \
attr_idx, \
out_idx, \
vec_out_idx + 1>( \
dev_ctx, \
inputs, \
vec_inputs, \
attrs, \
outputs, \
vec_outputs, \
pargs..., \
arg); \
} \
}
// Empty tag type. CustomKernelFuncImpl::Compute appends PtenTypeTag<int> to
// the kernel's parameter list so that the recursion over parameter types ends
// at the CustomComputeCallHelper<PtenTypeTag<T>> specialization.
template <typename T>
struct PtenTypeTag {};
// Primary template, declared only. The usable definition is the partial
// specialization for function pointers of the form Return (*)(DevCtx, Args...).
template <typename F, F f>
struct CustomKernelFuncImpl;
// Adapter generated per user kernel function `impl_fn`, whose first parameter
// is the device context type DevCtx and whose remaining parameters are Args.
// It exposes two entry points: Compute (unified CustomKernelFunc signature)
// and VariadicCompute (typed arguments passed straight through).
template <typename Return,
typename DevCtx,
typename... Args,
Return (*impl_fn)(DevCtx, Args...)>
struct CustomKernelFuncImpl<Return (*)(DevCtx, Args...), impl_fn> {
// Unified entry point. Starts the CustomComputeCallHelper recursion with all
// indices at 0; each helper specialization pulls one typed argument out of
// the generic containers, and the terminating specialization calls impl_fn.
// PtenTypeTag<int> is appended to mark the end of the parameter list.
static void Compute(const DeviceContext& dev_ctx,
const std::vector<Tensor>& inputs,
const std::vector<std::vector<Tensor>>& vec_inputs,
const std::vector<paddle::any>& attrs,
std::vector<Tensor*>* outputs,
std::vector<std::vector<Tensor*>>* vec_outputs) {
CustomComputeCallHelper<DevCtx, Args..., PtenTypeTag<int>>::
template Compute<0, 0, 0, 0, 0, 0>(
dev_ctx, inputs, vec_inputs, attrs, outputs, vec_outputs);
}
// Typed entry point: casts the generic context to DevCtx and forwards the
// already-typed arguments to impl_fn.
// NOTE: Tensor in args is paddle::Tensor but not DenseTensor
static void VariadicCompute(const DeviceContext& dev_ctx, Args... args) {
return impl_fn(static_cast<DevCtx>(dev_ctx), std::forward<Args>(args)...);
}
private:
// Recursive helper, specialized on the type of the next kernel parameter.
// Only the parameter types listed below are supported.
template <typename... RemainingArgs>
struct CustomComputeCallHelper;
/* DeviceContext Helpers */
PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(CPUContext);
/* Input Helpers */
PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(Tensor);
PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(Tensor);
/* Attribute Helpers */
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool);
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float);
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double);
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int);
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t);
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(pten::dtype::float16);
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType);
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&);
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&);
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int>&);
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int64_t>&);
/* Output Helpers */
PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(Tensor);
PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(Tensor);
// End of the recursion: every kernel parameter has been collected, so the
// trailing arguments are exactly (DevCtx, Args...) and impl_fn is invoked.
template <typename T>
struct CustomComputeCallHelper<PtenTypeTag<T>> {
template <int dev_ctx_idx,
int in_idx,
int vec_in_idx,
int attr_idx,
int out_idx,
int vec_out_idx>
static void Compute(const DeviceContext& dev_ctx,
const std::vector<Tensor>& inputs,
const std::vector<std::vector<Tensor>>& vec_inputs,
const std::vector<paddle::any>& attrs,
std::vector<Tensor*>* outputs,
std::vector<std::vector<Tensor*>>* vec_outputs,
DevCtx device_ctx,
Args... args) {
return impl_fn(device_ctx, args...);
}
};
};
// Expands to the unified-signature adapter CustomKernelFuncImpl<...>::Compute
// generated for the given kernel function.
#define PD_PT_KERNEL(...) \
::paddle::CustomKernelFuncImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::Compute
// Expands to the address of the typed VariadicCompute adapter generated for
// the given kernel function, type-erased to void* via reinterpret_cast.
#define PD_PT_VARIADIC_KERNEL(...) \
reinterpret_cast<void*>( \
&::paddle::CustomKernelFuncImpl<decltype(&__VA_ARGS__), \
&__VA_ARGS__>::VariadicCompute)
////////////////////// Op Kernel Info depended structs //////////////////////
// TODO(Aganlengzi): Re-define TensorArgDef and AttributeArgDef temporarily.
// TensorArgDef follows pten::TensorArgDef in kernel_factory.h, the
// difference is that custom_kernel needs extra `is_vector' to ensure we can
// deal with case like vector with only one element.
struct TensorArgDef {
pten::Backend backend;
pten::DataLayout layout;
pten::DataType dtype;
bool is_vector{false};
TensorArgDef(pten::Backend in_backend,
pten::DataLayout in_layout,
pten::DataType in_dtype,
bool is_vector = false)
: backend(in_backend),
layout(in_layout),
dtype(in_dtype),
is_vector(is_vector) {}
TensorArgDef& SetBackend(pten::Backend in_backend) {
backend = in_backend;
return *this;
}
TensorArgDef& SetDataLayout(pten::DataLayout in_layout) {
layout = in_layout;
return *this;
}
TensorArgDef& SetDataType(pten::DataType in_dtype) {
dtype = in_dtype;
return *this;
}
};
// AttributeArgDef follows pten::AttributeArgDef in kernel_factory.h
// Definition of one attribute argument, identified by the std::type_index of
// its C++ type.
struct AttributeArgDef {
  std::type_index type_index;

  explicit AttributeArgDef(std::type_index in_type_index)
      : type_index(in_type_index) {}
};
////////////////////// Op Kernel Info //////////////////////
// OpKernelInfo stores all info parsed from user kernel function, includes:
// 0. op_name and kernel key(backend, data_layout and data_type)
// 1. unified custom kernel function
// 2. variadic kernel function(use paddle::Tensor)
// 3. args info and user defined change for specific arg
// Holds everything recorded for one custom kernel: op name, kernel key
// (backend, layout, data type), the two kernel function pointers and the
// parsed input/output/attribute definitions. Private state is exposed to the
// framework through the friend class framework::OpKernelInfoHelper.
class PADDLE_API OpKernelInfo {
public:
// Stores the op name and the kernel key; function pointers stay null and the
// argument lists stay empty until the setters / Append* methods are called.
explicit OpKernelInfo(const std::string& op_name,
pten::Backend backend,
pten::DataLayout data_layout,
pten::DataType data_type)
: op_name_(op_name),
backend_(backend),
layout_(data_layout),
dtype_(data_type) {}
// format: PD_PT_KERNEL(...)
OpKernelInfo& SetKernelFn(CustomKernelFunc&& func);
// format: PD_PT_VARIADIC_KERNEL(...)
OpKernelInfo& SetVariadicKernelFn(void* func);
// for Args parsing and storing
// Appends one input definition; `is_vector` marks a vector-of-tensors input.
void AppendInput(pten::Backend backend,
pten::DataLayout layout,
pten::DataType dtype,
bool is_vector = false) {
input_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector));
}
// Appends one output definition; `is_vector` marks a vector-of-tensors output.
void AppendOutput(pten::Backend backend,
pten::DataLayout layout,
pten::DataType dtype,
bool is_vector = false) {
output_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector));
}
// Appends one attribute definition identified by its type_index.
void AppendAttribute(std::type_index type_index) {
attribute_defs_.emplace_back(AttributeArgDef(type_index));
}
// for Args user-def function
// Mutable access to the idx-th input / output definition, looked up with
// the container's at().
TensorArgDef& InputAt(size_t idx) { return input_defs_.at(idx); }
TensorArgDef& OutputAt(size_t idx) { return output_defs_.at(idx); }
// Read-only access to the kernel key components.
const pten::Backend& GetBackend() const { return backend_; }
const pten::DataLayout& GetDataLayout() const { return layout_; }
const pten::DataType& GetDataType() const { return dtype_; }
private:
friend class framework::OpKernelInfoHelper;
// 1. op info
std::string op_name_;
// 2. kernel key info
pten::Backend backend_{pten::Backend::UNDEFINED};
pten::DataLayout layout_{pten::DataLayout::UNDEFINED};
pten::DataType dtype_{pten::DataType::UNDEFINED};
// 3. args info
paddle::SmallVector<TensorArgDef> input_defs_{{}};
paddle::SmallVector<TensorArgDef> output_defs_{{}};
paddle::SmallVector<AttributeArgDef> attribute_defs_{{}};
// 4. func info
CustomKernelFunc kernel_fn_{nullptr};
void* variadic_kernel_fn_{nullptr};
};
////////////////////// Op Kernel Args Parser //////////////////////
// Define CustomKernelArgsParseFunctor for args parsing
// We have to store parsed info into OpKernelInfo before
// mapping to pten::KernelArgsDef in pten::Kernel
template <typename Func>
struct CustomKernelArgsParseFunctor;
template <typename Return_, typename... Args_>
struct CustomKernelArgsParseFunctor<Return_ (*)(Args_...)> {
using Args = std::tuple<Args_...>;
enum : std::size_t { Arity = sizeof...(Args_) };
using Indices = std::make_index_sequence<Arity>;
template <std::size_t Index>
using Arg = typename std::tuple_element<Index, Args>::type;
static void Parse(OpKernelInfo* op_kernel_info) {
const pten::Backend& backend = op_kernel_info->GetBackend();
const pten::DataLayout& layout = op_kernel_info->GetDataLayout();
const pten::DataType& dtype = op_kernel_info->GetDataType();
auto default_tensor_layout = pten::DataLayout::NCHW;
if (layout != pten::DataLayout::ANY) {
default_tensor_layout = layout;
}
auto args_type = ParseArgType(Indices{});
for (auto arg_type : args_type) {
if (arg_type == std::type_index(typeid(const CPUContext&))) {
// do nothing, skip context arg now
} else if (arg_type == std::type_index(typeid(const Tensor&))) {
op_kernel_info->AppendInput(backend, default_tensor_layout, dtype);
} else if (arg_type ==
std::type_index(typeid(const std::vector<Tensor>&))) {
op_kernel_info->AppendInput(
backend, default_tensor_layout, dtype, true);
} else if (arg_type == std::type_index(typeid(Tensor*))) {
op_kernel_info->AppendOutput(backend, default_tensor_layout, dtype);
} else if (arg_type == std::type_index(typeid(std::vector<Tensor*>))) {
op_kernel_info->AppendOutput(
backend, default_tensor_layout, dtype, true);
} else {
op_kernel_info->AppendAttribute(arg_type);
}
}
}
private:
template <std::size_t... INDEX>
static std::vector<std::type_index> ParseArgType(
std::index_sequence<INDEX...>) {
return {std::type_index(typeid(Arg<INDEX>))...};
}
};
// Expands to the Parse function of the CustomKernelArgsParseFunctor
// instantiated for the given kernel function's pointer type.
#define PD_PT_ARGS_PARSE(...) \
::paddle::CustomKernelArgsParseFunctor<decltype(&__VA_ARGS__)>::Parse
//////////////// Op Kernel Info Map /////////////////
// all user custom kernels information are stored in this map
// Process-wide registry mapping an op name to the OpKernelInfo entries
// registered for it. Reached through Instance(); not copyable.
class PADDLE_API OpKernelInfoMap {
 public:
  // Returns the single registry object, created on first use.
  static OpKernelInfoMap& Instance() {
    static OpKernelInfoMap singleton;
    return singleton;
  }

  // Kernel infos registered under `name`.
  std::vector<OpKernelInfo>& operator[](const std::string& name);

  // Read-only view of the whole registry.
  const std::unordered_map<std::string, std::vector<OpKernelInfo>>& GetMap()
      const;

 private:
  OpKernelInfoMap() = default;

  std::unordered_map<std::string, std::vector<OpKernelInfo>> map_;

  PD_DISABLE_COPY_AND_ASSIGN(OpKernelInfoMap);
};
//////////////// Op Kernel Info Builder /////////////////
// format: PD_PT_ARGS_PARSE(...)
using CustomKernelArgsParseFn = void (*)(OpKernelInfo* op_kernel_info);
using CustomKernelArgsDefFn = void (*)(OpKernelInfo* kernel);
// Fluent builder used by PD_REGISTER_KERNEL: constructing it records a new
// OpKernelInfo for the given op name and kernel key, and the chained calls
// then fill in the kernel functions and argument definitions of that entry.
class PADDLE_API OpKernelInfoBuilder {
 public:
  explicit OpKernelInfoBuilder(std::string&& op_name,
                               pten::Backend backend,
                               pten::DataLayout data_layout,
                               pten::DataType data_type);

  // Each call below returns *this so that calls can be chained.
  OpKernelInfoBuilder& SetKernelFn(CustomKernelFunc func);
  OpKernelInfoBuilder& SetVariadicKernelFn(void* func);
  OpKernelInfoBuilder& ArgsParse(CustomKernelArgsParseFn func);
  OpKernelInfoBuilder& ArgsDef(CustomKernelArgsDefFn func);

 private:
  // op name
  std::string op_name_;
  // kernel key info
  pten::Backend backend_{pten::Backend::UNDEFINED};
  pten::DataLayout layout_{pten::DataLayout::UNDEFINED};
  pten::DataType dtype_{pten::DataType::UNDEFINED};
  // Non-owning pointer to the OpKernelInfo being built. Default-initialized
  // like the other members so that it never holds an indeterminate value.
  OpKernelInfo* info_ptr_{nullptr};
};
/////////////////////// Custom kernel register API /////////////////////////
// For inference: compile directly with framework
// Call after PD_REGISTER_KERNEL(...)
void RegisterAllCustomKernel();
// Using this api to load compiled custom kernel's dynamic library and
// register custom kernels
void LoadCustomKernelLib(const std::string& dso_name);
//////////////// Custom kernel register macro /////////////////
#define PD_BACKEND(arg__) pten::Backend::arg__
#define PD_DATALAYOUT(arg__) pten::DataLayout::arg__
#define PD_DATATYPE(arg__) pten::DataType::arg__
// Registers `func` as the custom kernel of op `name` for the given backend,
// layout and data type. Must be used in the global namespace and followed by
// a brace-enclosed body: that body becomes the user args-def hook, whose only
// parameter is `::paddle::OpKernelInfo* kernel`.
//
// The macro forward-declares the hook, creates a static OpKernelInfoBuilder
// that records the kernel functions, parses the argument types and calls the
// hook, and finally opens the hook's definition.
//
// All generated identifiers paste name, backend, layout and dtype. The hook
// name used to paste the literal token `layout_` instead of the `layout`
// argument, so two registrations differing only by layout produced the same
// function name.
#define PD_REGISTER_KERNEL(name, backend, layout, dtype, func) \
  STATIC_ASSERT_GLOBAL_NAMESPACE( \
      __reg_kernel__##name##_##backend##_##layout##_##dtype, \
      "PD_REGISTER_KERNEL must be called in global namespace."); \
  void __PD_USER_args_def_##name##_##backend##_##layout##_##dtype( \
      ::paddle::OpKernelInfo* op_kernel_info); \
  static ::paddle::OpKernelInfoBuilder \
      __op_kernel_info_##name##_##backend##_##layout##_##dtype = \
          ::paddle::OpKernelInfoBuilder(#name, \
                                        PD_BACKEND(backend), \
                                        PD_DATALAYOUT(layout), \
                                        PD_DATATYPE(dtype)) \
              .SetKernelFn(PD_PT_KERNEL(func)) \
              .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL(func)) \
              .ArgsParse(PD_PT_ARGS_PARSE(func)) \
              .ArgsDef( \
                  &__PD_USER_args_def_##name##_##backend##_##layout##_##dtype); \
  void __PD_USER_args_def_##name##_##backend##_##layout##_##dtype( \
      ::paddle::OpKernelInfo* kernel)
} // namespace paddle
......@@ -3,16 +3,17 @@ add_subdirectory(utils)
cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place)
if (WITH_GPU)
nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
elseif (WITH_ROCM)
hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
else()
cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
endif()
cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor pten_context kernel_factory)
cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor)
cc_library(op_kernel_info SRCS op_kernel_info.cc DEPS pten_tensor)
# forward api file
set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py)
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/fluid/framework/custom_kernel.h"
namespace paddle {
////////////////////// Op Kernel Info //////////////////////
// Stores the unified-signature kernel function and returns *this for
// chaining.
OpKernelInfo& OpKernelInfo::SetKernelFn(CustomKernelFunc&& func) {
  // `func` is a named rvalue reference, so std::move yields the same rvalue
  // the previous std::forward<CustomKernelFunc> cast did.
  kernel_fn_ = std::move(func);
  return *this;
}
// Stores the type-erased variadic kernel function pointer and returns *this
// for chaining.
OpKernelInfo& OpKernelInfo::SetVariadicKernelFn(void* func) {
variadic_kernel_fn_ = func;
return *this;
}
//////////////// Op Kernel Info Map /////////////////
// Returns the kernel infos registered under `name`. Because this forwards to
// std::unordered_map::operator[], an empty vector is inserted when `name` has
// no entry yet.
std::vector<OpKernelInfo>& OpKernelInfoMap::operator[](
const std::string& name) {
return map_[name];
}
// Returns a read-only reference to the underlying name -> kernel infos map.
const std::unordered_map<std::string, std::vector<OpKernelInfo>>&
OpKernelInfoMap::GetMap() const {
return map_;
}
//////////////// Op Kernel Info Builder /////////////////
// Records a new OpKernelInfo for `op_name` with the given kernel key in the
// global OpKernelInfoMap and remembers a pointer to it for the chained
// setters.
OpKernelInfoBuilder::OpKernelInfoBuilder(std::string&& op_name,
                                         pten::Backend backend,
                                         pten::DataLayout data_layout,
                                         pten::DataType data_type)
    : op_name_(std::move(op_name)),
      backend_(backend),
      layout_(data_layout),
      dtype_(data_type),
      info_ptr_(nullptr) {
  // Append a fresh entry to the list of kernels registered for this op.
  auto& kernels_of_op = OpKernelInfoMap::Instance()[op_name_];
  kernels_of_op.emplace_back(
      OpKernelInfo(op_name_, backend_, layout_, dtype_));
  // Points into the vector: it is only valid until that vector reallocates,
  // e.g. when another kernel is registered for the same op.
  info_ptr_ = &(kernels_of_op.back());
}
// Forwards the unified-signature kernel function to the OpKernelInfo being
// built and returns *this for chaining.
OpKernelInfoBuilder& OpKernelInfoBuilder::SetKernelFn(CustomKernelFunc func) {
  // `func` is a by-value copy; hand it over as an rvalue, as the callee's
  // CustomKernelFunc&& parameter requires.
  info_ptr_->SetKernelFn(std::move(func));
  return *this;
}
// Forwards the type-erased variadic kernel function to the OpKernelInfo being
// built and returns *this for chaining.
OpKernelInfoBuilder& OpKernelInfoBuilder::SetVariadicKernelFn(void* func) {
info_ptr_->SetVariadicKernelFn(func);
return *this;
}
// Runs the given argument-parsing function immediately on the OpKernelInfo
// being built and returns *this for chaining.
OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsParse(
CustomKernelArgsParseFn func) {
func(this->info_ptr_);
return *this;
}
// Runs the given user args-def function immediately on the OpKernelInfo being
// built and returns *this for chaining.
OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsDef(CustomKernelArgsDefFn func) {
func(this->info_ptr_);
return *this;
}
/////////////////////// Op register API /////////////////////////
// For inference: compile directly with framework
// Call after PD_REGISTER_KERNEL(...)
// Passes the global OpKernelInfoMap to
// framework::RegisterKernelWithMetaInfoMap.
void RegisterAllCustomKernel() {
auto& op_kernel_info_map = OpKernelInfoMap::Instance();
framework::RegisterKernelWithMetaInfoMap(op_kernel_info_map);
}
// Using this api to load compiled custom kernel's dynamic library and
// register custom kernels
// Public wrapper that delegates to framework::LoadCustomKernelLib with the
// given dynamic library name.
void LoadCustomKernelLib(const std::string& dso_name) {
framework::LoadCustomKernelLib(dso_name);
}
} // namespace paddle
#ifdef __cplusplus
extern "C" {
#endif
// C-API to get global OpKernelInfoMap.
// Returns the global OpKernelInfoMap singleton. Declared with C linkage (see
// the surrounding extern "C" block), so the symbol name is not C++-mangled.
paddle::OpKernelInfoMap& PD_GetOpKernelInfoMap() {
return paddle::OpKernelInfoMap::Instance();
}
#ifdef __cplusplus
} // end extern "C"
#endif
......@@ -26,7 +26,7 @@ class KernelSignature;
class ArgumentMappingContext;
class InferMetaContext;
using KernelFn = void (*)(KernelContext* ctx);
using KernelFn = std::function<void(KernelContext* ctx)>;
using KernelArgsDefFn = void (*)(Kernel* kernel);
using KernelArgsParseFn = void (*)(const KernelKey& default_key,
KernelArgsDef* args_def);
......
......@@ -49,8 +49,6 @@ using DataLayout = paddle::experimental::DataLayout;
class KernelContext;
using KernelFn = void (*)(KernelContext* ctx);
class KernelKey {
public:
KernelKey() = default;
......
# for paddle test case
if(WITH_TESTING)
cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags)
endif()
cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags)
......@@ -371,6 +371,17 @@ if load_noavx:
raise e
def set_paddle_custom_device_lib_path(lib_path):
    """Give the CUSTOM_DEVICE_ROOT environment variable a default value.

    If the variable is already set (even to an empty string) it is left
    untouched. Otherwise it is set to the normalized ``lib_path`` when that
    path exists, and to an empty string when it does not.

    Args:
        lib_path: Candidate path to use as the default.
    """
    if 'CUSTOM_DEVICE_ROOT' in os.environ:
        # Keep the value that is already set in the environment.
        return
    default_root = os.path.normpath(lib_path) if os.path.exists(
        lib_path) else ''
    os.environ['CUSTOM_DEVICE_ROOT'] = default_root
# set paddle lib path
def set_paddle_lib_path():
site_dirs = site.getsitepackages() if hasattr(
......@@ -380,11 +391,15 @@ def set_paddle_lib_path():
lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
if os.path.exists(lib_dir):
_set_paddle_lib_path(lib_dir)
set_paddle_custom_device_lib_path(
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
return
if hasattr(site, 'USER_SITE'):
lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
if os.path.exists(lib_dir):
_set_paddle_lib_path(lib_dir)
set_paddle_custom_device_lib_path(
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
set_paddle_lib_path()
......@@ -9,5 +9,6 @@ endforeach()
add_subdirectory(unittests)
add_subdirectory(book)
add_subdirectory(custom_op)
add_subdirectory(custom_kernel)
set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120)
py_test(test_custom_kernel_dot SRCS test_custom_kernel_dot.py)
py_test(test_custom_kernel_load SRCS test_custom_kernel_load.py)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
namespace paddle {
namespace custom_kernel {

// Here we use dot <CPU, ANY, INT8> for test.
// This test will fail when this kernel is supported in framework.

/// @brief Row-wise dot product of `x` and `y`.
///
/// The innermost axis of `x` (size B) is reduced: for every row `j` formed by
/// the leading axes, `out[j] = sum_i x[j * B + i] * y[j * B + i]`. The sum is
/// accumulated in `T`.
///
/// @param dev_ctx CPU device context; not used by this implementation.
/// @param x       Left operand; its shape drives the iteration.
/// @param y       Right operand, read with the same linear indexing as `x`
///                (assumed to hold at least as many elements as `x`).
/// @param out     Output tensor; its buffer is obtained through
///                `mutable_data<T>` on the CPU place and is assumed to hold
///                one element per row.
template <typename T>
void Dot(const paddle::CPUContext& dev_ctx,
         const paddle::Tensor& x,
         const paddle::Tensor& y,
         paddle::Tensor* out) {
  const T* lhs = x.data<T>();
  const T* rhs = y.data<T>();
  T* z = out->mutable_data<T>(paddle::PlaceType::kCPU);

  const auto shape = x.shape();
  // Without any axis there is nothing to reduce over, and indexing
  // shape[shape.size() - 1] would read out of bounds.
  if (shape.empty()) return;

  // B is the extent of the innermost (reduced) axis.
  const int64_t B = shape[shape.size() - 1];

  // Derive the row count from the leading axes instead of numel() / B so that
  // an innermost extent of 0 cannot trigger a division by zero; such rows
  // simply reduce to 0.
  int64_t rows = 1;
  for (decltype(shape.size()) d = 0; d + 1 < shape.size(); ++d) {
    rows *= shape[d];
  }

  // 64-bit counters: the extents are 64-bit, an `int` counter could overflow.
  for (int64_t j = 0; j < rows; ++j) {
    T ss = 0;
    for (int64_t i = 0; i < B; ++i) ss += (*lhs++) * (*rhs++);
    z[j] = ss;
  }
}

}  // namespace custom_kernel
}  // namespace paddle
// Registers paddle::custom_kernel::Dot<int8_t> through PD_REGISTER_KERNEL with
// the arguments (dot, CPU, ALL_LAYOUT, INT8).
PD_REGISTER_KERNEL(
dot, CPU, ALL_LAYOUT, INT8, paddle::custom_kernel::Dot<int8_t>) {
/* Optional per-kernel argument configuration goes in this body.
 * The only parameter available here is `OpKernelInfo* kernel`. */
// Declare the data type of output 0 as INT8.
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::INT8);
}
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from paddle.fluid import core
from distutils.sysconfig import get_python_lib
from distutils.core import setup, Extension
# Build script for the `custom_kernel_dot` extension: compiles
# custom_kernel_dot.cc against the headers and core library found under the
# paddle package in site-packages.

# cc flags
paddle_extra_compile_args = ['-std=c++14', '-shared', '-fPIC']
if core.is_compiled_with_npu():
    # NOTE(review): presumably the NPU build of paddle uses the pre-C++11
    # libstdc++ ABI and the extension has to match it -- confirm.
    paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']

# include path
site_packages_path = get_python_lib()
paddle_custom_kernel_include = [
    os.path.join(site_packages_path, 'paddle', 'include'),
]

# libs path
paddle_custom_kernel_library_dir = [
    os.path.join(site_packages_path, 'paddle', 'fluid'),
]

# libs
# Link against the AVX core library by default and fall back to the non-AVX
# one only when the AVX core is absent and the non-AVX core is present.
libs = [':core_avx.so']
if not core.has_avx_core and core.has_noavx_core:
    libs = [':core_noavx.so']

custom_kernel_dot_module = Extension(
    'custom_kernel_dot',
    sources=['custom_kernel_dot.cc'],
    include_dirs=paddle_custom_kernel_include,
    library_dirs=paddle_custom_kernel_library_dir,
    libraries=libs,
    extra_compile_args=paddle_extra_compile_args)

setup(
    name='custom_kernel_dot',
    version='1.0',
    # Fixed typo in the package description ('fot' -> 'for').
    description='custom kernel for compiling',
    ext_modules=[custom_kernel_dot_module])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import site
import unittest
import numpy as np
# use dot <CPU, ANY, INT8> as test case.
class TestCustomKernelDot(unittest.TestCase):
def setUp(self):
# compile so and set to current path
cur_dir = os.path.dirname(os.path.abspath(__file__))
# --inplace to place output so file to current dir
cmd = 'cd {} && {} custom_kernel_dot_setup.py build_ext --inplace'.format(
cur_dir, sys.executable)
os.system(cmd)
# set environment for loading and registering compiled custom kernels
# only valid in current process
os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir
def test_custom_kernel_dot_run(self):
# test dot run
x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
y_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
result = np.sum(x_data * y_data, axis=1).reshape([2, 1])
import paddle
paddle.set_device('cpu')
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
out = paddle.dot(x, y)
self.assertTrue(
np.array_equal(out.numpy(), result),
"custom kernel dot out: {},\n numpy dot out: {}".format(out.numpy(),
result))
def tearDown(self):
del os.environ['CUSTOM_DEVICE_ROOT']
if __name__ == '__main__':
    if os.name == 'nt' or sys.platform.startswith('darwin'):
        # only support Linux now
        # sys.exit() rather than exit(): the latter is injected by the `site`
        # module for interactive use and is not guaranteed to exist.
        sys.exit()
    unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import site
import unittest
import numpy as np
class TestCustomKernelLoad(unittest.TestCase):
def setUp(self):
# compile so and set to current path
cur_dir = os.path.dirname(os.path.abspath(__file__))
# --inplace to place output so file to current dir
cmd = 'cd {} && {} custom_kernel_dot_setup.py build_ext --inplace'.format(
cur_dir, sys.executable)
os.system(cmd)
# get paddle lib path and place so
paddle_lib_path = ''
site_dirs = site.getsitepackages() if hasattr(
site, 'getsitepackages'
) else [x for x in sys.path if 'site-packages' in x]
for site_dir in site_dirs:
lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
if os.path.exists(lib_dir):
paddle_lib_path = lib_dir
break
if paddle_lib_path == '':
if hasattr(site, 'USER_SITE'):
lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
if os.path.exists(lib_dir):
paddle_lib_path = lib_dir
self.default_path = os.path.sep.join(
[paddle_lib_path, '..', '..', 'paddle-plugins'])
# copy so to defalut path
cmd = 'mkdir -p {} && cp ./*.so {}'.format(self.default_path,
self.default_path)
os.system(cmd) # wait
def test_custom_kernel_dot_load(self):
# test dot load
x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
y_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
result = np.sum(x_data * y_data, axis=1).reshape([2, 1])
import paddle
paddle.set_device('cpu')
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
out = paddle.dot(x, y)
self.assertTrue(
np.array_equal(out.numpy(), result),
"custom kernel dot out: {},\n numpy dot out: {}".format(out.numpy(),
result))
def tearDown(self):
cmd = 'rm -rf {}'.format(self.default_path)
os.system(cmd)
if __name__ == '__main__':
    if os.name == 'nt' or sys.platform.startswith('darwin'):
        # only support Linux now
        # sys.exit() rather than exit(): the latter is injected by the `site`
        # module for interactive use and is not guaranteed to exist.
        sys.exit()
    unittest.main()
......@@ -573,7 +573,8 @@ headers = (
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/common')) + # pten common headers
# For paddle new custom op, only copy data type headers from `paddle/fluid/platform`
# to `paddle/pten/api/ext`,
['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'])
['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] +
['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'])
if '${WITH_MKLDNN}' == 'ON':
headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册