Unverified · Commit cb81befa · Authored by huangjiyi · Committed by GitHub

register fluid kernels to phi [part 6.5] (#52882)

* update

* fix bug

* update

* fix bug
Parent: bc91012f
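Every file below follows the same two-step conversion, so it is worth seeing once in isolation before reading the hunks. The fluid macros REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL take one fully instantiated kernel type per dtype; PD_REGISTER_STRUCT_KERNEL instead takes the backend, the layout, the kernel class template, and a plain dtype list, and the kernel's template parameters swap from <DeviceContext, T> to <T, DeviceContext>. A minimal sketch, using a hypothetical my_op / MyOpKernel (only the macro shapes mirror the real diffs below):

// Before: fluid registration, device context first, one instantiation
// per supported dtype.
//
//   template <typename DeviceContext, typename T>
//   class MyOpKernel : public framework::OpKernel<T> { /* ... */ };
//
//   REGISTER_OP_CPU_KERNEL(my_op,
//                          ops::MyOpKernel<phi::CPUContext, float>,
//                          ops::MyOpKernel<phi::CPUContext, double>);

// After: phi struct-kernel registration, dtype first; the macro expands
// one registration per listed dtype, with backend and layout explicit.
//
//   template <typename T, typename DeviceContext>
//   class MyOpKernel : public framework::OpKernel<T> { /* ... */ };
//
//   PD_REGISTER_STRUCT_KERNEL(
//       my_op, CPU, ALL_LAYOUT, ops::MyOpKernel, float, double) {}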
@@ -2483,21 +2483,32 @@ Scope* OperatorWithKernel::PrepareData(
     }
     std::unique_ptr<phi::KernelKey> new_expected_kernel_key = nullptr;
-    if (run_phi_kernel_ && in_def != nullptr &&
-        in_def->backend != phi::Backend::ALL_BACKEND) {
-      auto tensor_backend = phi::TransToPhiBackend(tensor_in->place());
-      if ((in_def->backend != tensor_backend &&
-           !(in_def->backend == phi::Backend::GPUDNN &&
-             tensor_backend == phi::Backend::GPU) &&
-           !(in_def->backend == phi::Backend::KPS &&
-             tensor_backend == phi::Backend::XPU) &&
-           !(in_def->backend == phi::Backend::ONEDNN &&
-             tensor_backend == phi::Backend::CPU)) ||
-          tensor_in->place().GetType() == AllocationType::GPUPINNED) {
-        new_expected_kernel_key =
-            std::make_unique<phi::KernelKey>(in_def->backend,
-                                             expected_kernel_key.layout(),
-                                             expected_kernel_key.dtype());
+    if (run_phi_kernel_) {
+      if (phi_kernel_->GetKernelRegisteredType() ==
+          phi::KernelRegisteredType::STRUCTURE) {
+        if (!backends_are_same_class(kernel_type_for_var.backend(),
+                                     expected_kernel_key.backend())) {
+          new_expected_kernel_key =
+              std::make_unique<phi::KernelKey>(expected_kernel_key.backend(),
+                                               expected_kernel_key.layout(),
+                                               expected_kernel_key.dtype());
+        }
+      } else if (in_def != nullptr &&
+                 in_def->backend != phi::Backend::ALL_BACKEND) {
+        auto tensor_backend = phi::TransToPhiBackend(tensor_in->place());
+        if ((in_def->backend != tensor_backend &&
+             !(in_def->backend == phi::Backend::GPUDNN &&
+               tensor_backend == phi::Backend::GPU) &&
+             !(in_def->backend == phi::Backend::KPS &&
+               tensor_backend == phi::Backend::XPU) &&
+             !(in_def->backend == phi::Backend::ONEDNN &&
+               tensor_backend == phi::Backend::CPU)) ||
+            tensor_in->place().GetType() == AllocationType::GPUPINNED) {
+          new_expected_kernel_key =
+              std::make_unique<phi::KernelKey>(in_def->backend,
+                                               expected_kernel_key.layout(),
+                                               expected_kernel_key.dtype());
+        }
+      }
     }
......
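The hunk above is the runtime half of the change: kernels registered via PD_REGISTER_STRUCT_KERNEL report phi::KernelRegisteredType::STRUCTURE, and for those PrepareData compares backend classes of the whole kernel key via backends_are_same_class instead of consulting per-input backend defs, which struct kernels do not carry. The diff does not show backends_are_same_class itself; the sketch below only infers its behavior from the compatible backend pairs listed in the replaced check, and is an assumption rather than Paddle's actual implementation.

// Hypothetical sketch of backends_are_same_class, inferred from the
// backend pairs the removed per-input check treated as equivalent.
bool backends_are_same_class_sketch(phi::Backend a, phi::Backend b) {
  auto canonical = [](phi::Backend be) {
    switch (be) {
      case phi::Backend::GPUDNN:
        return phi::Backend::GPU;  // GPUDNN kernels consume GPU tensors
      case phi::Backend::ONEDNN:
        return phi::Backend::CPU;  // ONEDNN kernels consume CPU tensors
      case phi::Backend::KPS:
        return phi::Backend::XPU;  // KPS kernels consume XPU tensors
      default:
        return be;
    }
  };
  return canonical(a) == canonical(b);
}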
@@ -73,6 +73,8 @@ class MpAllReduceSumOpGradMaker : public framework::SingleGradOpMaker<T> {
 DECLARE_INPLACE_OP_INFERER(MpAllReduceSumInplaceInferer, {"X", "Out"});
+DEFINE_C_ALLREDUCE_CPU_KERNEL(MpAllReduceSum, kRedSum);
 }  // namespace operators
 }  // namespace paddle
@@ -86,9 +88,12 @@ REGISTER_OPERATOR(mp_allreduce_sum,
                   ops::MpAllReduceSumOpMaker,
                   ops::MpAllReduceSumInplaceInferer);
-REGISTER_OP_CPU_KERNEL(mp_allreduce_sum,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, plat::float16>)
+PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::MpAllReduceSumCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
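Note the brace pair that closes every PD_REGISTER_STRUCT_KERNEL call. By analogy with phi's PD_REGISTER_KERNEL, it is the body of a per-kernel customization hook that the macro expands to; every registration in this commit leaves it empty. A hedged sketch of a non-empty body, assuming the hook exposes the same kernel pointer as PD_REGISTER_KERNEL (some_op / SomeOpKernel are hypothetical, and this exact call is not part of the commit):

// Hypothetical: adjusting registered kernel metadata in the hook body.
PD_REGISTER_STRUCT_KERNEL(some_op, CPU, ALL_LAYOUT, ops::SomeOpKernel, float) {
  kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32);
}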
@@ -15,16 +15,24 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 namespace paddle {
 namespace operators {
+DEFINE_C_ALLREDUCE_CUDA_KERNEL(MpAllReduceSum, kRedSum)
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    mp_allreduce_sum,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, float>,
+PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::MpAllReduceSumCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
 #if NCCL_VERSION_CODE >= 21000
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::bfloat16>,
+                          plat::bfloat16,
 #endif
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
+                          plat::float16) {
+}
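One detail in the GPU hunk above: the NCCL version guard now excludes a single plat::bfloat16 token from the macro's variadic dtype list instead of a whole CAllReduceOpCUDAKernel instantiation. This works because the preprocessor resolves the #if before PD_REGISTER_STRUCT_KERNEL is expanded, so on older NCCL versions the dtype simply drops out of the list.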
@@ -114,6 +114,5 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(iou_similarity,
-                       ops::IOUSimilarityKernel<phi::CPUContext, float>,
-                       ops::IOUSimilarityKernel<phi::CPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(
+    iou_similarity, CPU, ALL_LAYOUT, ops::IOUSimilarityKernel, float, double) {}
@@ -15,6 +15,5 @@ limitations under the License. */
 #include "paddle/fluid/operators/detection/iou_similarity_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(iou_similarity,
-                        ops::IOUSimilarityKernel<phi::GPUContext, float>,
-                        ops::IOUSimilarityKernel<phi::GPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(
+    iou_similarity, GPU, ALL_LAYOUT, ops::IOUSimilarityKernel, float, double) {}
@@ -105,7 +105,7 @@ struct IOUSimilarityFunctor {
 namespace paddle {
 namespace operators {
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class IOUSimilarityKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -160,7 +160,7 @@ void GetMaxScoreIndexWithLocalityAware(
   }
 }
-template <typename T>
+template <typename T, typename DeviceContext>
 class LocalityAwareNMSKernel : public framework::OpKernel<T> {
  public:
   void LocalityAwareNMSFast(phi::DenseTensor* bbox,
@@ -520,6 +520,9 @@ REGISTER_OPERATOR(
     ops::LocalityAwareNMSOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(locality_aware_nms,
-                       ops::LocalityAwareNMSKernel<float>,
-                       ops::LocalityAwareNMSKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(locality_aware_nms,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::LocalityAwareNMSKernel,
+                          float,
+                          double) {}
@@ -49,7 +49,7 @@ inline MiningType GetMiningType(std::string str) {
   }
 }
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class MineHardExamplesKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -403,6 +403,9 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(mine_hard_examples,
-                       ops::MineHardExamplesKernel<phi::CPUContext, float>,
-                       ops::MineHardExamplesKernel<phi::CPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(mine_hard_examples,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::MineHardExamplesKernel,
+                          float,
+                          double) {}
@@ -143,7 +143,7 @@ void SliceOneClass(const platform::DeviceContext& ctx,
   }
 }
-template <typename T>
+template <typename T, typename DeviceContext>
 class MultiClassNMSKernel : public framework::OpKernel<T> {
  public:
   void NMSFast(const phi::DenseTensor& bbox,
@@ -629,6 +629,9 @@ class MultiClassNMS3OpMaker : public MultiClassNMS2OpMaker {
   }
 };
+template <typename T, typename DeviceContext>
+class MultiClassNMS2Kernel : public MultiClassNMSKernel<T, DeviceContext> {};
 }  // namespace operators
 }  // namespace paddle
@@ -643,18 +646,21 @@ REGISTER_OPERATOR(
     ops::MultiClassNMSOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(multiclass_nms,
-                       ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(
+    multiclass_nms, CPU, ALL_LAYOUT, ops::MultiClassNMSKernel, float, double) {}
 REGISTER_OPERATOR(
     multiclass_nms2,
     ops::MultiClassNMS2Op,
     ops::MultiClassNMS2OpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(multiclass_nms2,
-                       ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(multiclass_nms2,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::MultiClassNMS2Kernel,
+                          float,
+                          double) {}
 REGISTER_OPERATOR(
     multiclass_nms3,
......
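The new MultiClassNMS2Kernel above is an empty subclass of MultiClassNMSKernel. The old fluid macros registered the same kernel class under both multiclass_nms and multiclass_nms2; the alias class presumably exists because the struct-kernel macro derives registration symbols from the kernel class name, so each op name needs a distinct class. That is an inference from the diff, not a documented constraint.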
@@ -270,7 +270,7 @@ __global__ void broadcast_batch_head_number(const T *src,
   }
 }
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -423,12 +423,15 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000
-REGISTER_OP_CUDA_KERNEL(
-    multihead_matmul,
-    ops::MultiHeadMatMulV2Kernel<phi::GPUContext, paddle::platform::float16>,
-    ops::MultiHeadMatMulV2Kernel<phi::GPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(multihead_matmul,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::MultiHeadMatMulV2Kernel,
+                          float,
+                          plat::float16) {}
 #else
-REGISTER_OP_CUDA_KERNEL(multihead_matmul,
-                        ops::MultiHeadMatMulV2Kernel<phi::GPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(
+    multihead_matmul, GPU, ALL_LAYOUT, ops::MultiHeadMatMulV2Kernel, float) {}
 #endif
@@ -240,7 +240,7 @@ void MatchMatrixTensorOpMaker::Make() {
 )DOC");
 }
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -321,7 +321,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
   }
 };
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -458,10 +458,13 @@ REGISTER_OPERATOR(
     ops::MatchMatrixTensorGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(match_matrix_tensor_grad, ops::MatchMatrixTensorOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    match_matrix_tensor,
-    ops::CPUMatchMatrixTensorOPKernel<phi::CPUContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    match_matrix_tensor_grad,
-    ops::CPUMatchMatrixTensorOPGradKernel<phi::CPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(match_matrix_tensor,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CPUMatchMatrixTensorOPKernel,
+                          float) {}
+PD_REGISTER_STRUCT_KERNEL(match_matrix_tensor_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CPUMatchMatrixTensorOPGradKernel,
+                          float) {}
@@ -107,7 +107,6 @@ REGISTER_OPERATOR(
     ops::MeanIoUOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(mean_iou,
-                       ops::MeanIoUKernel<int>,
-                       ops::MeanIoUKernel<int32_t>,
-                       ops::MeanIoUKernel<int64_t>);
+PD_REGISTER_STRUCT_KERNEL(
+    mean_iou, CPU, ALL_LAYOUT, ops::MeanIoUKernel, int, int64_t) {}
@@ -88,7 +88,7 @@ __global__ void ComputeIoUCUDAKernel(
   }
 }
-template <typename T>
+template <typename T, typename DeviceContext>
 class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -166,7 +166,5 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(mean_iou,
-                        ops::MeanIoUCUDAOpKernel<int>,
-                        ops::MeanIoUCUDAOpKernel<int64_t>,
-                        ops::MeanIoUCUDAOpKernel<int32_t>);
+PD_REGISTER_STRUCT_KERNEL(
+    mean_iou, GPU, ALL_LAYOUT, ops::MeanIoUCUDAOpKernel, int, int64_t) {}
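Both mean_iou registrations also shrink their dtype lists: the old fluid lists named int, int32_t, and int64_t, but int and int32_t are the same type on the platforms Paddle builds for, so the old macros effectively listed one instantiation twice; the new lists keep just int and int64_t.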
@@ -27,7 +27,7 @@ template <typename T,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T>
+template <typename T, typename DeviceContext>
 class MeanIoUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -155,6 +155,8 @@ REGISTER_OPERATOR(minus,
                   ops::MinusOpMaker,
                   ops::MinusGradDescMaker,
                   ops::MinusGradMaker);
-REGISTER_OP_CPU_KERNEL(minus, ops::MinusKernel<phi::CPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {}
-REGISTER_OP_CUDA_KERNEL(minus, ops::MinusKernel<phi::GPUContext, float>);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {}
+#endif
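minus is the one op here whose GPU kernel is registered from the same source file as its CPU kernel, which is presumably why the new GPU registration gains an explicit guard: the #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) block keeps the GPU struct-kernel registration out of CPU-only builds, and also covers HIP, which the old CUDA-only macro did not.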
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class MinusKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
......
@@ -233,6 +233,6 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     ops::LarsMomentumOpVarTypeInference);
-REGISTER_OP_CPU_KERNEL(lars_momentum,
-                       ops::LarsMomentumOpKernel<float>,
-                       ops::LarsMomentumOpKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(
+    lars_momentum, CPU, ALL_LAYOUT, ops::LarsMomentumOpKernel, float, double) {}
@@ -474,7 +474,7 @@ inline void SeparatedLarsMomentumOpCUDAKernel(const phi::GPUContext& cuda_ctx,
                                               is_amp);
 }
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
   using MT = MultiPrecisionType<T>;
@@ -679,8 +679,11 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    lars_momentum,
-    ops::LarsMomentumOpCUDAKernel<phi::GPUContext, float>,
-    ops::LarsMomentumOpCUDAKernel<phi::GPUContext, double>,
-    ops::LarsMomentumOpCUDAKernel<phi::GPUContext, paddle::platform::float16>);
+namespace plat = paddle::platform;
+PD_REGISTER_STRUCT_KERNEL(lars_momentum,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::LarsMomentumOpCUDAKernel,
+                          float,
+                          double,
+                          plat::float16) {}
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename T>
+template <typename T, typename DeviceContext>
 class LarsMomentumOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -10,7 +10,6 @@ register_unity_group(
   lars_momentum_op.cc
   proximal_adagrad_op.cc
   adam_op.cc
-  dgc_momentum_op.cc
   proximal_gd_op.cc
   decayed_adagrad_op.cc
   adadelta_op.cc
......
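The cmake hunk drops dgc_momentum_op.cc from its unity-build group. This matches the pattern elsewhere in the series: files converted to PD_REGISTER_STRUCT_KERNEL tend to be pulled out of unity groups, presumably so the symbols the registration macro generates cannot collide when a unity build concatenates several sources into one translation unit. The dgc_momentum conversion itself is not visible in the excerpt above.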