Unverified commit eb38c85f, authored by huangjiyi, committed by GitHub

register fluid kernels to phi [part 5] (#52486)

* update

* fix bug

* update

* fix bug
Parent 5bac67d4
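
The changes below follow one recurring pattern: kernels previously registered with the fluid REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL macros are re-registered into phi via PD_REGISTER_STRUCT_KERNEL, and the kernel class templates swap their parameter order from <DeviceContext, T> to <T, DeviceContext>. A minimal sketch of that pattern, using a hypothetical my_op / MyOpKernel rather than an operator from this PR:

// Before: kernel parameterized as <DeviceContext, T>, registered per device
// with the fluid macro, one instantiation listed for every data type.
template <typename DeviceContext, typename T>
class MyOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};

REGISTER_OP_CPU_KERNEL(my_op,
                       ops::MyOpKernel<phi::CPUContext, float>,
                       ops::MyOpKernel<phi::CPUContext, double>);

// After: parameter order becomes <T, DeviceContext>; the phi macro names the
// backend (CPU/GPU) and layout once and lists only the data types.
template <typename T, typename DeviceContext>
class MyOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};

PD_REGISTER_STRUCT_KERNEL(
    my_op, CPU, ALL_LAYOUT, ops::MyOpKernel, float, double) {}
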
......@@ -116,6 +116,7 @@ class FetchV2Op : public framework::OperatorWithKernel {
}
};
template <typename T, typename DeviceContext>
class FetchV2Kernel {
public:
void operator()(const framework::ExecutionContext &ctx) const {
......@@ -228,28 +229,19 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL_FUNCTOR(fetch_v2,
float,
ops::FetchV2Kernel,
double,
ops::FetchV2Kernel,
int8_t,
ops::FetchV2Kernel,
uint8_t,
ops::FetchV2Kernel,
int,
ops::FetchV2Kernel,
int64_t,
ops::FetchV2Kernel,
bool,
ops::FetchV2Kernel,
paddle::platform::bfloat16,
ops::FetchV2Kernel,
paddle::platform::complex<float>,
ops::FetchV2Kernel,
paddle::platform::complex<double>,
ops::FetchV2Kernel,
plat::float16,
ops::FetchV2Kernel,
int16_t,
ops::FetchV2Kernel);
PD_REGISTER_STRUCT_KERNEL(fetch_v2,
CPU,
ALL_LAYOUT,
ops::FetchV2Kernel,
float,
double,
int,
int8_t,
int16_t,
int64_t,
uint8_t,
bool,
plat::float16,
plat::bfloat16,
plat::complex<float>,
plat::complex<double>) {}
......@@ -206,6 +206,6 @@ REGISTER_OPERATOR(
ops::FCOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(fc,
ops::FCOpKernel<phi::CPUContext, float>,
ops::FCOpKernel<phi::CPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(fc, CPU, ALL_LAYOUT, ops::FCOpKernel, float, double) {
}
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/operators/fc_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(fc,
ops::FCOpKernel<phi::GPUContext, phi::dtype::float16>,
ops::FCOpKernel<phi::GPUContext, float>,
ops::FCOpKernel<phi::GPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(
fc, GPU, ALL_LAYOUT, ops::FCOpKernel, float, double, phi::dtype::float16) {}
......@@ -51,7 +51,7 @@ inline void FCOutputSize(const framework::DDim& in_dims,
out_dims.push_back(w_dims1);
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FCOpKernel : public framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
......
......@@ -80,6 +80,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(FillZerosLikeOp2NoNeedBufferVarsInferer,
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like,
ops::FillZerosLikeOp,
ops::FillZerosLikeOpMaker);
......@@ -92,24 +94,26 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
fill_zeros_like,
ops::FillZerosLikeKernel<phi::CPUContext, int>,
ops::FillZerosLikeKernel<phi::CPUContext, int64_t>,
ops::FillZerosLikeKernel<phi::CPUContext, float>,
ops::FillZerosLikeKernel<phi::CPUContext, double>,
ops::FillZerosLikeKernel<phi::CPUContext, bool>,
ops::FillZerosLikeKernel<phi::CPUContext, paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<phi::CPUContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
fill_zeros_like2,
ops::FillZerosLikeKernel<phi::CPUContext, int>,
ops::FillZerosLikeKernel<phi::CPUContext, int64_t>,
ops::FillZerosLikeKernel<phi::CPUContext, float>,
ops::FillZerosLikeKernel<phi::CPUContext, double>,
ops::FillZerosLikeKernel<phi::CPUContext, bool>,
ops::FillZerosLikeKernel<phi::CPUContext, paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<phi::CPUContext,
paddle::platform::complex<double>>);
PD_REGISTER_STRUCT_KERNEL(fill_zeros_like,
CPU,
ALL_LAYOUT,
ops::FillZerosLikeKernel,
int,
int64_t,
float,
double,
bool,
plat::complex<float>,
plat::complex<double>) {}
PD_REGISTER_STRUCT_KERNEL(fill_zeros_like2,
CPU,
ALL_LAYOUT,
ops::FillZerosLikeKernel2,
int,
int64_t,
float,
double,
bool,
plat::complex<float>,
plat::complex<double>) {}
......@@ -19,26 +19,30 @@ limitations under the License. */
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
fill_zeros_like,
ops::FillZerosLikeKernel<phi::GPUContext, int>,
ops::FillZerosLikeKernel<phi::GPUContext, int64_t>,
ops::FillZerosLikeKernel<phi::GPUContext, float>,
ops::FillZerosLikeKernel<phi::GPUContext, double>,
ops::FillZerosLikeKernel<phi::GPUContext, paddle::platform::float16>,
ops::FillZerosLikeKernel<phi::GPUContext, bool>,
ops::FillZerosLikeKernel<phi::GPUContext, paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<phi::GPUContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
fill_zeros_like2,
ops::FillZerosLikeKernel<phi::GPUContext, int>,
ops::FillZerosLikeKernel<phi::GPUContext, int64_t>,
ops::FillZerosLikeKernel<phi::GPUContext, float>,
ops::FillZerosLikeKernel<phi::GPUContext, double>,
ops::FillZerosLikeKernel<phi::GPUContext, paddle::platform::float16>,
ops::FillZerosLikeKernel<phi::GPUContext, bool>,
ops::FillZerosLikeKernel<phi::GPUContext, paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<phi::GPUContext,
paddle::platform::complex<double>>);
namespace plat = paddle::platform;
PD_REGISTER_STRUCT_KERNEL(fill_zeros_like,
GPU,
ALL_LAYOUT,
ops::FillZerosLikeKernel,
int,
int64_t,
float,
double,
plat::float16,
bool,
plat::complex<float>,
plat::complex<double>) {}
PD_REGISTER_STRUCT_KERNEL(fill_zeros_like2,
GPU,
ALL_LAYOUT,
ops::FillZerosLikeKernel2,
int,
int64_t,
float,
double,
plat::float16,
bool,
plat::complex<float>,
plat::complex<double>) {}
......@@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FillZerosLikeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
......@@ -33,5 +33,8 @@ class FillZerosLikeKernel : public framework::OpKernel<T> {
}
};
template <typename T, typename DeviceContext>
class FillZerosLikeKernel2 : public FillZerosLikeKernel<T, DeviceContext> {};
} // namespace operators
} // namespace paddle
......@@ -162,14 +162,20 @@ REGISTER_OPERATOR(filter_by_instag,
REGISTER_OPERATOR(filter_by_instag_grad, ops::FilterByInstagOpGrad);
REGISTER_OP_CPU_KERNEL(filter_by_instag,
ops::FilterByInstagKernel<float>,
ops::FilterByInstagKernel<double>,
ops::FilterByInstagKernel<int32_t>,
ops::FilterByInstagKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(filter_by_instag_grad,
ops::FilterByInstagGradKernel<float>,
ops::FilterByInstagGradKernel<double>,
ops::FilterByInstagGradKernel<int32_t>,
ops::FilterByInstagGradKernel<int64_t>);
PD_REGISTER_STRUCT_KERNEL(filter_by_instag,
CPU,
ALL_LAYOUT,
ops::FilterByInstagKernel,
float,
double,
int32_t,
int64_t) {}
PD_REGISTER_STRUCT_KERNEL(filter_by_instag_grad,
CPU,
ALL_LAYOUT,
ops::FilterByInstagGradKernel,
float,
double,
int32_t,
int64_t) {}
......@@ -325,7 +325,7 @@ __global__ void copy_grad_kernel(const size_t N,
#endif
template <typename T>
template <typename T, typename DeviceContext>
class FilterByInstagGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
......@@ -553,7 +553,7 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class FilterByInstagGradGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
......@@ -620,14 +620,20 @@ class FilterByInstagGradGPUKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(filter_by_instag,
ops::FilterByInstagGPUKernel<float>,
ops::FilterByInstagGPUKernel<double>,
ops::FilterByInstagGPUKernel<int32_t>,
ops::FilterByInstagGPUKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL(filter_by_instag_grad,
ops::FilterByInstagGradGPUKernel<float>,
ops::FilterByInstagGradGPUKernel<double>,
ops::FilterByInstagGradGPUKernel<int32_t>,
ops::FilterByInstagGradGPUKernel<int64_t>);
PD_REGISTER_STRUCT_KERNEL(filter_by_instag,
GPU,
ALL_LAYOUT,
ops::FilterByInstagGPUKernel,
float,
double,
int32_t,
int64_t) {}
PD_REGISTER_STRUCT_KERNEL(filter_by_instag_grad,
GPU,
ALL_LAYOUT,
ops::FilterByInstagGradGPUKernel,
float,
double,
int32_t,
int64_t) {}
......@@ -34,7 +34,7 @@ using SelectedRows = phi::SelectedRows;
template <typename T>
using Vector = phi::Vector<T>;
template <typename T>
template <typename T, typename DeviceContext>
class FilterByInstagKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
......@@ -191,7 +191,7 @@ class FilterByInstagKernel : public framework::OpKernel<T> {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class FilterByInstagGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
......
......@@ -164,9 +164,8 @@ REGISTER_OPERATOR(fsp,
ops::FSPGradOpMaker<paddle::framework::OpDesc>,
ops::FSPGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(fsp_grad, ops::FSPOpGrad);
REGISTER_OP_CPU_KERNEL(fsp,
ops::FSPOpKernel<phi::CPUContext, float>,
ops::FSPOpKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(fsp_grad,
ops::FSPGradOpKernel<phi::CPUContext, float>,
ops::FSPGradOpKernel<phi::CPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(
fsp, CPU, ALL_LAYOUT, ops::FSPOpKernel, float, double) {}
PD_REGISTER_STRUCT_KERNEL(
fsp_grad, CPU, ALL_LAYOUT, ops::FSPGradOpKernel, float, double) {}
......@@ -16,10 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(fsp,
ops::FSPOpKernel<phi::GPUContext, float>,
ops::FSPOpKernel<phi::GPUContext, double>);
REGISTER_OP_CUDA_KERNEL(fsp_grad,
ops::FSPGradOpKernel<phi::GPUContext, float>,
ops::FSPGradOpKernel<phi::GPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(
fsp, GPU, ALL_LAYOUT, ops::FSPOpKernel, float, double) {}
PD_REGISTER_STRUCT_KERNEL(
fsp_grad, GPU, ALL_LAYOUT, ops::FSPGradOpKernel, float, double) {}
......@@ -20,7 +20,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FSPOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
......@@ -64,7 +64,7 @@ class FSPOpKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FSPGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
......
......@@ -33,9 +33,11 @@ namespace platform = paddle::platform;
namespace op = paddle::operators;
USE_OP_ITSELF(batch_norm);
USE_OP_ITSELF(fused_bn_add_activation);
USE_OP_ITSELF(fused_bn_add_activation_grad);
PD_DECLARE_KERNEL(batch_norm, GPU, ALL_LAYOUT);
USE_CUDA_ONLY_OP(fused_bn_add_activation);
USE_CUDA_ONLY_OP(fused_bn_add_activation_grad);
PD_DECLARE_KERNEL(fused_bn_add_activation, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(fused_bn_add_activation_grad, GPU, ALL_LAYOUT);
template <typename T>
void InitRandomTensor(const std::vector<int64_t> &dims,
......
......@@ -75,7 +75,7 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT
#endif
}
template <typename T>
template <typename T, typename DeviceContext>
class FusedAttentionOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -402,7 +402,7 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class FusedAttentionGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -826,11 +826,18 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(fused_attention,
ops::FusedAttentionOpKernel<float>,
ops::FusedAttentionOpKernel<double>,
ops::FusedAttentionOpKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(fused_attention_grad,
ops::FusedAttentionGradKernel<float>,
ops::FusedAttentionGradKernel<double>,
ops::FusedAttentionGradKernel<plat::float16>);
PD_REGISTER_STRUCT_KERNEL(fused_attention,
GPU,
ALL_LAYOUT,
ops::FusedAttentionOpKernel,
float,
double,
plat::float16) {}
PD_REGISTER_STRUCT_KERNEL(fused_attention_grad,
GPU,
ALL_LAYOUT,
ops::FusedAttentionGradKernel,
float,
double,
plat::float16) {}
......@@ -25,7 +25,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
template <typename T, typename DeviceContext>
class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -91,7 +91,7 @@ class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel<T> {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -176,12 +176,18 @@ class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(fused_bias_dropout_residual_layer_norm,
ops::FusedBiasDropoutResidualLnOpKernel<float>,
ops::FusedBiasDropoutResidualLnOpKernel<double>,
ops::FusedBiasDropoutResidualLnOpKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
fused_bias_dropout_residual_layer_norm_grad,
ops::FusedBiasDropoutResidualLnGradKernel<float>,
ops::FusedBiasDropoutResidualLnGradKernel<double>,
ops::FusedBiasDropoutResidualLnGradKernel<plat::float16>);
PD_REGISTER_STRUCT_KERNEL(fused_bias_dropout_residual_layer_norm,
GPU,
ALL_LAYOUT,
ops::FusedBiasDropoutResidualLnOpKernel,
float,
double,
plat::float16) {}
PD_REGISTER_STRUCT_KERNEL(fused_bias_dropout_residual_layer_norm_grad,
GPU,
ALL_LAYOUT,
ops::FusedBiasDropoutResidualLnGradKernel,
float,
double,
plat::float16) {}
......@@ -36,10 +36,15 @@ template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T>
class FusedBatchNormActKernel<phi::GPUContext, T>
class FusedBatchNormActKernel<T, phi::GPUContext>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
#if CUDNN_VERSION < 7401
PADDLE_THROW(phi::errors::Unimplemented(
"The fused_batch_norm_act operator is not supported on GPU "
"when CUDNN version < 7.4.1"));
#endif
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()),
true,
......@@ -231,10 +236,15 @@ class FusedBatchNormActKernel<phi::GPUContext, T>
};
template <typename T>
class FusedBatchNormActGradKernel<phi::GPUContext, T>
class FusedBatchNormActGradKernel<T, phi::GPUContext>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
#if CUDNN_VERSION < 7401
PADDLE_THROW(phi::errors::Unimplemented(
"The fused_batch_norm_act operator is not supported on GPU "
"when CUDNN version < 7.4.1"));
#endif
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()),
true,
......@@ -415,17 +425,19 @@ class FusedBatchNormActGradKernel<phi::GPUContext, T>
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 7401
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
fused_batch_norm_act,
ops::FusedBatchNormActKernel<phi::GPUContext, float>,
ops::FusedBatchNormActKernel<phi::GPUContext, double>,
ops::FusedBatchNormActKernel<phi::GPUContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
fused_batch_norm_act_grad,
ops::FusedBatchNormActGradKernel<phi::GPUContext, float>,
ops::FusedBatchNormActGradKernel<phi::GPUContext, double>,
ops::FusedBatchNormActGradKernel<phi::GPUContext, plat::float16>);
#endif
PD_REGISTER_STRUCT_KERNEL(fused_batch_norm_act,
GPU,
ALL_LAYOUT,
ops::FusedBatchNormActKernel,
float,
double,
plat::float16) {}
PD_REGISTER_STRUCT_KERNEL(fused_batch_norm_act_grad,
GPU,
ALL_LAYOUT,
ops::FusedBatchNormActGradKernel,
float,
double,
plat::float16) {}
......@@ -88,13 +88,13 @@ class FusedBatchNormActOpInferVarType
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusedBatchNormActKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusedBatchNormActGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
......
......@@ -36,10 +36,15 @@ template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T>
class FusedBatchNormAddActKernel<phi::GPUContext, T>
class FusedBatchNormAddActKernel<T, phi::GPUContext>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
#if CUDNN_VERSION < 7401
PADDLE_THROW(phi::errors::Unimplemented(
"The fused_bn_add_activation operator is not supported on GPU "
"when CUDNN version < 7.4.1"));
#endif
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()),
true,
......@@ -208,10 +213,15 @@ class FusedBatchNormAddActKernel<phi::GPUContext, T>
};
template <typename T>
class FusedBatchNormAddActGradKernel<phi::GPUContext, T>
class FusedBatchNormAddActGradKernel<T, phi::GPUContext>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
#if CUDNN_VERSION < 7401
PADDLE_THROW(phi::errors::Unimplemented(
"The fused_bn_add_activation operator is not supported on GPU "
"when CUDNN version < 7.4.1"));
#endif
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()),
true,
......@@ -362,13 +372,15 @@ class FusedBatchNormAddActGradKernel<phi::GPUContext, T>
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 7401
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
fused_bn_add_activation,
ops::FusedBatchNormAddActKernel<phi::GPUContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
fused_bn_add_activation_grad,
ops::FusedBatchNormAddActGradKernel<phi::GPUContext, plat::float16>);
#endif
PD_REGISTER_STRUCT_KERNEL(fused_bn_add_activation,
GPU,
ALL_LAYOUT,
ops::FusedBatchNormAddActKernel,
plat::float16) {}
PD_REGISTER_STRUCT_KERNEL(fused_bn_add_activation_grad,
GPU,
ALL_LAYOUT,
ops::FusedBatchNormAddActGradKernel,
plat::float16) {}
......@@ -89,13 +89,13 @@ class FusedBatchNormAddActOpInferVarType
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusedBatchNormAddActKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusedBatchNormAddActGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
......
......@@ -461,15 +461,19 @@ REGISTER_OPERATOR(
REGISTER_OPERATOR(fused_elemwise_activation_grad,
ops::FusedElemwiseActivationOpGrad);
REGISTER_OP_CPU_KERNEL(
fused_elemwise_activation,
ops::FusedElemwiseActivationKernel<phi::CPUContext, float>,
ops::FusedElemwiseActivationKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(
fused_elemwise_activation_grad,
ops::FusedElemwiseActivationGradKernel<phi::CPUContext, float>,
ops::FusedElemwiseActivationGradKernel<phi::CPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(fused_elemwise_activation,
CPU,
ALL_LAYOUT,
ops::FusedElemwiseActivationKernel,
float,
double) {}
PD_REGISTER_STRUCT_KERNEL(fused_elemwise_activation_grad,
CPU,
ALL_LAYOUT,
ops::FusedElemwiseActivationGradKernel,
float,
double) {}
// for memory optimization, we register the fused_elemwise_add_activation OP
REGISTER_OPERATOR(
......@@ -482,12 +486,16 @@ REGISTER_OPERATOR(fused_elemwise_add_activation_grad,
ops::FusedElemwiseAddActivationNoNeddBufVarInferer,
ops::FusedElemwiseAddActivationOpGrad);
REGISTER_OP_CPU_KERNEL(
fused_elemwise_add_activation,
ops::FusedElemwiseActivationKernel<phi::CPUContext, float>,
ops::FusedElemwiseActivationKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(
fused_elemwise_add_activation_grad,
ops::FusedElemwiseActivationGradKernel<phi::CPUContext, float>,
ops::FusedElemwiseActivationGradKernel<phi::CPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation,
CPU,
ALL_LAYOUT,
ops::FusedElemwiseAddActivationKernel,
float,
double) {}
PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation_grad,
CPU,
ALL_LAYOUT,
ops::FusedElemwiseAddActivationGradKernel,
float,
double) {}
......@@ -15,30 +15,34 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
fused_elemwise_activation,
ops::FusedElemwiseActivationKernel<phi::GPUContext, float>,
ops::FusedElemwiseActivationKernel<phi::GPUContext, double>,
ops::FusedElemwiseActivationKernel<phi::GPUContext,
paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
fused_elemwise_activation_grad,
ops::FusedElemwiseActivationGradKernel<phi::GPUContext, float>,
ops::FusedElemwiseActivationGradKernel<phi::GPUContext, double>,
ops::FusedElemwiseActivationGradKernel<phi::GPUContext,
paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
fused_elemwise_add_activation,
ops::FusedElemwiseActivationKernel<phi::GPUContext, float>,
ops::FusedElemwiseActivationKernel<phi::GPUContext, double>,
ops::FusedElemwiseActivationKernel<phi::GPUContext,
paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
fused_elemwise_add_activation_grad,
ops::FusedElemwiseActivationGradKernel<phi::GPUContext, float>,
ops::FusedElemwiseActivationGradKernel<phi::GPUContext, double>,
ops::FusedElemwiseActivationGradKernel<phi::GPUContext,
paddle::platform::float16>);
namespace plat = paddle::platform;
PD_REGISTER_STRUCT_KERNEL(fused_elemwise_activation,
GPU,
ALL_LAYOUT,
ops::FusedElemwiseActivationKernel,
float,
double,
plat::float16) {}
PD_REGISTER_STRUCT_KERNEL(fused_elemwise_activation_grad,
GPU,
ALL_LAYOUT,
ops::FusedElemwiseActivationGradKernel,
float,
double,
plat::float16) {}
PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation,
GPU,
ALL_LAYOUT,
ops::FusedElemwiseAddActivationKernel,
float,
double,
plat::float16) {}
PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation_grad,
GPU,
ALL_LAYOUT,
ops::FusedElemwiseAddActivationGradKernel,
float,
double,
plat::float16) {}
......@@ -616,7 +616,7 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
}
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -655,7 +655,7 @@ class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -765,5 +765,14 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
}
}
};
template <typename T, typename DeviceContext>
class FusedElemwiseAddActivationKernel
: public FusedElemwiseActivationKernel<T, DeviceContext> {};
template <typename T, typename DeviceContext>
class FusedElemwiseAddActivationGradKernel
: public FusedElemwiseActivationGradKernel<T, DeviceContext> {};
} // namespace operators
} // namespace paddle
......@@ -29,7 +29,7 @@
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
......@@ -145,14 +145,18 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000
REGISTER_OP_CUDA_KERNEL(
fused_embedding_eltwise_layernorm,
ops::EmbeddingEltWiseLayerNormKernel<phi::GPUContext, float>,
ops::EmbeddingEltWiseLayerNormKernel<phi::GPUContext,
paddle::platform::float16>);
PD_REGISTER_STRUCT_KERNEL(fused_embedding_eltwise_layernorm,
GPU,
ALL_LAYOUT,
ops::EmbeddingEltWiseLayerNormKernel,
float,
plat::float16) {}
#else
REGISTER_OP_CUDA_KERNEL(
fused_embedding_eltwise_layernorm,
ops::EmbeddingEltWiseLayerNormKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(fused_embedding_eltwise_layernorm,
GPU,
ALL_LAYOUT,
ops::EmbeddingEltWiseLayerNormKernel,
float) {}
#endif
......@@ -270,7 +270,7 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
)DOC");
}
template <typename T>
template <typename T, typename DeviceContext>
class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
public:
#define INIT_VEC_FUNC \
......@@ -396,7 +396,6 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
GET_Ht(ct, gates, ht)
void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = phi::CPUContext;
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC
......@@ -502,7 +501,6 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
}
void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = phi::CPUContext;
INIT_BASE_INPUT_OUTPUT
if (ids->lod()[0].size() == 2) {
SeqCompute(ctx);
......@@ -682,6 +680,9 @@ REGISTER_OPERATOR(fused_embedding_fc_lstm,
ops::FusedEmbeddingFCLSTMOp,
ops::FusedEmbeddingFCLSTMOpMaker);
REGISTER_OP_CPU_KERNEL(fused_embedding_fc_lstm,
ops::FusedEmbeddingFCLSTMKernel<float>,
ops::FusedEmbeddingFCLSTMKernel<double>);
PD_REGISTER_STRUCT_KERNEL(fused_embedding_fc_lstm,
CPU,
ALL_LAYOUT,
ops::FusedEmbeddingFCLSTMKernel,
float,
double) {}
......@@ -201,9 +201,15 @@ REGISTER_OPERATOR(fused_embedding_seq_pool_grad,
ops::FusedEmbeddingSeqPoolOpGrad,
ops::FusedEmbeddingSeqPoolOpGradVarTypeInference);
REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool,
ops::FusedEmbeddingSeqPoolKernel<float>,
ops::FusedEmbeddingSeqPoolKernel<double>);
REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad,
ops::FusedEmbeddingSeqPoolGradKernel<float>,
ops::FusedEmbeddingSeqPoolGradKernel<double>);
PD_REGISTER_STRUCT_KERNEL(fused_embedding_seq_pool,
CPU,
ALL_LAYOUT,
ops::FusedEmbeddingSeqPoolKernel,
float,
double) {}
PD_REGISTER_STRUCT_KERNEL(fused_embedding_seq_pool_grad,
CPU,
ALL_LAYOUT,
ops::FusedEmbeddingSeqPoolGradKernel,
float,
double) {}
......@@ -135,7 +135,7 @@ inline int FusedEmbeddingSeqPoolLastDim(const framework::DDim &table_dims,
return last_dim;
}
template <typename T>
template <typename T, typename DeviceContext>
class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
......@@ -224,7 +224,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
......
......@@ -374,7 +374,7 @@ void AddReluAddLayerNorm(gpuStream_t stream,
}
}
template <typename T>
template <typename T, typename DeviceContext>
class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......@@ -449,8 +449,12 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
fused_fc_elementwise_layernorm,
ops::FusedFCElementwiseLayerNormOpKernel<phi::dtype::float16>,
ops::FusedFCElementwiseLayerNormOpKernel<float>,
ops::FusedFCElementwiseLayerNormOpKernel<double>);
namespace plat = paddle::platform;
PD_REGISTER_STRUCT_KERNEL(fused_fc_elementwise_layernorm,
GPU,
ALL_LAYOUT,
ops::FusedFCElementwiseLayerNormOpKernel,
float,
double,
plat::float16) {}
......@@ -65,7 +65,7 @@ static void AllReduce(phi::DenseTensor& tensor, // NOLINT
#endif
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusedFeedForwardKernel : public framework::OpKernel<T> {
public:
void MatMul(const phi::GPUContext& ctx,
......@@ -301,7 +301,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
public:
void MatMulGrad(const phi::GPUContext& ctx,
......@@ -628,14 +628,19 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
fused_feedforward,
ops::FusedFeedForwardKernel<phi::GPUContext, float>,
ops::FusedFeedForwardKernel<phi::GPUContext, double>,
ops::FusedFeedForwardKernel<phi::GPUContext, paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
fused_feedforward_grad,
ops::FusedFeedForwardGradKernel<phi::GPUContext, float>,
ops::FusedFeedForwardGradKernel<phi::GPUContext, double>,
ops::FusedFeedForwardGradKernel<phi::GPUContext,
paddle::platform::float16>);
namespace plat = paddle::platform;
PD_REGISTER_STRUCT_KERNEL(fused_feedforward,
GPU,
ALL_LAYOUT,
ops::FusedFeedForwardKernel,
float,
double,
plat::float16) {}
PD_REGISTER_STRUCT_KERNEL(fused_feedforward_grad,
GPU,
ALL_LAYOUT,
ops::FusedFeedForwardGradKernel,
float,
double,
plat::float16) {}
......@@ -354,7 +354,7 @@ void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
use_fused_matmul_bias);
}
template <typename T>
template <typename T, typename DeviceContext>
class FusedGateAttentionOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -446,7 +446,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel<T> {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -565,23 +565,35 @@ class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
REGISTER_OP_CUDA_KERNEL(fused_gate_attention,
ops::FusedGateAttentionOpKernel<float>,
ops::FusedGateAttentionOpKernel<plat::float16>,
ops::FusedGateAttentionOpKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(fused_gate_attention_grad,
ops::FusedGateAttentionGradKernel<float>,
ops::FusedGateAttentionGradKernel<plat::float16>,
ops::FusedGateAttentionGradKernel<plat::bfloat16>);
PD_REGISTER_STRUCT_KERNEL(fused_gate_attention,
GPU,
ALL_LAYOUT,
ops::FusedGateAttentionOpKernel,
float,
plat::float16,
plat::bfloat16) {}
PD_REGISTER_STRUCT_KERNEL(fused_gate_attention_grad,
GPU,
ALL_LAYOUT,
ops::FusedGateAttentionGradKernel,
float,
plat::float16,
plat::bfloat16) {}
#else
REGISTER_OP_CUDA_KERNEL(fused_gate_attention,
ops::FusedGateAttentionOpKernel<float>,
ops::FusedGateAttentionOpKernel<double>,
ops::FusedGateAttentionOpKernel<plat::float16>,
ops::FusedGateAttentionOpKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(fused_gate_attention_grad,
ops::FusedGateAttentionGradKernel<float>,
ops::FusedGateAttentionGradKernel<double>,
ops::FusedGateAttentionGradKernel<plat::float16>,
ops::FusedGateAttentionGradKernel<plat::bfloat16>);
PD_REGISTER_STRUCT_KERNEL(fused_gate_attention,
GPU,
ALL_LAYOUT,
ops::FusedGateAttentionOpKernel,
float,
double,
plat::float16,
plat::bfloat16) {}
PD_REGISTER_STRUCT_KERNEL(fused_gate_attention_grad,
GPU,
ALL_LAYOUT,
ops::FusedGateAttentionGradKernel,
float,
double,
plat::float16,
plat::bfloat16) {}
#endif
......@@ -61,10 +61,15 @@ phi::funcs::MatmulFusedType GetFwdFusedEpilogueType(
return fused_type;
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if CUDA_VERSION < 11060
PADDLE_THROW(phi::errors::Unimplemented(
"The fused_gemm_epilogue operator only support CUDA 11.6 "
"or higher version."));
#endif
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
const phi::DenseTensor* x = ctx.Input<phi::DenseTensor>("X");
......@@ -119,10 +124,15 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if CUDA_VERSION < 11060
PADDLE_THROW(phi::errors::Unimplemented(
"The fused_gemm_epilogue operator only support CUDA 11.6 "
"or higher version."));
#endif
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
const phi::DenseTensor* dout = ctx.Input<phi::DenseTensor>("DOut");
......@@ -172,21 +182,21 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
#if CUDA_VERSION >= 11060
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
fused_gemm_epilogue,
ops::FusedGemmEpilogueKernel<phi::GPUContext, float>,
ops::FusedGemmEpilogueKernel<phi::GPUContext, double>,
ops::FusedGemmEpilogueKernel<phi::GPUContext, paddle::platform::float16>,
ops::FusedGemmEpilogueKernel<phi::GPUContext, paddle::platform::bfloat16>);
REGISTER_OP_CUDA_KERNEL(
fused_gemm_epilogue_grad,
ops::FusedGemmEpilogueGradKernel<phi::GPUContext, float>,
ops::FusedGemmEpilogueGradKernel<phi::GPUContext, double>,
ops::FusedGemmEpilogueGradKernel<phi::GPUContext,
paddle::platform::float16>,
ops::FusedGemmEpilogueGradKernel<phi::GPUContext,
paddle::platform::bfloat16>);
#endif
namespace plat = paddle::platform;
PD_REGISTER_STRUCT_KERNEL(fused_gemm_epilogue,
GPU,
ALL_LAYOUT,
ops::FusedGemmEpilogueKernel,
float,
double,
plat::float16,
plat::bfloat16) {}
PD_REGISTER_STRUCT_KERNEL(fused_gemm_epilogue_grad,
GPU,
ALL_LAYOUT,
ops::FusedGemmEpilogueGradKernel,
float,
double,
plat::float16,
plat::bfloat16) {}
......@@ -18,7 +18,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
template <typename T, typename DeviceContext>
class FusedMultiTransformerINT8OpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -662,6 +662,9 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(fused_multi_transformer_int8,
ops::FusedMultiTransformerINT8OpKernel<plat::float16>,
ops::FusedMultiTransformerINT8OpKernel<float>);
PD_REGISTER_STRUCT_KERNEL(fused_multi_transformer_int8,
GPU,
ALL_LAYOUT,
ops::FusedMultiTransformerINT8OpKernel,
float,
plat::float16) {}
......@@ -19,7 +19,7 @@ namespace operators {
#if CUDA_VERSION >= 11060 // Use cublasLt to fuse FFN operation.
template <typename T>
template <typename T, typename DeviceContext>
class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -685,7 +685,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
#else
template <typename T>
template <typename T, typename DeviceContext>
class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -1370,6 +1370,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(fused_multi_transformer,
ops::FusedMultiTransformerOpKernel<plat::float16>,
ops::FusedMultiTransformerOpKernel<float>);
PD_REGISTER_STRUCT_KERNEL(fused_multi_transformer,
GPU,
ALL_LAYOUT,
ops::FusedMultiTransformerOpKernel,
float,
plat::float16) {}
......@@ -290,7 +290,13 @@ REGISTER_OPERATOR(fused_seqpool_cvm,
ops::FusedSeqpoolCVMGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(fused_seqpool_cvm_grad, ops::FusedSeqpoolCVMGradOp)
REGISTER_OP_CPU_KERNEL(fused_seqpool_cvm,
ops::FusedSeqpoolCVMOpCPUKernel<float>)
REGISTER_OP_CPU_KERNEL(fused_seqpool_cvm_grad,
ops::FusedSeqpoolCVMGradOpCPUKernel<float>)
PD_REGISTER_STRUCT_KERNEL(fused_seqpool_cvm,
CPU,
ALL_LAYOUT,
ops::FusedSeqpoolCVMOpCPUKernel,
float) {}
PD_REGISTER_STRUCT_KERNEL(fused_seqpool_cvm_grad,
CPU,
ALL_LAYOUT,
ops::FusedSeqpoolCVMGradOpCPUKernel,
float) {}
......@@ -420,7 +420,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
}
}
template <typename T>
template <typename T, typename DeviceContext>
class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -505,7 +505,7 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel<T> {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
......@@ -588,8 +588,11 @@ class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(fused_seqpool_cvm,
ops::FusedSeqpoolCVMCUDAKernel<float>);
REGISTER_OP_CUDA_KERNEL(fused_seqpool_cvm_grad,
ops::FusedSeqpoolCVMGradCUDAKernel<float>);
PD_REGISTER_STRUCT_KERNEL(
fused_seqpool_cvm, GPU, ALL_LAYOUT, ops::FusedSeqpoolCVMCUDAKernel, float) {
}
PD_REGISTER_STRUCT_KERNEL(fused_seqpool_cvm_grad,
GPU,
ALL_LAYOUT,
ops::FusedSeqpoolCVMGradCUDAKernel,
float) {}
......@@ -23,7 +23,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
template <typename T, typename DeviceContext>
class FusedSeqpoolCVMOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......@@ -33,7 +33,7 @@ class FusedSeqpoolCVMOpCPUKernel : public framework::OpKernel<T> {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class FusedSeqpoolCVMGradOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
......@@ -34,10 +34,15 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
template <typename T, typename DeviceContext>
class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if CUDNN_VERSION < 7100
PADDLE_THROW(phi::errors::Unimplemented(
"The conv2d_inception_fusion operator is not supported on GPU "
"when CUDNN version < 7.1.0"));
#endif
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto filters = ctx.MultiInput<phi::DenseTensor>("Filter");
......@@ -336,9 +341,10 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 7100
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion,
ops::CUDNNConvInceptionFusionOpKernel<float>,
ops::CUDNNConvInceptionFusionOpKernel<double>);
#endif
PD_REGISTER_STRUCT_KERNEL(conv2d_inception_fusion,
GPU,
ALL_LAYOUT,
ops::CUDNNConvInceptionFusionOpKernel,
float,
double) {}
......@@ -18,7 +18,10 @@ limitations under the License. */
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(fusion_group,
ops::FusionGroupKernel<phi::GPUContext, float>,
ops::FusionGroupKernel<phi::GPUContext, double>,
ops::FusionGroupKernel<phi::GPUContext, plat::float16>);
PD_REGISTER_STRUCT_KERNEL(fusion_group,
GPU,
ALL_LAYOUT,
ops::FusionGroupKernel,
float,
double,
plat::float16) {}
......@@ -42,7 +42,7 @@ static void MutableMultiTypeData(std::vector<phi::DenseTensor*>* var,
}
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FusionGroupKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
......@@ -234,4 +234,5 @@ void elementwise_cuda_kernel_0(size_t n, float *x, float* y, float* z) {
} // namespace operators
} // namespace paddle
USE_CUDA_ONLY_OP(fusion_group);
USE_OP_ITSELF(fusion_group);
PD_DECLARE_KERNEL(fusion_group, GPU, ALL_LAYOUT);
......@@ -249,7 +249,7 @@ more details can refer to GRU op.
)DOC");
}
template <typename T>
template <typename T, typename DeviceContext>
class FusionGRUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......@@ -303,7 +303,6 @@ class FusionGRUKernel : public framework::OpKernel<T> {
T* xx_data = xx->mutable_data<T>(place)
void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = phi::CPUContext;
INIT_BASE_DEFINES;
INIT_OTHER_DEFINES;
const int N = x_lod[0].size() - 1;
......@@ -394,7 +393,6 @@ class FusionGRUKernel : public framework::OpKernel<T> {
}
void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = phi::CPUContext;
INIT_BASE_DEFINES;
if (x_lod[0].size() == 2) {
xx->Resize({total_T, D3});
......@@ -551,9 +549,8 @@ class FusionGRUKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker);
REGISTER_OP_CPU_KERNEL(fusion_gru,
ops::FusionGRUKernel<float>,
ops::FusionGRUKernel<double>);
PD_REGISTER_STRUCT_KERNEL(
fusion_gru, CPU, ALL_LAYOUT, ops::FusionGRUKernel, float, double) {}
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(fusion_gru)
......
......@@ -298,11 +298,10 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
)DOC");
}
template <typename T>
template <typename T, typename DeviceContext>
class FuisonLSTMKernel : public framework::OpKernel<T> {
public:
#define INIT_BASE_DEFINES \
using DeviceContext = phi::CPUContext; \
auto* x = ctx.Input<phi::DenseTensor>("X"); \
auto* h0 = ctx.Input<phi::DenseTensor>("H0"); \
auto* c0 = ctx.Input<phi::DenseTensor>("C0"); \
......@@ -580,6 +579,5 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker);
REGISTER_OP_CPU_KERNEL(fusion_lstm,
ops::FuisonLSTMKernel<float>,
ops::FuisonLSTMKernel<double>);
PD_REGISTER_STRUCT_KERNEL(
fusion_lstm, CPU, ALL_LAYOUT, ops::FuisonLSTMKernel, float, double) {}
......@@ -141,7 +141,7 @@ static void fc_relu(const T* x,
}
}
template <typename T>
template <typename T, typename DeviceContext>
class FusionRepeatedFCReluKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......@@ -201,6 +201,9 @@ REGISTER_OPERATOR(fusion_repeated_fc_relu,
ops::FusionRepeatedFCReluOp,
ops::FusionRepeatedFCReluOpMaker);
REGISTER_OP_CPU_KERNEL(fusion_repeated_fc_relu,
ops::FusionRepeatedFCReluKernel<float>,
ops::FusionRepeatedFCReluKernel<double>);
PD_REGISTER_STRUCT_KERNEL(fusion_repeated_fc_relu,
CPU,
ALL_LAYOUT,
ops::FusionRepeatedFCReluKernel,
float,
double) {}
......@@ -148,11 +148,10 @@ Fusion Sequence Conv and ElementwiseAdd Operator.
)DOC");
}
template <typename T>
template <typename T, typename DeviceContext>
class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using DeviceContext = phi::CPUContext;
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* w = ctx.Input<phi::DenseTensor>("Filter");
auto* b = ctx.Input<phi::DenseTensor>("Bias");
......@@ -283,6 +282,9 @@ REGISTER_OPERATOR(fusion_seqconv_eltadd_relu,
ops::FusionSeqConvEltAddReluOp,
ops::FusionSeqConvEltAddReluOpMaker);
REGISTER_OP_CPU_KERNEL(fusion_seqconv_eltadd_relu,
ops::FusionSeqConvEltAddReluKernel<float>,
ops::FusionSeqConvEltAddReluKernel<double>);
PD_REGISTER_STRUCT_KERNEL(fusion_seqconv_eltadd_relu,
CPU,
ALL_LAYOUT,
ops::FusionSeqConvEltAddReluKernel,
float,
double) {}
......@@ -147,11 +147,10 @@ The concat axis should be 1.
)DOC");
}
template <typename T>
template <typename T, typename DeviceContext>
class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using DeviceContext = phi::CPUContext;
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
auto* w = ctx.Input<phi::DenseTensor>("FCWeight");
auto* b = ctx.Input<phi::DenseTensor>("FCBias");
......@@ -295,6 +294,9 @@ REGISTER_OPERATOR(fusion_seqexpand_concat_fc,
ops::FusionSeqExpandConcatFCOp,
ops::FusionSeqExpandConcatFCOpMaker);
REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc,
ops::FusionSeqExpandConcatFCOpKernel<float>,
ops::FusionSeqExpandConcatFCOpKernel<double>);
PD_REGISTER_STRUCT_KERNEL(fusion_seqexpand_concat_fc,
CPU,
ALL_LAYOUT,
ops::FusionSeqExpandConcatFCOpKernel,
float,
double) {}
......@@ -92,7 +92,7 @@ Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator.
)DOC");
}
template <typename T>
template <typename T, typename DeviceContext>
class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......@@ -173,6 +173,9 @@ REGISTER_OPERATOR(fusion_seqpool_concat,
ops::FusionSeqPoolConcatOp,
ops::FusionSeqPoolConcatOpMaker);
REGISTER_OP_CPU_KERNEL(fusion_seqpool_concat,
ops::FusionSeqPoolConcatKernel<float>,
ops::FusionSeqPoolConcatKernel<double>);
PD_REGISTER_STRUCT_KERNEL(fusion_seqpool_concat,
CPU,
ALL_LAYOUT,
ops::FusionSeqPoolConcatKernel,
float,
double) {}
......@@ -96,7 +96,7 @@ Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator.
)DOC");
}
template <typename T>
template <typename T, typename DeviceContext>
class FusionSeqPoolCVMConcatKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......@@ -172,6 +172,9 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(fusion_seqpool_cvm_concat,
ops::FusionSeqPoolCVMConcatKernel<float>,
ops::FusionSeqPoolCVMConcatKernel<double>);
PD_REGISTER_STRUCT_KERNEL(fusion_seqpool_cvm_concat,
CPU,
ALL_LAYOUT,
ops::FusionSeqPoolCVMConcatKernel,
float,
double) {}
......@@ -84,7 +84,7 @@ void FusionSquaredMatSubOpMaker::Make() {
)DOC");
}
template <typename T>
template <typename T, typename DeviceContext>
class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......@@ -151,6 +151,9 @@ REGISTER_OPERATOR(fusion_squared_mat_sub,
ops::FusionSquaredMatSubOp,
ops::FusionSquaredMatSubOpMaker);
REGISTER_OP_CPU_KERNEL(fusion_squared_mat_sub,
ops::FusionSquaredMatSubKernel<float>,
ops::FusionSquaredMatSubKernel<double>);
PD_REGISTER_STRUCT_KERNEL(fusion_squared_mat_sub,
CPU,
ALL_LAYOUT,
ops::FusionSquaredMatSubKernel,
float,
double) {}
......@@ -24,7 +24,7 @@ namespace operators {
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
template <typename T, typename DeviceContext>
class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......@@ -119,6 +119,10 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(fusion_transpose_flatten_concat,
ops::TransposeFlattenConcatFusionKernel<float>,
ops::TransposeFlattenConcatFusionKernel<double>);
PD_REGISTER_STRUCT_KERNEL(fusion_transpose_flatten_concat,
GPU,
ALL_LAYOUT,
ops::TransposeFlattenConcatFusionKernel,
float,
double) {}
......@@ -102,7 +102,10 @@ REGISTER_OPERATOR(
ops::SoftmaxMaskFuseUpperTriangleGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(fused_softmax_mask_upper_triangle_grad,
ops::SoftmaxMaskFuseUpperTriangleOpGrad);
REGISTER_OP_CPU_KERNEL(
fused_softmax_mask_upper_triangle,
ops::SoftmaxMaskFuseUpperTriangleCPUKernel<phi::CPUContext, float>,
ops::SoftmaxMaskFuseUpperTriangleCPUKernel<phi::CPUContext, double>);
PD_REGISTER_STRUCT_KERNEL(fused_softmax_mask_upper_triangle,
CPU,
ALL_LAYOUT,
ops::SoftmaxMaskFuseUpperTriangleCPUKernel,
float,
double) {}
......@@ -354,7 +354,7 @@ __global__ void SoftmaxMaskFuseUpperTriangleGradGPUKernel(const T* grad_input,
}
}
template <typename Place, typename T>
template <typename T, typename DeviceContext>
class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
......@@ -386,7 +386,8 @@ class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel<T> {
"received the last dimension of x is %d",
key_seq_len));
auto& place = *context.template device_context<Place>().eigen_device();
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto stream = context.cuda_device_context().stream();
int pow2_index = get_pow2_index_value(key_seq_len);
......@@ -470,7 +471,7 @@ class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel<T> {
}
};
template <typename Place, typename T>
template <typename T, typename DeviceContext>
class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
......@@ -491,7 +492,8 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel<T> {
auto query_seq_len = y_dim[2];
auto key_seq_len = y_dim[3];
auto& place = *context.template device_context<Place>().eigen_device();
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto stream = context.cuda_device_context().stream();
int pow2_index = get_pow2_index_value(key_seq_len);
......@@ -602,14 +604,18 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
fused_softmax_mask_upper_triangle,
ops::SoftmaxMaskFuseUpperTriangleKernel<phi::GPUContext, plat::float16>,
ops::SoftmaxMaskFuseUpperTriangleKernel<phi::GPUContext, plat::bfloat16>,
ops::SoftmaxMaskFuseUpperTriangleKernel<phi::GPUContext, float>);
REGISTER_OP_CUDA_KERNEL(
fused_softmax_mask_upper_triangle_grad,
ops::SoftmaxMaskFuseUpperTriangleGradKernel<phi::GPUContext, plat::float16>,
ops::SoftmaxMaskFuseUpperTriangleGradKernel<phi::GPUContext,
plat::bfloat16>,
ops::SoftmaxMaskFuseUpperTriangleGradKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(fused_softmax_mask_upper_triangle,
GPU,
ALL_LAYOUT,
ops::SoftmaxMaskFuseUpperTriangleKernel,
float,
plat::float16,
plat::bfloat16) {}
PD_REGISTER_STRUCT_KERNEL(fused_softmax_mask_upper_triangle_grad,
GPU,
ALL_LAYOUT,
ops::SoftmaxMaskFuseUpperTriangleGradKernel,
float,
plat::float16,
plat::bfloat16) {}
......@@ -17,7 +17,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class SoftmaxMaskFuseUpperTriangleCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
......@@ -79,7 +79,7 @@ __global__ void MaximumFirst(T* mat, int num_raws, int num_cols, T max_value) {
}
}
template <typename T>
template <typename T, typename DeviceContext>
class FusedTokenPruneOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
......@@ -283,6 +283,9 @@ class FusedTokenPruneOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(fused_token_prune,
ops::FusedTokenPruneOpCUDAKernel<float>,
ops::FusedTokenPruneOpCUDAKernel<double>);
PD_REGISTER_STRUCT_KERNEL(fused_token_prune,
GPU,
ALL_LAYOUT,
ops::FusedTokenPruneOpCUDAKernel,
float,
double) {}
......@@ -156,4 +156,5 @@ The paper that proposed Follow The Regularized Leader (FTRL):
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker);
REGISTER_OP_CPU_KERNEL(ftrl, ops::FTRLOpKernel<phi::CPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(ftrl, CPU, ALL_LAYOUT, ops::FTRLOpKernel, float) {}
......@@ -13,4 +13,4 @@ specific language governing permissions and limitations under the License. */
#include "paddle/fluid/operators/optimizers/ftrl_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(ftrl, ops::FTRLOpKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(ftrl, GPU, ALL_LAYOUT, ops::FTRLOpKernel, float) {}
......@@ -113,7 +113,7 @@ class SparseFTRLFunctor {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class FTRLOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
......@@ -541,4 +541,5 @@ REGISTER_OPERATOR(faster_tokenizer,
ops::FasterTokenizerOp,
ops::FasterTokenizerOpMaker);
REGISTER_OP_CPU_KERNEL(faster_tokenizer, ops::FasterTokenizerKernel<int64_t>);
PD_REGISTER_STRUCT_KERNEL(
faster_tokenizer, CPU, ALL_LAYOUT, ops::FasterTokenizerKernel, int64_t) {}
......@@ -122,7 +122,7 @@ class BertTokenizer {
InvVocab inv_vocab_;
};
template <typename T>
template <typename T, typename DeviceContext>
class FasterTokenizerKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......