diff --git a/paddle/fluid/operators/deformable_conv_op_xpu.cc b/paddle/fluid/operators/deformable_conv_op_xpu.cc deleted file mode 100644 index 5800e91e990fc051733433b6094944b87dded038..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/deformable_conv_op_xpu.cc +++ /dev/null @@ -1,338 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_XPU -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/xpu/xpu_header.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class DeformableConvXPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* offset = ctx.Input("Offset"); - auto* mask = ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - PADDLE_ENFORCE_EQ( - deformable_groups == 1, - true, - platform::errors::InvalidArgument(( - "XPU only support deformable_groups == 1 in deformable_conv op."))); - PADDLE_ENFORCE_EQ( - groups == 1, - true, - platform::errors::InvalidArgument( - ("XPU only support groups == 1 in deformable_conv op."))); - PADDLE_ENFORCE_EQ(filter.dims()[2] <= 8 && filter.dims()[3] <= 8, - true, - platform::errors::InvalidArgument( - "Filter high and weight should less than 8 on xpu " - "in deformable_conv op.")); - - const int batch_size = static_cast(input->dims()[0]); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - const T* input_ptr = input->data(); - const T* filter_ptr = filter.data(); - const float* offset_ptr = offset->data(); - const float* mask_ptr = mask->data(); - T* output_prt = output->data(); - - // set zeros for d_table_data - const int zero = 0; - int r = xpu::constant( - dev_ctx.x_context(), output_prt, output->numel(), zero); - PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, - true, - platform::errors::External( - "XPU API return wrong value[%d], please check where " - "Baidu Kunlun Card is properly installed.", - r)); - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset->numel() / offset->dims()[0]; - int input_mask_dim = mask->numel() / mask->dims()[0]; - int output_dim = - output_shape_vec[1] * output_shape_vec[2] * output_shape_vec[3]; - std::vector ksize{static_cast(filter.dims()[2]), - static_cast(filter.dims()[3])}; - int n = im2col_step; - int c = input->dims()[1]; - int h = input->dims()[2]; - int w = input->dims()[3]; - int f = filter.dims()[0]; - - for 
(int i = 0; i < batch_size / im2col_step; ++i) { - int r = xpu::deformable_conv( - dev_ctx.x_context(), - input_ptr + i * im2col_step * input_dim, - filter_ptr, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, - output_prt + i * im2col_step * output_dim, - n, - c, - h, - w, - f, - ksize, - strides, - paddings, - dilations, - groups, - deformable_groups, - nullptr, - nullptr, - nullptr, - true); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU deformable_conv kernel return wrong value[%d].", r)); - } - } -}; - -template -class DeformableConvGradXPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); - Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); - Tensor* offset_grad = ctx.Output(framework::GradVarName("Offset")); - Tensor* mask_grad = ctx.Output(framework::GradVarName("Mask")); - T* dx_data = nullptr; - T* dw_data = nullptr; - T* dmask_data = nullptr; - T* doffset_data = nullptr; - - if (input_grad != nullptr) { - input_grad->mutable_data(ctx.GetPlace()); - dx_data = input_grad->data(); - } - if (filter_grad != nullptr) { - filter_grad->mutable_data(ctx.GetPlace()); - dw_data = filter_grad->data(); - } - if (offset_grad != nullptr) { - offset_grad->mutable_data(ctx.GetPlace()); - doffset_data = offset_grad->data(); - } - if (mask_grad != nullptr) { - mask_grad->mutable_data(ctx.GetPlace()); - dmask_data = mask_grad->data(); - } - - const Tensor* input = ctx.Input("Input"); - Tensor offset = *ctx.Input("Offset"); - Tensor mask = *ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - - int groups = ctx.Attr("groups"); - int deformable_groups = ctx.Attr("deformable_groups"); - int im2col_step = ctx.Attr("im2col_step"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - - PADDLE_ENFORCE_EQ( - deformable_groups == 1, - true, - platform::errors::InvalidArgument(( - "XPU only support deformable_groups == 1 in deformable_conv op."))); - PADDLE_ENFORCE_EQ( - groups == 1, - true, - platform::errors::InvalidArgument( - ("XPU only support groups == 1 in deformable_conv op."))); - PADDLE_ENFORCE_EQ(filter.dims()[2] <= 8 && filter.dims()[3] <= 8, - true, - platform::errors::InvalidArgument( - "Filter high and weight should less than 8 on xpu " - "in deformable_conv op.")); - - auto& dev_ctx = ctx.template device_context(); - const int batch_size = static_cast(input->dims()[0]); - std::vector output_shape_vec(phi::vectorize(output_grad->dims())); - const T* output_grad_ptr = output_grad->data(); - const T* input_ptr = input->data(); - const T* filter_ptr = filter.data(); - const float* offset_ptr = offset.data(); - const float* mask_ptr = mask.data(); - if (dx_data == nullptr) { - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&dx_data), - input->numel() * sizeof(T)), - XPU_SUCCESS, - platform::errors::ResourceExhausted("XPU has no enough memory")); - } - if (dw_data == nullptr) { - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&dw_data), - filter.numel() * sizeof(T)), - XPU_SUCCESS, - platform::errors::ResourceExhausted("XPU has no enough memory")); - } - if (doffset_data == nullptr) { - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&doffset_data), - offset.numel() * sizeof(T)), - 
XPU_SUCCESS, - platform::errors::ResourceExhausted("XPU has no enough memory")); - } - if (dmask_data == nullptr) { - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&dmask_data), - mask.numel() * sizeof(T)), - XPU_SUCCESS, - platform::errors::ResourceExhausted("XPU has no enough memory")); - } - - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset.numel() / offset.dims()[0]; - int input_mask_dim = mask.numel() / mask.dims()[0]; - int output_dim = - output_shape_vec[1] * output_shape_vec[2] * output_shape_vec[3]; - std::vector ksize{static_cast(filter.dims()[2]), - static_cast(filter.dims()[3])}; - int n = im2col_step; - int c = input->dims()[1]; - int h = input->dims()[2]; - int w = input->dims()[3]; - int f = filter.dims()[0]; - - T* filter_grad_tmp = nullptr; - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&filter_grad_tmp), - filter_grad->numel() * sizeof(T)), - XPU_SUCCESS, - platform::errors::ResourceExhausted("XPU has no enough memory")); - - // set zeros for d_table_data - const int zero = 0; - int r_dx = - xpu::constant(dev_ctx.x_context(), dx_data, input->numel(), zero); - int r_dw = - xpu::constant(dev_ctx.x_context(), dw_data, filter.numel(), zero); - int r_doffset = xpu::constant( - dev_ctx.x_context(), doffset_data, offset.numel(), zero); - int r_dmask = - xpu::constant(dev_ctx.x_context(), dmask_data, mask.numel(), zero); - int r_filter = xpu::constant( - dev_ctx.x_context(), filter_grad_tmp, filter.numel(), zero); - auto ret = (r_dx == xpu::Error_t::SUCCESS) && (r_dx == r_dw) && - (r_dx == r_doffset) && (r_dx == r_dmask) && (r_dx == r_filter); - PADDLE_ENFORCE_EQ(ret, - true, - platform::errors::External( - "XPU API return wrong value, please check where " - "Baidu Kunlun Card is properly installed.")); - - for (int i = 0; i < batch_size / im2col_step; ++i) { - int r = xpu::deformable_conv_grad( - dev_ctx.x_context(), - input_ptr + i * im2col_step * input_dim, - filter_ptr, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, - output_grad_ptr + i * im2col_step * output_dim, - dx_data + i * im2col_step * input_dim, - filter_grad_tmp, - doffset_data + i * im2col_step * input_offset_dim, - dmask_data + i * im2col_step * input_mask_dim, - n, - c, - h, - w, - f, - ksize, - strides, - paddings, - dilations, - groups, - deformable_groups, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - true); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU deformable_conv_grad kernel return wrong value[%d].", r)); - r = baidu::xpu::api::add(dev_ctx.x_context(), - filter_grad_tmp, - dw_data, - dw_data, - filter.numel()); - PADDLE_ENFORCE_EQ(r, - XPU_SUCCESS, - platform::errors::External( - "XPU add kernel return wrong value[%d].", r)); - } - - dev_ctx.Wait(); - xpu_free(filter_grad_tmp); - if (input_grad == nullptr) { - xpu_free(dx_data); - } - if (filter_grad == nullptr) { - xpu_free(dw_data); - } - if (offset_grad == nullptr) { - xpu_free(doffset_data); - } - if (mask_grad == nullptr) { - xpu_free(dmask_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using XPUDeviceContext = paddle::platform::XPUDeviceContext; - -REGISTER_OP_XPU_KERNEL(deformable_conv, - ops::DeformableConvXPUKernel); -REGISTER_OP_XPU_KERNEL( - deformable_conv_grad, - ops::DeformableConvGradXPUKernel); - -#endif diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_xpu.cc 
deleted file mode 100644 index 5ba1f8b98fae8d8af5a5658a12e3e76a73ce497b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_xpu.cc +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifdef PADDLE_WITH_XPU -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/device/xpu/xpu_header.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" -namespace paddle { -namespace operators { - -template -class MergedMomentumOpXPUKernel : public framework::OpKernel { - using XPUType = typename XPUTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - T mu = static_cast(ctx.Attr("mu")); - auto params = ctx.MultiInput("Param"); - auto params_out = ctx.MultiOutput("ParamOut"); - auto lr = ctx.Input("LearningRate"); - int op_num = params.size(); - auto velocity = ctx.MultiInput("Velocity"); - auto grad = ctx.MultiInput("Grad"); - auto velocity_out = ctx.MultiOutput("VelocityOut"); - auto use_nesterov = ctx.Attr("use_nesterov"); - auto regularization_method = - ctx.Attr>("regularization_method"); - auto regularization_coeff = - ctx.Attr>("regularization_coeff"); - PADDLE_ENFORCE_EQ(op_num, - params_out.size(), - platform::errors::InvalidArgument( - "The size of Output(ParamOut) must be equal to " - "Input(Param), but got the size of Output(ParamOut) " - "is %d, the size of Input(Param) is %d.", - params_out.size(), - op_num)); - PADDLE_ENFORCE_EQ(op_num, - velocity.size(), - platform::errors::InvalidArgument( - "The size of Output(Velocity) must be equal to " - "Input(Param), but got the size of Output(Velocity) " - "is %d, the size of Input(Param) is %d.", - velocity.size(), - op_num)); - PADDLE_ENFORCE_EQ( - op_num, - velocity_out.size(), - platform::errors::InvalidArgument( - "The size of Output(VelocityOut) must be equal to " - "Input(Param), but got the size of Output(VelocityOut) " - "is %d, the size of Input(Param) is %d.", - velocity_out.size(), - op_num)); - PADDLE_ENFORCE_EQ( - op_num, - grad.size(), - platform::errors::InvalidArgument( - "The size of Input(Grad) must be equal to Input(Param), but got " - "the size of Input(Grad) is %d, the size of Input(Param) is %d.", - grad.size(), - op_num)); - if (regularization_method.size() == 0) { - regularization_method.resize(op_num); - } - std::vector param_list(op_num); - std::vector velocity_list(op_num); - std::vector grad_list(op_num); - std::vector velocity_out_list(op_num); - std::vector param_out_list(op_num); - std::vector sizes(op_num); - std::vector l2_weight_decay(op_num); - if (op_num > 0) { - for (int j = 0; j < op_num; j++) { - param_list[j] = - reinterpret_cast(const_cast(params[j]->data())); - velocity_list[j] = - reinterpret_cast(const_cast(velocity[j]->data())); - grad_list[j] = - 
reinterpret_cast(const_cast(grad[j]->data())); - param_out_list[j] = - reinterpret_cast(params_out[j]->data()); - velocity_out_list[j] = - reinterpret_cast(velocity_out[j]->data()); - sizes[j] = static_cast(params[j]->numel()); - if (regularization_method[j] != "l2_decay") { - l2_weight_decay[j] = 0.0f; - } else { - l2_weight_decay[j] = static_cast(regularization_coeff[j]); - } - PADDLE_ENFORCE_EQ(params[j], - params_out[j], - platform::errors::InvalidArgument( - "The size of Input(Param) and Output(ParamOut) " - "must be the same Tensors.")); - PADDLE_ENFORCE_EQ( - velocity[j], - velocity_out[j], - platform::errors::InvalidArgument( - "The size of Input(velocity) and Output(velocity) " - "must be the same Tensors.")); - } - } else { - return; - } - auto& dev_ctx = ctx.template device_context(); - int r = xpu::merged_momentum(dev_ctx.x_context(), - param_list, - velocity_list, - grad_list, - param_out_list, - velocity_out_list, - l2_weight_decay, - sizes, - lr->data(), - mu, - use_nesterov); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "merged_momentum"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - merged_momentum, - ops::MergedMomentumOpXPUKernel, - ops::MergedMomentumOpXPUKernel); -#endif diff --git a/paddle/phi/kernels/xpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/xpu/deformable_conv_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e354d4ebc49c5d12f536f742cb433a11fb6c4e81 --- /dev/null +++ b/paddle/phi/kernels/xpu/deformable_conv_grad_kernel.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
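+// XPU implementation of the deformable_conv gradient kernel, ported from the
+// deleted fluid operator above. The XPU op only supports groups == 1,
+// deformable_groups == 1 and filter spatial sizes up to 8x8. Gradients are
+// computed in chunks of im2col_step samples; each chunk's filter gradient is
+// written to a temporary buffer and accumulated with an XPU add.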
+ +#include "paddle/phi/kernels/deformable_conv_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void DeformableConvGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& offset, + const DenseTensor& filter, + const paddle::optional& mask, + const DenseTensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + DenseTensor* dx, + DenseTensor* offset_grad, + DenseTensor* filter_grad, + DenseTensor* mask_grad) { + T* dx_data = nullptr; + T* dw_data = nullptr; + T* dmask_data = nullptr; + T* doffset_data = nullptr; + + if (dx != nullptr) { + dx_data = dev_ctx.template Alloc(dx); + } + if (filter_grad != nullptr) { + dw_data = dev_ctx.template Alloc(filter_grad); + } + if (offset_grad != nullptr) { + doffset_data = dev_ctx.template Alloc(offset_grad); + } + if (mask_grad != nullptr) { + dmask_data = dev_ctx.template Alloc(mask_grad); + } + + PADDLE_ENFORCE_EQ( + deformable_groups == 1, + true, + errors::InvalidArgument( + ("XPU only support deformable_groups == 1 in deformable_conv op."))); + PADDLE_ENFORCE_EQ( + groups == 1, + true, + errors::InvalidArgument( + ("XPU only support groups == 1 in deformable_conv op."))); + PADDLE_ENFORCE_EQ(filter.dims()[2] <= 8 && filter.dims()[3] <= 8, + true, + errors::InvalidArgument( + "Filter high and weight should less than 8 on xpu " + "in deformable_conv op.")); + + const int batch_size = static_cast(x.dims()[0]); + std::vector output_shape_vec(phi::vectorize(out_grad.dims())); + const T* output_grad_ptr = out_grad.data(); + const T* input_ptr = x.data(); + const T* filter_ptr = filter.data(); + const float* offset_ptr = offset.data(); + const float* mask_ptr = mask->data(); + if (dx_data == nullptr) { + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dx_data), x.numel() * sizeof(T)), + XPU_SUCCESS, + errors::ResourceExhausted("XPU has no enough memory")); + } + if (dw_data == nullptr) { + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dw_data), + filter.numel() * sizeof(T)), + XPU_SUCCESS, + errors::ResourceExhausted("XPU has no enough memory")); + } + if (doffset_data == nullptr) { + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&doffset_data), + offset.numel() * sizeof(T)), + XPU_SUCCESS, + errors::ResourceExhausted("XPU has no enough memory")); + } + if (dmask_data == nullptr) { + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dmask_data), + mask->numel() * sizeof(T)), + XPU_SUCCESS, + errors::ResourceExhausted("XPU has no enough memory")); + } + + int input_dim = x.numel() / x.dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + int input_mask_dim = mask->numel() / mask->dims()[0]; + int output_dim = + output_shape_vec[1] * output_shape_vec[2] * output_shape_vec[3]; + std::vector ksize{static_cast(filter.dims()[2]), + static_cast(filter.dims()[3])}; + int n = im2col_step; + int c = x.dims()[1]; + int h = x.dims()[2]; + int w = x.dims()[3]; + int f = filter.dims()[0]; + + T* filter_grad_tmp = nullptr; + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&filter_grad_tmp), + filter_grad->numel() * sizeof(T)), + XPU_SUCCESS, + errors::ResourceExhausted("XPU has no enough memory")); + + // set zeros for d_table_data + const int zero = 0; + int r_dx = xpu::constant(dev_ctx.x_context(), dx_data, x.numel(), zero); + PADDLE_ENFORCE_XDNN_SUCCESS(r_dx, "constant"); + int r_dw = + 
xpu::constant(dev_ctx.x_context(), dw_data, filter.numel(), zero); + PADDLE_ENFORCE_XDNN_SUCCESS(r_dw, "constant"); + int r_doffset = + xpu::constant(dev_ctx.x_context(), doffset_data, offset.numel(), zero); + PADDLE_ENFORCE_XDNN_SUCCESS(r_doffset, "constant"); + int r_dmask = + xpu::constant(dev_ctx.x_context(), dmask_data, mask->numel(), zero); + PADDLE_ENFORCE_XDNN_SUCCESS(r_dmask, "constant"); + int r_filter = xpu::constant( + dev_ctx.x_context(), filter_grad_tmp, filter.numel(), zero); + PADDLE_ENFORCE_XDNN_SUCCESS(r_filter, "constant"); + + for (int i = 0; i < batch_size / im2col_step; ++i) { + int r = xpu::deformable_conv_grad( + dev_ctx.x_context(), + input_ptr + i * im2col_step * input_dim, + filter_ptr, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, + output_grad_ptr + i * im2col_step * output_dim, + dx_data + i * im2col_step * input_dim, + filter_grad_tmp, + doffset_data + i * im2col_step * input_offset_dim, + dmask_data + i * im2col_step * input_mask_dim, + n, + c, + h, + w, + f, + ksize, + strides, + paddings, + dilations, + groups, + deformable_groups, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "deformable_conv_grad"); + + r = baidu::xpu::api::add( + dev_ctx.x_context(), filter_grad_tmp, dw_data, dw_data, filter.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); + } + + dev_ctx.Wait(); + xpu_free(filter_grad_tmp); + if (dx == nullptr) { + xpu_free(dx_data); + } + if (filter_grad == nullptr) { + xpu_free(dw_data); + } + if (offset_grad == nullptr) { + xpu_free(doffset_data); + } + if (mask_grad == nullptr) { + xpu_free(dmask_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(deformable_conv_grad, + XPU, + ALL_LAYOUT, + phi::DeformableConvGradKernel, + float) {} diff --git a/paddle/phi/kernels/xpu/deformable_conv_kernel.cc b/paddle/phi/kernels/xpu/deformable_conv_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..064114a7f70442f7532d4837cd830b8f6e6e97a0 --- /dev/null +++ b/paddle/phi/kernels/xpu/deformable_conv_kernel.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
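+// XPU implementation of the deformable_conv forward kernel, ported from the
+// deleted fluid operator above. The output is zero-filled with xpu::constant
+// and then computed in chunks of im2col_step samples via xpu::deformable_conv.
+// Only groups == 1, deformable_groups == 1 and filter spatial sizes up to
+// 8x8 are supported on XPU.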
+ +#include "paddle/phi/kernels/deformable_conv_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void DeformableConvKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& offset, + const DenseTensor& filter, + const paddle::optional& mask, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + DenseTensor* out) { + dev_ctx.template Alloc(out); + + PADDLE_ENFORCE_EQ( + deformable_groups == 1, + true, + errors::InvalidArgument( + ("XPU only support deformable_groups == 1 in deformable_conv op."))); + PADDLE_ENFORCE_EQ( + groups == 1, + true, + errors::InvalidArgument( + ("XPU only support groups == 1 in deformable_conv op."))); + PADDLE_ENFORCE_EQ(filter.dims()[2] <= 8 && filter.dims()[3] <= 8, + true, + errors::InvalidArgument( + "Filter high and weight should less than 8 on xpu " + "in deformable_conv op.")); + + const int batch_size = static_cast(x.dims()[0]); + std::vector output_shape_vec(phi::vectorize(out->dims())); + + const T* input_ptr = x.data(); + const T* filter_ptr = filter.data(); + const float* offset_ptr = offset.data(); + const float* mask_ptr = mask->data(); + T* output_prt = out->data(); + + // set zeros for d_table_data + const int zero = 0; + int r = xpu::constant(dev_ctx.x_context(), output_prt, out->numel(), zero); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + int input_dim = x.numel() / x.dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + int input_mask_dim = mask->numel() / mask->dims()[0]; + int output_dim = + output_shape_vec[1] * output_shape_vec[2] * output_shape_vec[3]; + std::vector ksize{static_cast(filter.dims()[2]), + static_cast(filter.dims()[3])}; + int n = im2col_step; + int c = x.dims()[1]; + int h = x.dims()[2]; + int w = x.dims()[3]; + int f = filter.dims()[0]; + + for (int i = 0; i < batch_size / im2col_step; ++i) { + int r = xpu::deformable_conv( + dev_ctx.x_context(), + input_ptr + i * im2col_step * input_dim, + filter_ptr, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, + output_prt + i * im2col_step * output_dim, + n, + c, + h, + w, + f, + ksize, + strides, + paddings, + dilations, + groups, + deformable_groups, + nullptr, + nullptr, + nullptr, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "deformable_conv"); + } +} +} // namespace phi + +PD_REGISTER_KERNEL( + deformable_conv, XPU, ALL_LAYOUT, phi::DeformableConvKernel, float) {} diff --git a/paddle/phi/kernels/xpu/merged_momentum_kernel.cc b/paddle/phi/kernels/xpu/merged_momentum_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..788a3ea89382d0789aef1c825d0caf781ef24d02 --- /dev/null +++ b/paddle/phi/kernels/xpu/merged_momentum_kernel.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include + +#include "paddle/phi/kernels/merged_momentum_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MergedMomentumKernel( + const Context& dev_ctx, + const std::vector& params, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const paddle::optional>& master_param, + float mu_in, + bool use_nesterov, + const std::vector& regularization_method, + const std::vector& regularization_coeff, + bool multi_precision, + float rescale_grad, + std::vector params_out, + std::vector velocity_out, + std::vector master_param_out) { + using XPUType = typename XPUTypeTrait::Type; + auto lr = learning_rate[0]; + T mu = static_cast(mu_in); + int op_num = params.size(); + PADDLE_ENFORCE_EQ(op_num, + params_out.size(), + errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), + op_num)); + PADDLE_ENFORCE_EQ(op_num, + velocity.size(), + errors::InvalidArgument( + "The size of Output(Velocity) must be equal to " + "Input(Param), but got the size of Output(Velocity) " + "is %d, the size of Input(Param) is %d.", + velocity.size(), + op_num)); + PADDLE_ENFORCE_EQ(op_num, + velocity_out.size(), + errors::InvalidArgument( + "The size of Output(VelocityOut) must be equal to " + "Input(Param), but got the size of Output(VelocityOut) " + "is %d, the size of Input(Param) is %d.", + velocity_out.size(), + op_num)); + PADDLE_ENFORCE_EQ( + op_num, + grad.size(), + errors::InvalidArgument( + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grad.size(), + op_num)); + std::vector param_list(op_num); + std::vector velocity_list(op_num); + std::vector grad_list(op_num); + std::vector velocity_out_list(op_num); + std::vector param_out_list(op_num); + std::vector sizes(op_num); + std::vector l2_weight_decay(op_num); + if (op_num > 0) { + for (int j = 0; j < op_num; j++) { + param_list[j] = + reinterpret_cast(const_cast(params[j]->data())); + velocity_list[j] = + reinterpret_cast(const_cast(velocity[j]->data())); + grad_list[j] = + reinterpret_cast(const_cast(grad[j]->data())); + param_out_list[j] = reinterpret_cast(params_out[j]->data()); + velocity_out_list[j] = + reinterpret_cast(velocity_out[j]->data()); + sizes[j] = static_cast(params[j]->numel()); + if (regularization_method[j] != "l2_decay") { + l2_weight_decay[j] = 0.0f; + } else { + l2_weight_decay[j] = static_cast(regularization_coeff[j]); + } + PADDLE_ENFORCE_EQ(params[j], + params_out[j], + errors::InvalidArgument( + "The size of Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); + PADDLE_ENFORCE_EQ(velocity[j], + velocity_out[j], + errors::InvalidArgument( + "The size of Input(velocity) and Output(velocity) " + "must be the same Tensors.")); + } + } else { + return; + } + int r = xpu::merged_momentum(dev_ctx.x_context(), + param_list, + velocity_list, + grad_list, + param_out_list, + velocity_out_list, + l2_weight_decay, + sizes, + lr->data(), + mu, + use_nesterov); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merged_momentum"); +} + +} // namespace phi + +PD_REGISTER_KERNEL(merged_momentum, + XPU, + ALL_LAYOUT, + phi::MergedMomentumKernel, + float, + phi::dtype::float16) {}
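
Both new XPU deformable_conv kernels process the batch in chunks of im2col_step samples and, for chunk i, advance each raw data pointer by i * im2col_step * (per-sample element count); im2col_step presumably bounds the on-device im2col workspace instead of letting it grow with the full batch. The standalone sketch below illustrates only that indexing; it is not part of the patch, and the names ChunkOffsets and ComputeChunkOffsets are purely illustrative.

#include <cassert>
#include <cstdint>

// Element offsets into each tensor for one im2col_step chunk.
struct ChunkOffsets {
  int64_t input;   // offset into the Input buffer
  int64_t offset;  // offset into the Offset buffer
  int64_t mask;    // offset into the Mask buffer
  int64_t output;  // offset into the Output (or Output@GRAD) buffer
};

// Chunk i starts i * im2col_step samples into the batch, so each pointer is
// advanced by that many samples times the tensor's per-sample element count,
// mirroring the loops in deformable_conv_kernel.cc and
// deformable_conv_grad_kernel.cc above.
ChunkOffsets ComputeChunkOffsets(int i, int im2col_step,
                                 int64_t input_dim, int64_t input_offset_dim,
                                 int64_t input_mask_dim, int64_t output_dim) {
  const int64_t start = static_cast<int64_t>(i) * im2col_step;
  return {start * input_dim, start * input_offset_dim,
          start * input_mask_dim, start * output_dim};
}

int main() {
  // Example shapes: batch_size = 8, im2col_step = 2 -> 4 chunks.
  const int batch_size = 8, im2col_step = 2;
  assert(batch_size % im2col_step == 0);
  const int64_t input_dim = 3 * 32 * 32;                // C * H * W per sample
  const int64_t input_offset_dim = 2 * 3 * 3 * 32 * 32; // 2 * kh * kw * H_out * W_out
  const int64_t input_mask_dim = 3 * 3 * 32 * 32;       // kh * kw * H_out * W_out
  const int64_t output_dim = 16 * 32 * 32;              // F * H_out * W_out per sample
  for (int i = 0; i < batch_size / im2col_step; ++i) {
    ChunkOffsets c = ComputeChunkOffsets(i, im2col_step, input_dim,
                                         input_offset_dim, input_mask_dim,
                                         output_dim);
    (void)c;  // e.g. input_ptr + c.input, output_ptr + c.output
  }
  return 0;
}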