/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_XPU
#include <algorithm>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename DeviceContext, typename T>
class DeformableConvXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<Tensor>("Input");
    auto* offset = ctx.Input<Tensor>("Offset");
    auto* mask = ctx.Input<Tensor>("Mask");
    Tensor filter = *ctx.Input<Tensor>("Filter");
    Tensor* output = ctx.Output<Tensor>("Output");
    output->mutable_data<T>(ctx.GetPlace());

    auto& dev_ctx = ctx.template device_context<DeviceContext>();

    const int groups = ctx.Attr<int>("groups");
    const int deformable_groups = ctx.Attr<int>("deformable_groups");
    const int im2col_step = ctx.Attr<int>("im2col_step");
    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");

    PADDLE_ENFORCE_EQ(
        deformable_groups == 1, true,
        platform::errors::InvalidArgument(
            "XPU only supports deformable_groups == 1 in deformable_conv "
            "op."));
    PADDLE_ENFORCE_EQ(
        groups == 1, true,
        platform::errors::InvalidArgument(
            "XPU only supports groups == 1 in deformable_conv op."));
    PADDLE_ENFORCE_EQ(
        filter.dims()[2] <= 8 && filter.dims()[3] <= 8, true,
        platform::errors::InvalidArgument(
            "Filter height and width should be no larger than 8 on XPU "
            "in deformable_conv op."));

    const int batch_size = static_cast<int>(input->dims()[0]);
    std::vector<int64_t> output_shape_vec(phi::vectorize(output->dims()));

    const T* input_ptr = input->data<T>();
    const T* filter_ptr = filter.data<T>();
    const float* offset_ptr = offset->data<float>();
    const float* mask_ptr = mask->data<float>();
    T* output_ptr = output->data<T>();

    // Zero-initialize the output before the per-im2col-step computation.
    const int zero = 0;
    int r = xpu::constant<T>(dev_ctx.x_context(), output_ptr, output->numel(),
                             zero);
    PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
                      platform::errors::External(
                          "XPU API returned wrong value[%d], please check "
                          "whether the Baidu Kunlun card is properly "
                          "installed.",
                          r));

    // Per-sample strides into the flattened input/offset/mask/output buffers.
    int input_dim = input->numel() / input->dims()[0];
    int input_offset_dim = offset->numel() / offset->dims()[0];
    int input_mask_dim = mask->numel() / mask->dims()[0];
    int output_dim =
        output_shape_vec[1] * output_shape_vec[2] * output_shape_vec[3];
    std::vector<int> ksize{static_cast<int>(filter.dims()[2]),
                           static_cast<int>(filter.dims()[3])};
    int n = im2col_step;
    int c = input->dims()[1];
    int h = input->dims()[2];
    int w = input->dims()[3];
    int f = filter.dims()[0];

    // Process the batch in chunks of im2col_step samples.
    for (int i = 0; i < batch_size / im2col_step; ++i) {
      int r = xpu::deformable_conv<float, float, float, int>(
          dev_ctx.x_context(), input_ptr + i * im2col_step * input_dim,
          filter_ptr, offset_ptr + i * im2col_step * input_offset_dim,
          mask_ptr + i * im2col_step * input_mask_dim,
          output_ptr + i * im2col_step * output_dim, n, c, h, w, f, ksize,
          strides, paddings, dilations, groups, deformable_groups, nullptr,
          nullptr, nullptr, true);
      PADDLE_ENFORCE_EQ(
          r, XPU_SUCCESS,
          platform::errors::External(
              "XPU deformable_conv kernel returned wrong value[%d].", r));
    }
  }
};

template <typename DeviceContext, typename T>
class DeformableConvGradXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* output_grad =
        ctx.Input<Tensor>(framework::GradVarName("Output"));
    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
    Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
    Tensor* mask_grad = ctx.Output<Tensor>(framework::GradVarName("Mask"));
    T* dx_data = nullptr;
    T* dw_data = nullptr;
    T* dmask_data = nullptr;
    T* doffset_data = nullptr;
    if (input_grad != nullptr) {
      input_grad->mutable_data<T>(ctx.GetPlace());
      dx_data = input_grad->data<T>();
    }
    if (filter_grad != nullptr) {
      filter_grad->mutable_data<T>(ctx.GetPlace());
      dw_data = filter_grad->data<T>();
    }
    if (offset_grad != nullptr) {
      offset_grad->mutable_data<T>(ctx.GetPlace());
      doffset_data = offset_grad->data<T>();
    }
    if (mask_grad != nullptr) {
      mask_grad->mutable_data<T>(ctx.GetPlace());
      dmask_data = mask_grad->data<T>();
    }

    const Tensor* input = ctx.Input<Tensor>("Input");
    Tensor offset = *ctx.Input<Tensor>("Offset");
    Tensor mask = *ctx.Input<Tensor>("Mask");
    Tensor filter = *ctx.Input<Tensor>("Filter");

    int groups = ctx.Attr<int>("groups");
    int deformable_groups = ctx.Attr<int>("deformable_groups");
    int im2col_step = ctx.Attr<int>("im2col_step");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");

    PADDLE_ENFORCE_EQ(
        deformable_groups == 1, true,
        platform::errors::InvalidArgument(
            "XPU only supports deformable_groups == 1 in deformable_conv "
            "op."));
    PADDLE_ENFORCE_EQ(
        groups == 1, true,
        platform::errors::InvalidArgument(
            "XPU only supports groups == 1 in deformable_conv op."));
    PADDLE_ENFORCE_EQ(
        filter.dims()[2] <= 8 && filter.dims()[3] <= 8, true,
        platform::errors::InvalidArgument(
            "Filter height and width should be no larger than 8 on XPU "
            "in deformable_conv op."));

    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    const int batch_size = static_cast<int>(input->dims()[0]);
    std::vector<int64_t> output_shape_vec(phi::vectorize(output_grad->dims()));

    const T* output_grad_ptr = output_grad->data<T>();
    const T* input_ptr = input->data<T>();
    const T* filter_ptr = filter.data<T>();
    const float* offset_ptr = offset.data<float>();
    const float* mask_ptr = mask.data<float>();

    // If a gradient output is not requested, allocate a scratch buffer so the
    // XPU kernel still has somewhere to write.
    if (dx_data == nullptr) {
      PADDLE_ENFORCE_EQ(
          xpu_malloc(reinterpret_cast<void**>(&dx_data),
                     input->numel() * sizeof(T)),
          XPU_SUCCESS,
          platform::errors::ResourceExhausted(
              "XPU does not have enough memory."));
    }
    if (dw_data == nullptr) {
      PADDLE_ENFORCE_EQ(
          xpu_malloc(reinterpret_cast<void**>(&dw_data),
                     filter.numel() * sizeof(T)),
          XPU_SUCCESS,
          platform::errors::ResourceExhausted(
              "XPU does not have enough memory."));
    }
    if (doffset_data == nullptr) {
      PADDLE_ENFORCE_EQ(
          xpu_malloc(reinterpret_cast<void**>(&doffset_data),
                     offset.numel() * sizeof(T)),
          XPU_SUCCESS,
          platform::errors::ResourceExhausted(
              "XPU does not have enough memory."));
    }
    if (dmask_data == nullptr) {
      PADDLE_ENFORCE_EQ(
          xpu_malloc(reinterpret_cast<void**>(&dmask_data),
                     mask.numel() * sizeof(T)),
          XPU_SUCCESS,
          platform::errors::ResourceExhausted(
              "XPU does not have enough memory."));
    }

    // Per-sample strides into the flattened buffers.
    int input_dim = input->numel() / input->dims()[0];
    int input_offset_dim = offset.numel() / offset.dims()[0];
    int input_mask_dim = mask.numel() / mask.dims()[0];
    int output_dim =
        output_shape_vec[1] * output_shape_vec[2] * output_shape_vec[3];
    std::vector<int> ksize{static_cast<int>(filter.dims()[2]),
                           static_cast<int>(filter.dims()[3])};
    int n = im2col_step;
    int c = input->dims()[1];
    int h = input->dims()[2];
    int w = input->dims()[3];
    int f = filter.dims()[0];
    // Temporary buffer for the per-step filter gradient; sized from filter so
    // it is valid even when filter_grad is not requested.
    T* filter_grad_tmp = nullptr;
    PADDLE_ENFORCE_EQ(
        xpu_malloc(reinterpret_cast<void**>(&filter_grad_tmp),
                   filter.numel() * sizeof(T)),
        XPU_SUCCESS,
        platform::errors::ResourceExhausted(
            "XPU does not have enough memory."));

    // Zero-initialize all gradient buffers.
    const int zero = 0;
    int r_dx =
        xpu::constant<T>(dev_ctx.x_context(), dx_data, input->numel(), zero);
    int r_dw =
        xpu::constant<T>(dev_ctx.x_context(), dw_data, filter.numel(), zero);
    int r_doffset = xpu::constant<T>(dev_ctx.x_context(), doffset_data,
                                     offset.numel(), zero);
    int r_dmask =
        xpu::constant<T>(dev_ctx.x_context(), dmask_data, mask.numel(), zero);
    int r_filter = xpu::constant<T>(dev_ctx.x_context(), filter_grad_tmp,
                                    filter.numel(), zero);
    auto ret = (r_dx == xpu::Error_t::SUCCESS) && (r_dx == r_dw) &&
               (r_dx == r_doffset) && (r_dx == r_dmask) && (r_dx == r_filter);
    PADDLE_ENFORCE_EQ(ret, true,
                      platform::errors::External(
                          "XPU API returned wrong value, please check "
                          "whether the Baidu Kunlun card is properly "
                          "installed."));

    for (int i = 0; i < batch_size / im2col_step; ++i) {
      int r = xpu::deformable_conv_grad<float, float, float, int>(
          dev_ctx.x_context(), input_ptr + i * im2col_step * input_dim,
          filter_ptr, offset_ptr + i * im2col_step * input_offset_dim,
          mask_ptr + i * im2col_step * input_mask_dim,
          output_grad_ptr + i * im2col_step * output_dim,
          dx_data + i * im2col_step * input_dim, filter_grad_tmp,
          doffset_data + i * im2col_step * input_offset_dim,
          dmask_data + i * im2col_step * input_mask_dim, n, c, h, w, f, ksize,
          strides, paddings, dilations, groups, deformable_groups, nullptr,
          nullptr, nullptr, nullptr, nullptr, true);
      PADDLE_ENFORCE_EQ(
          r, XPU_SUCCESS,
          platform::errors::External(
              "XPU deformable_conv_grad kernel returned wrong value[%d].", r));
      // Accumulate this step's filter gradient: dw_data += filter_grad_tmp.
      r = baidu::xpu::api::add<T>(dev_ctx.x_context(), filter_grad_tmp,
                                  dw_data, dw_data, filter.numel());
      PADDLE_ENFORCE_EQ(
          r, XPU_SUCCESS,
          platform::errors::External(
              "XPU add kernel returned wrong value[%d].", r));
    }

    dev_ctx.Wait();
    xpu_free(filter_grad_tmp);
    // Release scratch buffers that do not back a requested gradient output.
    if (input_grad == nullptr) {
      xpu_free(dx_data);
    }
    if (filter_grad == nullptr) {
      xpu_free(dw_data);
    }
    if (offset_grad == nullptr) {
      xpu_free(doffset_data);
    }
    if (mask_grad == nullptr) {
      xpu_free(dmask_data);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
using XPUDeviceContext = paddle::platform::XPUDeviceContext;

REGISTER_OP_XPU_KERNEL(deformable_conv,
                       ops::DeformableConvXPUKernel<XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
    deformable_conv_grad,
    ops::DeformableConvGradXPUKernel<XPUDeviceContext, float>);

#endif