// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; template class DepthwiseConvNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { // input const Tensor* input = context.Input("Input"); const Tensor* filter = context.Input("Filter"); // output Tensor* output = context.Output("Output"); output->mutable_data(context.GetPlace()); // attr const std::vector stride = context.Attr>("strides"); std::vector padding = context.Attr>("paddings"); std::vector dilation = context.Attr>("dilations"); const std::string data_format = context.Attr("data_format"); const std::string padding_algorithm = context.Attr("padding_algorithm"); // npu stream auto stream = context.template device_context().stream(); // check dimension const bool channel_last = data_format == "NHWC"; if (channel_last) { // NHWC PADDLE_ENFORCE_EQ( output->dims()[output->dims().size() - 1], input->dims()[input->dims().size() - 1], platform::errors::InvalidArgument( "ShapeError: The output channels must be equal to the " "input channels. But receivced output channel number is %d " "and input channel number is %d", output->dims()[output->dims().size() - 1], input->dims()[input->dims().size() - 1])); } else { // NCHW PADDLE_ENFORCE_EQ( output->dims()[1], input->dims()[1], platform::errors::InvalidArgument( "ShapeError: The output channels must be equal to the " "input channels. But receivced output channel number is %d " "and input channel number is %d", output->dims()[1], input->dims()[1])); } // update padding and dilation auto in_dims = input->dims(); auto filter_dims = filter->dims(); framework::DDim in_data_dims; framework::DDim filter_data_dims; if (channel_last) { in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); } filter_data_dims = framework::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); // Transform filter (n, 1, h, w) --> (1, n, h, w) Tensor transformed_filter(filter->type()); transformed_filter.mutable_data({filter->dims()[1], filter->dims()[0], filter->dims()[2], filter->dims()[3]}, context.device_context().GetPlace()); std::vector perm = {1, 0, 2, 3}; const auto& runner_trans = NpuOpRunner( "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); runner_trans.Run(stream); // construct NPU attr std::vector strides(4, 1); std::vector dilations(4, 1); Tensor input_tensor, output_tensor; input_tensor.ShareDataWith(*input); output_tensor.ShareDataWith(*output); if (channel_last) { input_tensor.set_layout(DataLayout::kNHWC); output_tensor.set_layout(DataLayout::kNHWC); strides[1] = stride[0]; strides[2] = stride[1]; dilations[1] = dilation[0]; dilations[2] = dilation[1]; } else { strides[2] = stride[0]; strides[3] = stride[1]; dilations[2] = dilation[0]; dilations[3] = dilation[1]; } // CANN OP const auto& runner = NpuOpRunner("DepthwiseConv2D", {input_tensor, transformed_filter}, {output_tensor}, {{"strides", strides}, {"dilations", dilations}, {"pads", padding}, {"data_format", data_format}}); runner.Run(stream); } }; template class DepthwiseConvGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { // input const Tensor* input = context.Input("Input"); const Tensor* filter = context.Input("Filter"); // output auto output_grad = context.Input(framework::GradVarName("Output")); auto input_grad = context.Output(framework::GradVarName("Input")); auto filter_grad = context.Output(framework::GradVarName("Filter")); // attr const std::vector stride = context.Attr>("strides"); std::vector padding = context.Attr>("paddings"); std::vector dilation = context.Attr>("dilations"); const std::string data_format = context.Attr("data_format"); const std::string padding_algorithm = context.Attr("padding_algorithm"); // npu stream auto stream = context.template device_context().stream(); // check dimension const bool channel_last = data_format == "NHWC"; // update padding and dilation auto in_dims = input->dims(); auto filter_dims = filter->dims(); framework::DDim in_data_dims; framework::DDim filter_data_dims; if (channel_last) { in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); } filter_data_dims = framework::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); // Transform filter (n, 1, h, w) --> (1, n, h, w) Tensor transformed_filter(filter->type()); transformed_filter.mutable_data({filter->dims()[1], filter->dims()[0], filter->dims()[2], filter->dims()[3]}, context.device_context().GetPlace()); std::vector perm = {1, 0, 2, 3}; const auto& runner_trans = NpuOpRunner( "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); runner_trans.Run(stream); // construct NPU attr std::vector strides(4, 1); std::vector dilations(4, 1); Tensor input_tensor, output_grad_tensor; input_tensor.ShareDataWith(*input); output_grad_tensor.ShareDataWith(*output_grad); if (channel_last) { input_tensor.set_layout(DataLayout::kNHWC); output_grad_tensor.set_layout(DataLayout::kNHWC); strides[1] = stride[0]; strides[2] = stride[1]; dilations[1] = dilation[0]; dilations[2] = dilation[1]; } else { strides[2] = stride[0]; strides[3] = stride[1]; dilations[2] = dilation[0]; dilations[3] = dilation[1]; } if (filter_grad) { filter_grad->mutable_data(context.GetPlace()); std::vector filter_shape_vec = framework::vectorize(transformed_filter.dims()); const auto& runner = NpuOpRunner( "DepthwiseConv2DBackpropFilterD", {input_tensor, output_grad_tensor}, {*filter_grad}, {{"filter_size", filter_shape_vec}, {"strides", strides}, {"pads", padding}, {"dilations", dilations}, {"data_format", data_format}}); runner.Run(stream); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); std::vector input_shape_vec = framework::vectorize(input->dims()); Tensor input_grad_tensor; input_grad_tensor.ShareDataWith(*input_grad); if (channel_last) { input_grad_tensor.set_layout(DataLayout::kNHWC); } const auto& runner = NpuOpRunner("DepthwiseConv2DBackpropInputD", {transformed_filter, output_grad_tensor}, {input_grad_tensor}, {{"input_size", input_shape_vec}, {"strides", strides}, {"pads", padding}, {"dilations", dilations}, {"data_format", data_format}}); runner.Run(stream); } } }; template class NPUConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); const Tensor* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); const std::string padding_algorithm = ctx.Attr("padding_algorithm"); const std::string data_format = ctx.Attr("data_format"); const bool channel_last = data_format == "NHWC"; // update padding and dilation auto in_dims = input->dims(); auto filter_dims = filter->dims(); framework::DDim in_data_dims; framework::DDim filter_data_dims; if (channel_last) { in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); } filter_data_dims = framework::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); Tensor input_tensor, output_tensor; input_tensor.ShareDataWith(*input); output_tensor.ShareDataWith(*output); if (channel_last) { input_tensor.set_layout(DataLayout::kNHWC); output_tensor.set_layout(DataLayout::kNHWC); strides_vec[1] = strides[0]; strides_vec[2] = strides[1]; dilations_vec[1] = dilations[0]; dilations_vec[2] = dilations[1]; } else { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; dilations_vec[2] = dilations[0]; dilations_vec[3] = dilations[1]; } const auto& runner = NpuOpRunner("Conv2D", {input_tensor, *filter}, {output_tensor}, {{"strides", strides_vec}, {"pads", paddings}, {"dilations", dilations_vec}, {"groups", groups}, {"data_format", data_format}}); runner.Run(dev_ctx.stream()); } }; template class NPUConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); auto input_grad = ctx.Output(framework::GradVarName("Input")); auto filter_grad = ctx.Output(framework::GradVarName("Filter")); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); const std::string padding_algorithm = ctx.Attr("padding_algorithm"); const std::string data_format = ctx.Attr("data_format"); const bool channel_last = data_format == "NHWC"; // update padding and dilation auto in_dims = input->dims(); auto filter_dims = filter->dims(); framework::DDim in_data_dims; framework::DDim filter_data_dims; if (channel_last) { in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); } filter_data_dims = framework::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); Tensor input_tensor, output_grad_tensor; input_tensor.ShareDataWith(*input); output_grad_tensor.ShareDataWith(*output_grad); if (channel_last) { input_tensor.set_layout(DataLayout::kNHWC); output_grad_tensor.set_layout(DataLayout::kNHWC); strides_vec[1] = strides[0]; strides_vec[2] = strides[1]; dilations_vec[1] = dilations[0]; dilations_vec[2] = dilations[1]; } else { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; dilations_vec[2] = dilations[0]; dilations_vec[3] = dilations[1]; } if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = framework::vectorize(filter->dims()); const auto& runner = NpuOpRunner( "Conv2DBackpropFilterD", {input_tensor, output_grad_tensor}, {*filter_grad}, {{"filter_size", filter_shape_vec}, {"strides", strides_vec}, {"pads", paddings}, {"dilations", dilations_vec}, {"groups", groups}, {"data_format", data_format}}); runner.Run(dev_ctx.stream()); } if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); std::vector input_shape_vec = framework::vectorize(input->dims()); Tensor input_grad_tensor; input_grad_tensor.ShareDataWith(*input_grad); if (channel_last) { input_grad_tensor.set_layout(DataLayout::kNHWC); } const auto& runner = NpuOpRunner("Conv2DBackpropInputD", {*filter, output_grad_tensor}, {input_grad_tensor}, {{"input_size", input_shape_vec}, {"strides", strides_vec}, {"pads", paddings}, {"dilations", dilations_vec}, {"groups", groups}, {"data_format", data_format}}); runner.Run(dev_ctx.stream()); } } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( depthwise_conv2d, ops::DepthwiseConvNPUKernel); REGISTER_OP_NPU_KERNEL( depthwise_conv2d_grad, ops::DepthwiseConvGradNPUKernel); REGISTER_OP_NPU_KERNEL(conv2d, ops::NPUConvOpKernel, ops::NPUConvOpKernel); REGISTER_OP_NPU_KERNEL(conv2d_grad, ops::NPUConvGradOpKernel, ops::NPUConvGradOpKernel);