// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/operators/conv_op.h" namespace paddle { namespace operators { using NPUDeviceContext = platform::NPUDeviceContext; static void CastToFP16(const framework::ExecutionContext& ctx, const aclrtStream& stream, const phi::DenseTensor& in, phi::DenseTensor* out) { out->mutable_data(ctx.GetPlace()); NpuOpRunner runner; runner.SetType("Cast") .AddInput(in) .AddOutput(*out) .AddAttr("dst_type", ACL_FLOAT16) .Run(stream); } static void CastToFP32(const framework::ExecutionContext& ctx, const aclrtStream& stream, const phi::DenseTensor& in, phi::DenseTensor* out) { out->mutable_data(ctx.GetPlace()); NpuOpRunner runner; runner.SetType("Cast") .AddInput(in) .AddOutput(*out) .AddAttr("dst_type", ACL_FLOAT) .Run(stream); } template class DepthwiseConvNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const phi::DenseTensor* input = ctx.Input("Input"); const phi::DenseTensor* filter = ctx.Input("Filter"); phi::DenseTensor* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); const std::vector stride = ctx.Attr>("strides"); std::vector padding = ctx.Attr>("paddings"); std::vector dilation = ctx.Attr>("dilations"); const std::string data_format = ctx.Attr("data_format"); const std::string padding_algorithm = ctx.Attr("padding_algorithm"); const bool channel_last = data_format == "NHWC"; if (channel_last) { PADDLE_ENFORCE_EQ( output->dims()[output->dims().size() - 1], input->dims()[input->dims().size() - 1], platform::errors::InvalidArgument( "ShapeError: The output channels must be equal to the " "input channels. But receivced output channel number is %d " "and input channel number is %d", output->dims()[output->dims().size() - 1], input->dims()[input->dims().size() - 1])); } else { PADDLE_ENFORCE_EQ( output->dims()[1], input->dims()[1], platform::errors::InvalidArgument( "ShapeError: The output channels must be equal to the " "input channels. But receivced output channel number is %d " "and input channel number is %d", output->dims()[1], input->dims()[1])); } auto in_dims = input->dims(); auto filter_dims = filter->dims(); framework::DDim in_data_dims; framework::DDim filter_data_dims; if (channel_last) { in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); } filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); UpdatePaddingAndDilation( &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); std::vector strides(4, 1); std::vector dilations(4, 1); phi::DenseTensor input_tensor, output_tensor; input_tensor.ShareDataWith(*input); output_tensor.ShareDataWith(*output); if (channel_last) { input_tensor.set_layout(DataLayout::kNHWC); output_tensor.set_layout(DataLayout::kNHWC); strides[1] = stride[0]; strides[2] = stride[1]; dilations[1] = dilation[0]; dilations[2] = dilation[1]; } else { strides[2] = stride[0]; strides[3] = stride[1]; dilations[2] = dilation[0]; dilations[3] = dilation[1]; } auto stream = ctx.template device_context().stream(); // Transform filter (n, 1, h, w) --> (1, n, h, w) phi::DenseTensor transformed_filter(filter->type()); transformed_filter.mutable_data({filter->dims()[1], filter->dims()[0], filter->dims()[2], filter->dims()[3]}, ctx.device_context().GetPlace()); std::vector perm = {1, 0, 2, 3}; const auto& runner_trans = NpuOpRunner( "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); runner_trans.Run(stream); const auto& runner = NpuOpRunner("DepthwiseConv2D", {input_tensor, transformed_filter}, {output_tensor}, {{"strides", strides}, {"dilations", dilations}, {"pads", padding}, {"data_format", data_format}}); runner.Run(stream); } }; template class DepthwiseConvGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const phi::DenseTensor* input = ctx.Input("Input"); const phi::DenseTensor* filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); auto input_grad = ctx.Output(framework::GradVarName("Input")); auto filter_grad = ctx.Output(framework::GradVarName("Filter")); const std::vector stride = ctx.Attr>("strides"); std::vector padding = ctx.Attr>("paddings"); std::vector dilation = ctx.Attr>("dilations"); const std::string data_format = ctx.Attr("data_format"); const std::string padding_algorithm = ctx.Attr("padding_algorithm"); const bool channel_last = data_format == "NHWC"; // update padding and dilation auto in_dims = input->dims(); auto filter_dims = filter->dims(); framework::DDim in_data_dims; framework::DDim filter_data_dims; if (channel_last) { in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); } filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); UpdatePaddingAndDilation( &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); auto stream = ctx.template device_context().stream(); // Transform filter (n, 1, h, w) --> (1, n, h, w) phi::DenseTensor transformed_filter(filter->type()); transformed_filter.mutable_data({filter->dims()[1], filter->dims()[0], filter->dims()[2], filter->dims()[3]}, ctx.device_context().GetPlace()); std::vector perm = {1, 0, 2, 3}; const auto& runner_trans = NpuOpRunner( "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); runner_trans.Run(stream); // construct NPU attr std::vector strides(4, 1); std::vector dilations(4, 1); phi::DenseTensor input_tensor, output_grad_tensor; input_tensor.ShareDataWith(*input); output_grad_tensor.ShareDataWith(*output_grad); if (channel_last) { input_tensor.set_layout(DataLayout::kNHWC); output_grad_tensor.set_layout(DataLayout::kNHWC); strides[1] = stride[0]; strides[2] = stride[1]; dilations[1] = dilation[0]; dilations[2] = dilation[1]; } else { strides[2] = stride[0]; strides[3] = stride[1]; dilations[2] = dilation[0]; dilations[3] = dilation[1]; } if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); PADDLE_ENFORCE_EQ( (dilations[2] == 1 && dilations[3] == 1), true, platform::errors::InvalidArgument( "dilation_h and dilation_w in DepthwiseConv2DBackpropFilterD " "must be equal to 1, but got dilation_h %d, dilation_w %d", dilation[2], dilation[3])); NpuOpRunner runner; runner.SetType("DepthwiseConv2DBackpropFilterD") .AddInput(input_tensor) .AddInput(output_grad_tensor) .AddOutput(*filter_grad) .AddAttr("filter_size", phi::vectorize(transformed_filter.dims())) .AddAttr("strides", strides) .AddAttr("dilations", dilations) .AddAttr("pads", padding) .AddAttr("data_format", data_format) .Run(stream); } if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); phi::DenseTensor input_grad_tensor; input_grad_tensor.ShareDataWith(*input_grad); if (channel_last) { input_grad_tensor.set_layout(DataLayout::kNHWC); } NpuOpRunner runner; runner.SetType("DepthwiseConv2DBackpropInputD") .AddInput(transformed_filter) .AddInput(output_grad_tensor) .AddOutput(input_grad_tensor) .AddAttr("input_size", phi::vectorize(input->dims())) .AddAttr("strides", strides) .AddAttr("dilations", dilations) .AddAttr("pads", padding) .AddAttr("data_format", data_format) .Run(stream); } } }; template class NPUConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const phi::DenseTensor* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); const std::string padding_algorithm = ctx.Attr("padding_algorithm"); const std::string data_format = ctx.Attr("data_format"); const bool channel_last = data_format == "NHWC"; // update padding and dilation auto in_dims = input->dims(); auto filter_dims = filter->dims(); framework::DDim in_data_dims; framework::DDim filter_data_dims; if (channel_last) { in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); } filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); phi::DenseTensor input_tensor, output_tensor; input_tensor.ShareDataWith(*input); output_tensor.ShareDataWith(*output); if (channel_last) { input_tensor.set_layout(DataLayout::kNHWC); output_tensor.set_layout(DataLayout::kNHWC); strides_vec[1] = strides[0]; strides_vec[2] = strides[1]; dilations_vec[1] = dilations[0]; dilations_vec[2] = dilations[1]; } else { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; dilations_vec[2] = dilations[0]; dilations_vec[3] = dilations[1]; } auto stream = ctx.template device_context().stream(); const auto& runner = NpuOpRunner("Conv2D", {input_tensor, *filter}, {output_tensor}, {{"strides", strides_vec}, {"pads", paddings}, {"dilations", dilations_vec}, {"groups", groups}, {"data_format", data_format}}); runner.Run(stream); } }; template class NPUConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); auto input_grad = ctx.Output(framework::GradVarName("Input")); auto filter_grad = ctx.Output(framework::GradVarName("Filter")); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); const std::string padding_algorithm = ctx.Attr("padding_algorithm"); const std::string data_format = ctx.Attr("data_format"); const bool channel_last = data_format == "NHWC"; // update padding and dilation auto in_dims = input->dims(); auto filter_dims = filter->dims(); framework::DDim in_data_dims; framework::DDim filter_data_dims; if (channel_last) { in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); } filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); phi::DenseTensor input_tensor, output_grad_tensor; input_tensor.ShareDataWith(*input); output_grad_tensor.ShareDataWith(*output_grad); if (channel_last) { input_tensor.set_layout(DataLayout::kNHWC); output_grad_tensor.set_layout(DataLayout::kNHWC); strides_vec[1] = strides[0]; strides_vec[2] = strides[1]; dilations_vec[1] = dilations[0]; dilations_vec[2] = dilations[1]; } else { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; dilations_vec[2] = dilations[0]; dilations_vec[3] = dilations[1]; } auto stream = ctx.template device_context().stream(); if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); phi::DenseTensor filter_grad_fp32(phi::DataType::FLOAT32); filter_grad_fp32.Resize(filter_grad->dims()); if (framework::TransToProtoVarType(input->dtype()) == framework::proto::VarType::FP16) { CastToFP32(ctx, stream, *filter_grad, &filter_grad_fp32); } else { filter_grad_fp32.ShareDataWith(*filter_grad); } const auto& runner = NpuOpRunner("Conv2DBackpropFilterD", {input_tensor, output_grad_tensor}, {filter_grad_fp32}, {{"filter_size", filter_shape_vec}, {"strides", strides_vec}, {"pads", paddings}, {"dilations", dilations_vec}, {"groups", groups}, {"data_format", data_format}}); runner.Run(stream); if (framework::TransToProtoVarType(input->dtype()) == framework::proto::VarType::FP16) { CastToFP16(ctx, stream, filter_grad_fp32, filter_grad); } } if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); std::vector input_shape_vec = phi::vectorize(input->dims()); phi::DenseTensor input_grad_tensor; input_grad_tensor.ShareDataWith(*input_grad); if (channel_last) { input_grad_tensor.set_layout(DataLayout::kNHWC); } const auto& runner = NpuOpRunner("Conv2DBackpropInputD", {*filter, output_grad_tensor}, {input_grad_tensor}, {{"input_size", input_shape_vec}, {"strides", strides_vec}, {"pads", paddings}, {"dilations", dilations_vec}, {"groups", groups}, {"data_format", data_format}}); runner.Run(stream); } } }; template class NPUConv3dKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const phi::DenseTensor* input = ctx.Input("Input"); const phi::DenseTensor* filter = ctx.Input("Filter"); phi::DenseTensor* output = ctx.Output("Output"); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); const std::string padding_algorithm = ctx.Attr("padding_algorithm"); const std::string data_format = ctx.Attr("data_format"); PADDLE_ENFORCE_EQ(data_format, "NCDHW", platform::errors::Unimplemented( "the data_format must be NCDHW in " "the npu kernel of conv3d, but got data_format " "= [%s]", data_format)); PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented( "the groups must be 1 in " "the npu kernel of conv3d, but got groups " "= [%d]", groups)); output->mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); auto input_tensor = ctx.AllocateTmpTensor(input->dims(), dev_ctx); auto filter_tensor = ctx.AllocateTmpTensor(filter->dims(), dev_ctx); auto output_tensor = ctx.AllocateTmpTensor(output->dims(), dev_ctx); input_tensor.ShareDataWith(*input); filter_tensor.ShareDataWith(*filter); output_tensor.ShareDataWith(*output); input_tensor.set_layout(DataLayout::kNCDHW); filter_tensor.set_layout(DataLayout::kNCDHW); output_tensor.set_layout(DataLayout::kNCDHW); // update padding and dilation auto in_dims = input->dims(); auto filter_dims = filter->dims(); framework::DDim in_data_dims; framework::DDim filter_data_dims; in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); std::vector strides_vec(5, 1); std::vector dilations_vec(5, 1); strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; strides_vec[4] = strides[2]; dilations_vec[2] = dilations[0]; dilations_vec[3] = dilations[1]; dilations_vec[4] = dilations[2]; auto stream = ctx.template device_context().stream(); const auto& runner = NpuOpRunner("Conv3D", {input_tensor, filter_tensor}, {output_tensor}, {{"strides", strides_vec}, {"pads", paddings}, {"dilations", dilations_vec}, {"groups", groups}, {"data_format", data_format}}); runner.Run(stream); } }; template class NPUConv3dGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const phi::DenseTensor* input = ctx.Input("Input"); const phi::DenseTensor* filter = ctx.Input("Filter"); const phi::DenseTensor* output_grad = ctx.Input(framework::GradVarName("Output")); phi::DenseTensor* input_grad = ctx.Output(framework::GradVarName("Input")); phi::DenseTensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); const std::string padding_algorithm = ctx.Attr("padding_algorithm"); const std::string data_format = ctx.Attr("data_format"); PADDLE_ENFORCE_EQ(data_format, "NCDHW", platform::errors::Unimplemented( "the data_format must be NCDHW in " "the npu kernel of conv3d, but got data_format " "= [%s]", data_format)); PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented( "the groups must be 1 in " "the npu kernel of conv3d, but got groups " "= [%d]", groups)); auto& dev_ctx = ctx.template device_context(); auto input_tensor = ctx.AllocateTmpTensor(input->dims(), dev_ctx); auto filter_tensor = ctx.AllocateTmpTensor(filter->dims(), dev_ctx); auto output_grad_tensor = ctx.AllocateTmpTensor( output_grad->dims(), dev_ctx); input_tensor.ShareDataWith(*input); filter_tensor.ShareDataWith(*filter); output_grad_tensor.ShareDataWith(*output_grad); input_tensor.set_layout(DataLayout::kNCDHW); filter_tensor.set_layout(DataLayout::kNCDHW); output_grad_tensor.set_layout(DataLayout::kNCDHW); // update padding and dilation auto in_dims = input->dims(); auto filter_dims = filter->dims(); framework::DDim in_data_dims; framework::DDim filter_data_dims; in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); std::vector strides_vec(5, 1); std::vector dilations_vec(5, 1); strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; strides_vec[4] = strides[2]; dilations_vec[2] = dilations[0]; dilations_vec[3] = dilations[1]; dilations_vec[4] = dilations[2]; auto stream = ctx.template device_context().stream(); if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); phi::DenseTensor filter_grad_tensor = ctx.AllocateTmpTensor(filter_grad->dims(), dev_ctx); filter_grad_tensor.ShareDataWith(*filter_grad); filter_grad_tensor.set_layout(DataLayout::kNCDHW); const auto& runner = NpuOpRunner("Conv3DBackpropFilterD", {input_tensor, output_grad_tensor}, {filter_grad_tensor}, {{"filter_size", filter_shape_vec}, {"strides", strides_vec}, {"pads", paddings}, {"dilations", dilations_vec}, {"groups", groups}, {"data_format", data_format}}); runner.Run(stream); } if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); std::vector input_shape_vec = phi::vectorize(input->dims()); phi::DenseTensor input_grad_tensor = ctx.AllocateTmpTensor(input_grad->dims(), dev_ctx); input_grad_tensor.ShareDataWith(*input_grad); input_grad_tensor.set_layout(DataLayout::kNCDHW); const auto& runner = NpuOpRunner("Conv3DBackpropInputD", {filter_tensor, output_grad_tensor}, {input_grad_tensor}, {{"input_size", input_shape_vec}, {"strides", strides_vec}, {"pads", paddings}, {"dilations", dilations_vec}, {"groups", groups}, {"data_format", data_format}}); runner.Run(stream); } } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(depthwise_conv2d, ops::DepthwiseConvNPUKernel, ops::DepthwiseConvNPUKernel); REGISTER_OP_NPU_KERNEL(depthwise_conv2d_grad, ops::DepthwiseConvGradNPUKernel, ops::DepthwiseConvGradNPUKernel); REGISTER_OP_NPU_KERNEL(conv2d, ops::NPUConvOpKernel, ops::NPUConvOpKernel); REGISTER_OP_NPU_KERNEL(conv2d_grad, ops::NPUConvGradOpKernel, ops::NPUConvGradOpKernel); REGISTER_OP_NPU_KERNEL(conv3d, ops::NPUConv3dKernel, ops::NPUConv3dKernel); REGISTER_OP_NPU_KERNEL(conv3d_grad, ops::NPUConv3dGradKernel, ops::NPUConv3dGradKernel);