diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 65dbb0368c65405a44920167c4c783b2ebe38888..7de7747ebf08ff2b6c7055fe7575131f32f39a87 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -26,14 +26,14 @@ core_ops_args_type_info = {} yaml_types_mapping = { 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ - 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ - 'Backend' : 'Backend', 'DataLayout' : 'DataLayout', 'DataType' : 'DataType', \ - 'int64_t[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>', + 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ + 'int64_t[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector<Tensor>', 'Tensor[Tensor[]]' : 'std::vector<std::vector<Tensor>>', - 'Scalar' : 'Scalar', - 'ScalarArray' : 'ScalarArray' + 'Scalar' : 'paddle::experimental::Scalar', + 'ScalarArray' : 'paddle::experimental::ScalarArray' } @@ -208,39 +208,26 @@ def ParseYamlArgs(string): def ParseYamlReturns(string): - # Example: Tensor, Tensor - - # list = [ ["", ret_type, orig_position], ...] - returns_list = [] - - returns = [x.strip() for x in string.strip().split(",")] - for i in range(len(returns)): - ret_type = returns[i] - - assert ret_type in yaml_types_mapping.keys() - ret_type = yaml_types_mapping[ret_type] - - returns_list.append(["", ret_type, i]) - - return returns_list - - -def ParseYamlReturnsWithName(string): - # Example: Tensor(out), Tensor(out1) + # Example0: Tensor(out), Tensor(out1) + # Example1: Tensor, Tensor + # Example2: Tensor[](out), Tensor # list = [ [ret_name, ret_type, orig_position], ...] 
returns_list = [] returns = [x.strip() for x in string.strip().split(",")] - atype = r'(.*?)' - aname = r'(.*?)' - pattern = f'{atype}\({aname}\)' for i in range(len(returns)): ret = returns[i] - m = re.search(pattern, ret) - ret_type = m.group(1) - ret_name = m.group(2) + + ret_name = "" + if "(" in ret and ")" in ret: + # Remove trailing ')' + ret = ret[:-1] + ret_type = ret.split("(")[0].strip() + ret_name = ret.split("(")[1].strip() + else: + ret_type = ret.strip() assert ret_type in yaml_types_mapping.keys() ret_type = yaml_types_mapping[ret_type] @@ -266,7 +253,7 @@ def ParseYamlForwardFromBackward(string): function_returns = m.group(3) forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args) - forward_returns_list = ParseYamlReturnsWithName(function_returns) + forward_returns_list = ParseYamlReturns(function_returns) return forward_inputs_list, forward_attrs_list, forward_returns_list @@ -296,7 +283,7 @@ def ParseYamlBackward(args_str, returns_str): args_str = re.search(args_pattern, args_str).group(1) inputs_list, attrs_list = ParseYamlArgs(args_str) - returns_list = ParseYamlReturnsWithName(returns_str) + returns_list = ParseYamlReturns(returns_str) return inputs_list, attrs_list, returns_list diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9c4e102ca450fab6df315786312367d8f66b1b8e..d0506e45eb476c50301f79e787d7272c5425d986 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -16,20 +16,26 @@ import os import argparse from eager_gen import yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +skipped_fwd_api_names = set(["scale"]) + atype_to_parsing_function = { "bool": "CastPyArg2Boolean", "int": "CastPyArg2Int", "long": "CastPyArg2Long", + "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", "string": "CastPyArg2String", - "bool[]": "CastPyArg2Booleans", - "int[]": "CastPyArg2Ints", - "long[]": "CastPyArg2Longs", - "float[]": "CastPyArg2Floats", - "double[]": "CastPyArg2Float64s", - "string[]": "CastPyArg2Strings", - "Scalar": "CastPyArg2Scalar", - "ScalarArray": "CastPyArg2ScalarArray" + "std::vector<bool>": "CastPyArg2Booleans", + "std::vector<int>": "CastPyArg2Ints", + "std::vector<long>": "CastPyArg2Longs", + "std::vector<int64_t>": "CastPyArg2Longs", + "std::vector<float>": "CastPyArg2Floats", + "std::vector<double>": "CastPyArg2Float64s", + "std::vector<std::string>": "CastPyArg2Strings", + "paddle::experimental::Scalar": "CastPyArg2Scalar", + "paddle::experimental::ScalarArray": "CastPyArg2ScalarArray", + "paddle::experimental::Backend": "CastPyArg2Backend", + "paddle::experimental::DataType": "CastPyArg2DataType", } @@ -43,15 +49,9 @@ def ParseArguments(): return args -def GetCxxType(atype): - if atype not in yaml_types_mapping.keys(): - assert False - - return yaml_types_mapping[atype] - - def FindParsingFunctionFromAttributeType(atype): if atype not in atype_to_parsing_function.keys(): + print(f"Unable to find {atype} in atype_to_parsing_function.") assert False return atype_to_parsing_function[atype] @@ -59,7 +59,7 @@ def FindParsingFunctionFromAttributeType(atype): def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, forward_attrs_list, forward_outputs_position_map, - optional_inputs): + optional_inputs, is_forward_only): # forward_inputs_position_map = { 
"name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] @@ -86,11 +86,10 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, # Get Attributes for name, atype, _, pos in forward_attrs_list: parsing_function = FindParsingFunctionFromAttributeType(atype) - cxx_type = GetCxxType(atype) key = f"{name}" parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {cxx_type} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" + parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) @@ -127,9 +126,14 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj }} """ + if is_forward_only: + fwd_function_name = fwd_api_name + else: + fwd_function_name = GetForwardFunctionName(fwd_api_name) + python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, - GetForwardFunctionName(fwd_api_name), dygraph_function_call_str) + fwd_function_name, dygraph_function_call_str) python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" @@ -213,6 +217,11 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): #pragma once #include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/exception.h" @@ -251,19 +260,23 @@ if __name__ == "__main__": python_c_function_list = [] python_c_function_reg_list = [] for fwd_api in fwd_api_list: + # We only generate Ops with grad + is_forward_only = False if 'backward' not in fwd_api.keys(): - continue + is_forward_only = True assert 'api' in fwd_api.keys() assert 'args' in fwd_api.keys() assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() fwd_api_name = fwd_api['api'] fwd_args_str = fwd_api['args'] fwd_returns_str = fwd_api['output'] + if fwd_api_name in skipped_fwd_api_names: + continue + # Parse Dispensable Inputs optional_inputs = [] if 'optional' in fwd_api.keys(): @@ -285,7 +298,7 @@ if __name__ == "__main__": python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs) + forward_outputs_position_map, optional_inputs, is_forward_only) python_c_function_list.append(python_c_function_str) python_c_function_reg_list.append(python_c_function_reg_str) print("Generated Python-C Function: ", python_c_function_str) diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc index 95916746d6fcb528d26a8f8bb39980b55c4f3704..b96992ef8514abe0f71dbf23d38abb626f6c4a5b 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP(conv2d_transpose); namespace paddle { diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 3bbb284ca821b8576f2752446555f146c16bb189..4e6fda3d09a071f59c97c87315619d126497a756 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -53,12 +54,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::DSizes(); @@ -171,11 +171,10 @@ void ChooseAlgo(const std::vector& perf_results, using framework::ConvSearchCache; -static void SetConvMathType(const framework::ExecutionContext& ctx, - cudnnDataType_t dtype, +static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, const platform::ConvolutionDescriptor& cdesc) { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx; if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); @@ -231,8 +230,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { auto dtype = platform::CudnnDataType::type; bool has_got_workspace_size = true; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; @@ -284,8 +282,7 @@ struct SearchAlgorithm { } else if (deterministic) { algo = static_cast(1); } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = @@ -346,8 +343,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; @@ -413,8 +409,7 @@ struct SearchAlgorithm { } else if (deterministic) { return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = @@ -478,8 +473,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - 
bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { platform::CUDAGraphCaptureModeGuard guard; auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; @@ -534,8 +528,7 @@ struct SearchAlgorithm { } else if (deterministic) { return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = *(framework::ConvSearchCache::Instance().GetBackwardFilter()); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu deleted file mode 100644 index 2055bf560e69ca0ed354aadd00cdca331c22c76e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ /dev/null @@ -1,1478 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the spopecific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" -#else -#include "paddle/fluid/operators/conv_cudnn_helper.h" -#endif -#include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" -#include "paddle/phi/kernels/funcs/padding.h" - -DECLARE_bool(cudnn_deterministic); -DECLARE_uint64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; -using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; -using DataLayout = platform::DataLayout; - -static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) { - return dev_ctx.GetComputeCapability() >= 70; -} - -template -class CUDNNConvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - const Tensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool 
deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - // Tensor Core introduced from Volta GPUs supports more faster conv op - // with FP16 in NHWC data format. - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - // We will only do data format conversion from NHWC to NCHW. - // cudnn will convert NCHW to NHWC automatically on Tensor Core. - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW"); - - // ------------ transformed tensor ----------- - Tensor transformed_input_channel(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_filter_channel(filter->type()); - T* output_data = nullptr; - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst(ctx, output, - &transformed_output); - - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output.ShareDataWith(*output); - } - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - } else { - transformed_filter_channel.ShareDataWith(*filter); - } - output_data = transformed_output.data(); - - // update padding and dilation - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - - Tensor transformed_input; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - 
transformed_input_channel.dims()[data_dim + 1]; - } - - std::vector input_pad(transformed_input_channel.dims().size() * 2, - 0); - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* filter_data = transformed_filter_channel.data(); - - // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_input, - &transformed_filter_channel, - &transformed_output, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_format = GetCudnnTensorFormat(layout); - - args.handle = handle; - -#ifdef PADDLE_WITH_HIP - // MIOPEN need to set groups in cdesc in miopen_desc.h - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), groups); -#else - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn()); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // cudnn 7 can support groups, no need to do it manually - // FIXME(typhoonzero): find a better way to disable groups - // rather than setting it to 1. 
- PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( - args.cdesc.desc(), groups)); - groups = 1; -#endif -#ifdef PADDLE_WITH_HIP - // MIOPEN do not set groups in wdesc after set groups in cdesc - groups = 1; -#endif - args.idesc.set(transformed_input, layout_format); - args.wdesc.set(transformed_filter_channel, layout_format, groups); - args.odesc.set(transformed_output, layout_format); - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d, - &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; - // ------------------- cudnn conv workspace --------------------- - size_t workspace_size = 0; // final workspace to allocate. -// ------------------- cudnn conv algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t algo{}; - using search = SearchAlgorithm; - workspace_size = search::GetWorkspaceSize(args); - algo = search::Find(args, exhaustive_search, deterministic, - workspace_size, ctx); -#else - cudnnConvolutionFwdAlgo_t algo{}; - using search = SearchAlgorithm; - algo = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, algo); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ - // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable - // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ - // FWD_ALGO_IMPLICIT_GEMM manually. - if (ctx.Attr("groups") > 1) { - algo = static_cast(0); - } -#endif - - // ------------------- cudnn conv forward --------------------- - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - -// NOTE(zhiqiu): inplace addto is not supportted in double grad yet. -// ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : 0.0f; -// VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); - -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args.idesc.desc(), input_data, - args.wdesc.desc(), filter_data, args.cdesc.desc(), algo, - &beta, args.odesc.desc(), output_data, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args.idesc.desc(), - input_data + i * group_offset_in, args.wdesc.desc(), - filter_data + i * group_offset_filter, args.cdesc.desc(), - algo, workspace_ptr, workspace_size, &beta, - args.odesc.desc(), output_data + i * group_offset_out)); - }, - workspace_size); - } -#endif - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_output, output); - } - } -}; - -template -class CUDNNConvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); - - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - } - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - } - - std::vector dilations = ctx.Attr>("dilations"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvGradOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? 
"NHWC" : "NCHW"); - - // transform Tensor - Tensor transformed_input_channel(input->type()); - Tensor transformed_output_grad_channel(output_grad->type()); - Tensor transformed_input_grad_channel(input->type()); - Tensor transformed_filter_channel(filter->type()); - Tensor transformed_filter_grad_channel(filter->type()); - - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input, output_grad, input_grad and tensor from " - "NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - TransToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - - if (input_grad) { - ResizeToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy - // the data of input_grad to transformed_input_grad_channel. - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - TransToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - } - } - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output_grad_channel.ShareDataWith(*output_grad); - if (input_grad) { - transformed_input_grad_channel.ShareDataWith(*input_grad); - } - } - - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - - if (filter_grad) { - ResizeToChannelLast( - ctx, filter_grad, &transformed_filter_grad_channel); - } - } else { - transformed_filter_channel.ShareDataWith(*filter); - if (filter_grad) { - transformed_filter_grad_channel.ShareDataWith(*filter_grad); - } - } - - // update paddings - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // cuDNN only supports padding the same amount on every dimension. - // So we create a new padded input tensor. 
- int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_input(input->type()); - Tensor transformed_input_grad(input->type()); - std::vector padding_common(data_dim, 0); - std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - - transformed_input_grad.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (input_grad) { - transformed_input_grad = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - // pad for input - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (input_grad) { - transformed_input_grad.ShareDataWith(transformed_input_grad_channel); - } - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* output_grad_data = transformed_output_grad_channel.data(); - const T* filter_data = transformed_filter_channel.data(); - T* filter_grad_data = nullptr; - T* input_grad_data = nullptr; - T* transformed_input_grad_data = nullptr; - - ConvArgs args1{&transformed_input_grad, - &transformed_filter_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_input, - &transformed_filter_grad_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? 
DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_tensor = GetCudnnTensorFormat(layout); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n, - &o_c, &o_d, &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n, - &o_c, &o_d, &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; -// ------------------- cudnn backward algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - // input data workspace_size - size_t workspace_size_d = 0; - // weight workspace_size - size_t workspace_size_w = 0; - int iwo_groups = groups; - int c_groups = 1; - -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - if (input_grad) { - // ------------------- cudnn descriptors --------------------- - input_grad_data = input_grad->data(); - transformed_input_grad_data = transformed_input_grad.data(); - args1.handle = handle; - args1.idesc.set(transformed_input_grad, layout_tensor); - args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); - args1.odesc.set(transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size_d = - std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find(args1, exhaustive_search, deterministic, - workspace_size_d, ctx); -#else - using search1 = SearchAlgorithm; - data_algo = - search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = std::max(workspace_size_d, - search1::GetWorkspaceSize(args1, data_algo)); -#endif - } - - if (filter_grad) { - // ------------------- cudnn descriptors --------------------- - filter_grad_data = transformed_filter_grad_channel.data(); - args2.handle = handle; - args2.idesc.set(transformed_input, layout_tensor); - args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, - iwo_groups); - args2.odesc.set(transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size_w = - std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find(args2, exhaustive_search, deterministic, - workspace_size_w, ctx); -#else - using search2 = SearchAlgorithm; - filter_algo = - search2::Find(args2, exhaustive_search, deterministic, ctx); - workspace_size_w = std::max( - workspace_size_w, search2::GetWorkspaceSize(args2, 
filter_algo)); -#endif - } - - // ------------------- cudnn conv backward data --------------------- - ScalingParamType alpha = 1.0f; -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - ScalingParamType beta = 0.0f; -#else - ScalingParamType beta = - (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) ? 1.0f : 0.0f; -#endif - VLOG(4) << "Conv_grad: use_addto = " - << (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")); - - if (input_grad) { -// When beta is 0, it is unnecessary to reset input_grad. -// When beta is 1, the output cannot be reset since addt strategy used. -#ifdef PADDLE_WITH_HIP - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - Tensor temp_tensor(transformed_input_grad.type()); - temp_tensor.Resize(transformed_input_grad.dims()); - T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), temp_tensor_data, - cudnn_workspace_ptr, workspace_size_d)); - }, - workspace_size_d); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( - handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), - transformed_input_grad_data, &alpha, args1.idesc.desc(), - temp_tensor_data, &beta, args1.idesc.desc(), - transformed_input_grad_data)); - } else { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data, cudnn_workspace_ptr, - workspace_size_d)); - }, - workspace_size_d); - } - -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args1.wdesc.desc(), - filter_data + i * group_offset_filter, args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.cdesc.desc(), data_algo, cudnn_workspace_ptr, - workspace_size_d, &beta, args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in)); - }, - workspace_size_d); - } -#endif - if (!is_sys_pad) { - std::vector starts(transformed_input_channel.dims().size(), 0); - std::vector axes(transformed_input_channel.dims().size(), 0); - - for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - - transformed_input_grad_channel.mutable_data(ctx.GetPlace()); - if (transformed_input_channel.dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } - } - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_input_grad_channel, input_grad); - } - } - - // filter_grad do not use inplace addto. - ScalingParamType beta_filter = 0.0f; - // ------------------- cudnn conv backward filter --------------------- - if (filter_grad) { -// Because beta is zero, it is unnecessary to reset filter_grad. 
-#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), output_grad_data, - args2.idesc.desc(), input_data, args2.cdesc.desc(), - filter_algo, &beta, args2.wdesc.desc(), filter_grad_data, - cudnn_workspace_ptr, workspace_size_w)); - }, - workspace_size_w); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args2.idesc.desc(), - input_data + i * group_offset_in, args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr, - workspace_size_w, &beta_filter, args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter)); - }, - workspace_size_w); - } -#endif - - if (compute_format == DataLayout::kNHWC) { - TransToChannelFirst( - ctx, &transformed_filter_grad_channel, filter_grad); - } - } - } -}; - -/* - * Inputs: I, W, dO, ddI, ddW - * Outputs: ddO, dW, dI - * ddo = conv(ddI, W) + conv(I, ddW) - * dW = conv_bp_filter(ddI, dO) - * dI = conv_bp_data(ddW, dO) - */ -template -class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto X = ctx.Input("Input"); - auto W = ctx.Input("Filter"); - auto dO = ctx.Input("DOutput"); - auto ddX = ctx.Input("DDInput"); - auto ddW = ctx.Input("DDFilter"); - - auto ddO = ctx.Output("DDOutput"); - auto dW = ctx.Output("DFilter"); - auto dX = ctx.Output("DInput"); - if (ddO) { - ddO->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, ddO, static_cast(0)); - } - if (dW) { - dW->mutable_data(ctx.GetPlace()); - } - if (dX) { - dX->mutable_data(ctx.GetPlace()); - } - - // const T* x = X->data(); - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - const std::vector& strides = ctx.Attr>("strides"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - std::vector paddings = ctx.Attr>("paddings"); - - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - Tensor transformed_X_channel(X->type()); - Tensor transformed_dO_channel(dO->type()); - Tensor transformed_ddX_channel(X->type()); - - Tensor transformed_ddO_channel(dO->type()); - Tensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst( - ctx, X, &transformed_X_channel); - TransToChannelFirst( - ctx, X, 
&transformed_X_channel); - - ResizeToChannelFirst( - ctx, dO, &transformed_dO_channel); - TransToChannelFirst( - ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst( - ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst( - ctx, dX, &transformed_dX_channel); - transformed_dX_channel.mutable_data(ctx.GetPlace()); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (ddO) { - transformed_ddO_channel.ShareDataWith(*ddO); - } - if (dX) { - transformed_dX_channel.ShareDataWith(*dX); - } - } - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_X(X->type()); - Tensor transformed_ddX(X->type()); - - Tensor transformed_dX(X->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - transformed_dX.Resize(new_input_shape); - - transformed_X = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (ddX) { - transformed_ddX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - if (dX) { - transformed_dX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X.ShareDataWith(transformed_X_channel); - if (ddX) { - transformed_ddX.ShareDataWith(transformed_ddX_channel); - } - if (dX) { - transformed_dX.ShareDataWith(transformed_dX_channel); - } - - if (paddings.size() == 
data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = platform::CudnnDataType::type; - - auto handle = dev_ctx.cudnn_handle(); - - ConvArgs args1{&transformed_ddX, - W, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{ - &transformed_X, ddW, &transformed_ddO_channel, strides, padding_common, - dilations, dtype}; - ConvArgs args3{&transformed_ddX, - dW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args4{ - &transformed_dX, ddW, &transformed_dO_channel, strides, padding_common, - dilations, dtype}; - -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t fwd_algo1 = - static_cast(0); - miopenConvFwdAlgorithm_t fwd_algo2 = - static_cast(0); - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionFwdAlgo_t fwd_algo1 = - static_cast(0); - cudnnConvolutionFwdAlgo_t fwd_algo2 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - - auto layout = GetCudnnTensorFormat(DataLayout::kNCHW); - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddX, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - fwd_algo1 = search1::Find(args1, exhaustive_search, false, - workspace_size, ctx); -#else - using search1 = SearchAlgorithm; - fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); - workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_algo2 = search2::Find(args2, exhaustive_search, false, - workspace_size, ctx); -#else - using search2 = SearchAlgorithm; - fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, fwd_algo2)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_ddX, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size 
= - std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find(args3, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search3 = SearchAlgorithm; - filter_algo = - search3::Find(args3, exhaustive_search, deterministic, ctx); - workspace_size = std::max(workspace_size, - search3::GetWorkspaceSize(args3, filter_algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dX, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find(args4, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search4 = SearchAlgorithm; - data_algo = - search4::Find(args4, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, - &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO_channel.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. - // ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : - // 0.0f; - // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); - auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), ddx, - args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, - &beta, args1.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, workspace_ptr, workspace_size, &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (ddW) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), - ddw, args2.cdesc.desc(), fwd_algo2, &beta, - args2.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args2.idesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, workspace_ptr, workspace_size, &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_ddO_channel, ddO); - } - } - T* transformed_dy_channel = transformed_dO_channel.data(); - if (dW && ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), transformed_dy_channel, - args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, - &beta, args3.wdesc.desc(), dw, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args3.idesc.desc(), - ddx + i * group_offset_in, args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), filter_algo, workspace_ptr, - workspace_size, &beta, args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } -#endif - } - - if (dX && ddW) { - ddw = ddW->data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args4.odesc.desc(), transformed_dy_channel, - args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, - &beta, args4.idesc.desc(), transformed_dx, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - 
PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args4.wdesc.desc(), - ddw + i * group_offset_filter, args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), data_algo, workspace_ptr, - workspace_size, &beta, args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } -#endif - - if (!is_sys_pad) { - // reverse padded input - std::vector starts(X->dims().size(), 0); - std::vector axes(X->dims().size(), 0); - - for (size_t i = 0; i < X->dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - if (X->dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_dX_channel, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue -// Use depthwise_conv2d in MIOPEN to resolve this issue -REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else -#if CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else 
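For context, each registration above binds one kernel instantiation per supported data type to the CUDNN flavor of the op. A minimal sketch of the pattern, with the dtype list assumed rather than quoted from this patch (float, double, plat::float16; a bfloat16 entry only appears on the cuDNN >= 8.1 branch):

// Sketch only: representative per-dtype CUDNN kernel registration.
// The template arguments are an assumption based on the usual dtype set.
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
                   paddle::operators::CUDNNConvOpKernel<double>,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>);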
-REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 9c9795143eb78dc5c1b22ec792d8753f915c976e..66f718693847837a4d169a5cab9629a1f668244f 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); @@ -128,11 +128,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -170,11 +169,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -212,11 +210,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, 
size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index e345a4d2603b630508e299207984f4708217a1d8..8213e877f722433488cd826bb63cba376972c57a 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( paddle::framework::DataTypeToString(input_data_type), paddle::framework::DataTypeToString(filter_data_type))); } -#ifndef PADDLE_WITH_ASCEND_CL - if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ( - library, framework::LibraryType::kCUDNN, - platform::errors::InvalidArgument( - "float16 can only be used when CUDNN or NPU is used")); - } -#endif +// #ifndef PADDLE_WITH_ASCEND_CL +// if (input_data_type == framework::proto::VarType::FP16) { +// PADDLE_ENFORCE_EQ( +// library, framework::LibraryType::kCUDNN, +// platform::errors::InvalidArgument( +// "float16 can only be used when CUDNN or NPU is used")); +// } +// #endif #if PADDLE_WITH_CUDA if (input_data_type == framework::proto::VarType::BF16 && library == framework::LibraryType::kCUDNN) { @@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ops::Conv3DDoubleGradMaker); REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); -// depthwise conv kernel -// TODO(xingzhaolong): neon kernel for mobile -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); - -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - REGISTER_OP_VERSION(conv2d) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc deleted file mode 100644 index d07593f5c02e9129c1f333667baccb0531bc31f9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/conv_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d, - ops::DepthwiseConvKernel, - ops::DepthwiseConvKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad, - ops::DepthwiseConvGradKernel, - ops::DepthwiseConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 26166362da8a2984dc3c0670b186b85800767fb7..a5d888765bf37d45d501a3dbe5437f7c2ab5fc51 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override; }; -template -class GemmConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - // The filter will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. 
- Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output(output->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } - - // update padding and dilation - auto trans_in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims = - phi::slice_ddim(trans_in_dims, 2, trans_in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - auto& dev_ctx = context.template device_context(); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - // filter_shape_vec: - // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - - // output_shape_vec: - // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: - // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, - // o_d,o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = trans_in_dims[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: - // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * - // o_w) - - framework::DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. 
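The kernel above lowers convolution to im2col followed by one GEMM per group. A self-contained sketch of that idea for a single image and a single group (stride 1, no padding; all names below are illustrative and are not Paddle APIs):

#include <cstddef>
#include <vector>

// Unfold a (C, H, W) image into a (C*kh*kw) x (oh*ow) column matrix.
static std::vector<float> Im2Col(const std::vector<float>& img, int C, int H,
                                 int W, int kh, int kw) {
  const int oh = H - kh + 1, ow = W - kw + 1;
  std::vector<float> col(static_cast<size_t>(C) * kh * kw * oh * ow);
  for (int c = 0; c < C; ++c)
    for (int i = 0; i < kh; ++i)
      for (int j = 0; j < kw; ++j)
        for (int y = 0; y < oh; ++y)
          for (int x = 0; x < ow; ++x)
            col[(((c * kh + i) * kw + j) * oh + y) * ow + x] =
                img[(c * H + y + i) * W + x + j];
  return col;
}

// Convolution as a GEMM: out (OC x N) = filter (OC x K) * col (K x N),
// where K = C*kh*kw and N = oh*ow.
static std::vector<float> ConvAsGemm(const std::vector<float>& filter, int OC,
                                     const std::vector<float>& col, int K,
                                     int N) {
  std::vector<float> out(static_cast<size_t>(OC) * N, 0.f);
  for (int oc = 0; oc < OC; ++oc)
    for (int k = 0; k < K; ++k)
      for (int n = 0; n < N; ++n)
        out[oc * N + n] += filter[oc * K + k] * col[k * N + n];
  return out;
}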
- Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim in_matrix_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output.dims()[1], - transformed_output.numel() / - (transformed_output.dims()[0] * transformed_output.dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output.dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - auto blas = phi::funcs::GetBlas(dev_ctx); - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = - transformed_input.Slice(i, i + 1).Resize(in_matrix_shape); - Tensor out_batch = - transformed_output.Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice, - T(0.0)); - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_output, - output); - } - } -}; - -template -class GemmConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - // The filter and filter_grad will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. 
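In the grad kernel that follows, the input gradient is computed as filter_slice^T times dY_slice into the same column buffer and then scattered back with col2im, which is the adjoint of im2col. A minimal col2im matching the layout of the sketch above (again illustrative, not a Paddle API):

#include <cstddef>
#include <vector>

// Adjoint of im2col: scatter-add a (C*kh*kw) x (oh*ow) column matrix back
// into a (C, H, W) image buffer (stride 1, no padding).
static void Col2Im(const std::vector<float>& col, int C, int H, int W, int kh,
                   int kw, std::vector<float>* img) {
  const int oh = H - kh + 1, ow = W - kw + 1;
  img->assign(static_cast<size_t>(C) * H * W, 0.f);
  for (int c = 0; c < C; ++c)
    for (int i = 0; i < kh; ++i)
      for (int j = 0; j < kw; ++j)
        for (int y = 0; y < oh; ++y)
          for (int x = 0; x < ow; ++x)
            (*img)[(c * H + y + i) * W + x + j] +=
                col[(((c * kh + i) * kw + j) * oh + y) * ow + x];
}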
- Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output_grad(output_grad->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - - // update padding and dilation - auto in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - auto& dev_ctx = context.template device_context(); - - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output_grad.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, - // o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = transformed_input.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (i_c/g * k_h * k_w, o_h * o_w) - // or - // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - - framework::DDim input_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output_grad.dims()[1], - transformed_output_grad.numel() / (transformed_output_grad.dims()[0] * - transformed_output_grad.dims()[1])}; - - // convolution backward input operator: gemm + col2im(or col2vol) - // convolution backward weight operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output_grad.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call 
the matrix multiplication interface. - Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->dtype()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - // if is_expand is false, the operation of set_zero is unnecessary, - // because math::matmul will reset input_grad. - if (is_expand) { - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = - transformed_input_grad.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - Tensor in_grad_slice = - in_grad_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col_matrix.ShareDataWith(in_grad_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0), - &col_matrix, T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &in_grad_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); - } - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - Tensor filter_grad_ = *filter_grad; - filter_grad_.Resize(filter_matrix_shape); - set_zero(dev_ctx, filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_batch = transformed_input.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // im2col - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor filter_grad_slice = - filter_grad_.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); - } - } - } - } -}; - -template -class GemmConvDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use 
CPUPlace.")); - const Tensor* X = ctx.Input("Input"); - const Tensor* dY = ctx.Input("DOutput"); - const Tensor* ddX = ctx.Input("DDInput"); - const Tensor* ddW_in = ctx.Input("DDFilter"); - - Tensor* ddY = ctx.Output("DDOutput"); - Tensor* dW = ctx.Output("DFilter"); - Tensor* dX = ctx.Output("DInput"); - Tensor W = GET_DATA_SAFELY(ctx.Input("Filter"), "Input", "Filter", - "GemmConvDoubleGrad"); - if (!ddY && !dW && !dX) return; - - const int groups = ctx.Attr("groups"); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_X(X->dtype()); - Tensor transformed_dY(dY->dtype()); - Tensor transformed_ddX(X->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, X, &transformed_X); - TransToChannelFirst(ctx, X, &transformed_X); - - ResizeToChannelFirst(ctx, dY, &transformed_dY); - TransToChannelFirst(ctx, dY, &transformed_dY); - - if (ddX) { - ResizeToChannelFirst(ctx, ddX, &transformed_ddX); - TransToChannelFirst(ctx, ddX, &transformed_ddX); - } - } else { - transformed_X = *X; - transformed_dY = *dY; - if (ddX) { - transformed_ddX = *ddX; - } - } - - // update padding and dilation - auto in_dims = transformed_X.dims(); - auto filter_dims = W.dims(); - - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_X.dims()[0]); - std::vector filter_shape_vec(phi::vectorize(W.dims())); - std::vector output_shape_vec( - phi::vectorize(transformed_dY.dims())); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - // col_shape [in_channel/group, kh, kw, oh, ow] - col_shape_vec[0] = transformed_X.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - // col_matrix_shape [in_channel/group * kh * kw, oh * ow] - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - // input_shape [Cin, H, W] - framework::DDim input_shape = - phi::slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); - // filter_matrix_shape [Cout, Cin * kh * kw] - framework::DDim filter_matrix_shape = {W.dims()[0], - W.numel() / W.dims()[0]}; - - W.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - transformed_dY.dims()[1], - transformed_dY.numel() / - (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; - int in_step = static_cast(transformed_X.dims()[1]) / groups; - int out_step = static_cast(transformed_dY.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col = ctx.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - // dx 
convolution double grad: gemm + col2im(col2vol) - // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, - // oH, oW) - if (dX && ddW_in) { - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - dX->mutable_data(ctx.GetPlace()); - - Tensor transformed_dX(dX->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, dX, &transformed_dX); - - } else { - transformed_dX = *dX; - } - // if is_expand is false, the operation of set_zero is unnecessary - // because math::matmul will reset dx - if (is_expand) { - set_zero(dev_ctx, &transformed_dX, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col_matrix.ShareDataWith(dx_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, - T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &dx_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_dX, dX); - } - } - - // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, - // oH, oW) - // dw convolution double grad: im2col(vol2col) + gemm - if (dW && ddX) { - dW->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dW, static_cast(0)); - Tensor dW_arr = *dW; - dW_arr.Resize(filter_matrix_shape); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; ++g) { - // im2col - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - - Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(dy_slice, false, col_matrix, true, T(1.0), &dw_slice, - T(1.0)); - } - } - } - - // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), - // w/ddw(Cout, Cin, kh, kw) - // ddy convolution double grad: im2col(vol2col) + gemm - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - - Tensor transformed_ddY(ddY->dtype()); - if (channel_last) { - ResizeToChannelFirst(ctx, ddY, &transformed_ddY); - } else { - transformed_ddY = *ddY; - } - - set_zero(dev_ctx, &transformed_ddY, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor ddy_batch = - transformed_ddY.Slice(i, i + 
1).Resize(output_matrix_shape); - for (int g = 0; g < groups; ++g) { - // gemm - Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step); - - if (ddX) { - Tensor ddx_batch = - transformed_ddX.Slice(i, i + 1).Resize(input_shape); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(0.0)); - } - - if (ddW_in) { - Tensor x_batch = transformed_X.Slice(i, i + 1).Resize(input_shape); - Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); - - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - if (!is_expand) { - col.ShareDataWith(x_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, x_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(1.0)); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_ddY, ddY); - } - } - } -}; - -template -class DepthwiseConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - if (channel_last) { - PADDLE_ENFORCE_EQ( - output->dims()[output->dims().size() - 1] % - input->dims()[input->dims().size() - 1], - 0, platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1])); - } else { - PADDLE_ENFORCE_EQ( - output->dims()[1] % input->dims()[1], 0, - platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. 
But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[1], input->dims()[1])); - } - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - auto& dev_ctx = context.template device_context(); - - if (fuse_relu) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } else { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } - } -}; - -template -class DepthwiseConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? 
false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (fuse_relu) { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } else { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - if (fuse_relu) { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } else { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 141a99f60f104c3bf32e16a1254d0f5eec623645..1841b78af32dd95d6884d5eb78ad30322ba7723e 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -244,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search = SearchAlgorithm; workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find(args, false, deterministic, workspace_size, ctx); + algo = search::Find( + args, false, deterministic, workspace_size, + ctx.template device_context()); #else using search = SearchAlgorithm; - algo = search::Find(args, false, deterministic, ctx); + algo = search::Find( + args, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, algo)); #endif @@ -501,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search1 = SearchAlgorithm; workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = - search1::Find(args1, false, deterministic, workspace_size, ctx); + data_algo = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - data_algo = search1::Find(args1, false, deterministic, ctx); + data_algo = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); #endif @@ -523,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = - search2::Find(args2, false, deterministic, workspace_size, ctx); + filter_algo = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - filter_algo = search2::Find(args2, false, deterministic, ctx); + filter_algo = search2::Find( + args2, false, 
deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); #endif @@ -944,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = - search1::Find(args1, false, deterministic, workspace_size, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - bwd_algo1 = search1::Find(args1, false, deterministic, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); #endif } @@ -965,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = - search2::Find(args2, false, deterministic, workspace_size, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - bwd_algo2 = search2::Find(args2, false, deterministic, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); #endif @@ -990,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search3 = SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = - search3::Find(args3, false, deterministic, workspace_size, ctx); + filter_algo = search3::Find( + args3, false, deterministic, workspace_size, + ctx.template device_context()); #else using search3 = SearchAlgorithm; - filter_algo = search3::Find(args3, false, deterministic, ctx); + filter_algo = search3::Find( + args3, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); #endif @@ -1013,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search4 = SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = - search4::Find(args4, false, deterministic, workspace_size, ctx); + data_algo = search4::Find( + args4, false, deterministic, workspace_size, + ctx.template device_context()); #else using search4 = SearchAlgorithm; - data_algo = search4::Find(args4, false, deterministic, ctx); + data_algo = search4::Find( + args4, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); #endif diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu index b2a4910222f1178d23e94eade9580248bb103c88..054cb4b33895b02a816cc2bff82b1c9052bc645d 100644 --- a/paddle/fluid/operators/conv_transpose_op.cu +++ b/paddle/fluid/operators/conv_transpose_op.cu @@ -13,10 +13,150 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class DepthwiseConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + PADDLE_ENFORCE_EQ( + groups, filter.dims()[0], + platform::errors::InvalidArgument( + "groups should be error to the 1st dimension of filter. But " + "received groups is %d and filter dimension[0] is %d", + groups, filter.dims()[0])); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + for (auto v : dilations) { + PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( + "dilations should be 1 in depthwise conv. " + "But received dilations is %d", + v)); + } + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + + framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad( + static_cast::TYPE&>(dev_ctx), + *output, filter, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, output, data_layout); + } +}; + +template +class DepthwiseConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + auto& dev_ctx = context.template device_context(); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + 
+ framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + if (input_grad) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv( + static_cast::TYPE&>(dev_ctx), + *output_grad, filter, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, input_grad, data_layout); + } + + if (filter_grad) { + phi::funcs::SetConstant set_zero; + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad( + static_cast::TYPE&>(dev_ctx), + *output_grad, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, filter_grad, data_layout); + } + } +}; + +} // namespace operators +} // namespace paddle // conv2d REGISTER_OP_CUDA_KERNEL(conv2d_transpose, ops::GemmConvTransposeKernel, diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 76d6ad6bf2ff7361a90fb6f013f989db5a2b8845..ee0fb7ab3683364f6db3cffd7ddef67c61f19433 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } }; -template -class DepthwiseConvTransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - int groups = context.Attr("groups"); - PADDLE_ENFORCE_EQ( - groups, filter.dims()[0], - platform::errors::InvalidArgument( - "groups should be error to the 1st dimension of filter. But " - "received groups is %d and filter dimension[0] is %d", - groups, filter.dims()[0])); - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - for (auto v : dilations) { - PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( - "dilations should be 1 in depthwise conv. 
" - "But received dilations is %d", - v)); - } - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - output->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, output, static_cast(0)); - - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad( - dev_ctx, *output, filter, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, output, data_layout); - } -}; - -template -class DepthwiseConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - auto& dev_ctx = context.template device_context(); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - if (input_grad) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv( - dev_ctx, *output_grad, filter, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, input_grad, data_layout); - } - - if (filter_grad) { - phi::funcs::SetConstant set_zero; - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad( - dev_ctx, *output_grad, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, filter_grad, data_layout); - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h deleted file mode 100644 index c13bf687af23470d4595def6fb6fabf7385c999f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distribution_helper.h +++ /dev/null @@ -1,244 +0,0 @@ 
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -#endif - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/core/hostdevice.h" - -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/phi/kernels/primitive/kernel_primitives.h" -#endif - -#if !defined(_WIN32) -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -#else -// there is no equivalent intrinsics in msvc. -#define UNLIKELY(condition) (condition) -#endif - -namespace paddle { -namespace distribution { - -using Tensor = framework::Tensor; - -/********************* Transformation Function **********************/ -template -struct exponential_transform { - explicit exponential_transform(T lambda) : lambda_(lambda) {} - - HOSTDEVICE inline T operator()(T val) const { -#if defined(__NVCC__) || defined(__HIPCC__) - if (std::is_same::value) { - return static_cast(-1.0) / lambda_ * log(val); - } else { - return static_cast(-1.0) / lambda_ * __logf(val); - } -#else - return static_cast(-1.0) / lambda_ * std::log(static_cast(1.0) - val); -#endif - } - - private: - T lambda_; -}; - -template -struct uniform_transform { - explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {} - - HOSTDEVICE inline T operator()(T val) const { - if (UNLIKELY(val == static_cast(1.0))) { - return min_; - } else { - return val * range_ + min_; - } - } - - private: - T range_; - T min_; -}; - -template -struct normal_transform { - explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {} - - HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; } - - private: - T mean_; - T std_; -}; - -#if defined(__NVCC__) || defined(__HIPCC__) - -namespace kps = phi::kps; - -/*********************** Distribution Function *************************/ -template -struct uniform_distribution; - -template -struct normal_distribution; - -#if defined(__NVCC__) -template <> -struct uniform_distribution { - __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { - return curand_uniform4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct uniform_distribution { - __device__ inline double2 operator()( - curandStatePhilox4_32_10_t *state) const { - return curand_uniform2_double(state); - } - static constexpr int kReturnsCount = 2; -}; - -template <> -struct normal_distribution { - __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { - return curand_normal4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct normal_distribution { - __device__ inline double2 operator()( - curandStatePhilox4_32_10_t *state) const { - return curand_normal2_double(state); - } - static constexpr int 
kReturnsCount = 2; -}; - -#else -template <> -struct uniform_distribution { - __device__ inline float4 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_uniform4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct uniform_distribution { - __device__ inline double2 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_uniform2_double(state); - } - static constexpr int kReturnsCount = 2; -}; - -template <> -struct normal_distribution { - __device__ inline float4 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_normal4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct normal_distribution { - __device__ inline double2 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_normal2_double(state); - } - static constexpr int kReturnsCount = 2; -}; -#endif - -/******** Launch GPU function of distribution and transformation *********/ -template -__global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, - DistOp dist, TransformOp trans, T *out_data, - size_t stride) { - size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); - static constexpr int kCount = DistOp::kReturnsCount; -#if defined(__NVCC__) - curandStatePhilox4_32_10_t state; - curand_init(seed, idx + THREAD_ID_X, offset, &state); - using SType = curandStatePhilox4_32_10_t; -#else - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, idx + THREAD_ID_X, offset, &state); - using SType = hiprandStatePhilox4_32_10_t; -#endif - size_t total_thread = GRID_NUM_X * BLOCK_NUM_X; - T args[kCount]; - T result[kCount]; - for (size_t i = idx; i < size; i += total_thread * kCount) { - kps::ElementwiseRandom(&args[0], dist, &state); - kps::ElementwiseUnary(&result[0], &args[0], - trans); - kps::WriteData(out_data + i, &result[0], size - i, - 1, stride, 1); - __syncthreads(); - } -} - -template -void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx, - Tensor *out, DistOp dist, TransformOp trans) { - T *out_data = out->mutable_data(dev_ctx.GetPlace()); - auto size = out->numel(); - - int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - size_t block_size = 256; - size_t expect_grid_size = (size + block_size - 1) / block_size; - const auto &prop = platform::GetDeviceProperties(device_id); - size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) * - prop.multiProcessorCount; - size_t grid_size = - expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size; - - size_t total_thread = block_size * grid_size; - size_t curand4_loop_times = - (size + 4 * total_thread - 1) / (4 * total_thread); - // 'increment' shoulde be multiple of 4 - uint64_t increment = curand4_loop_times * 4; - - auto seed_offset = gen_cuda->IncrementOffset(increment); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; - - DistributionKernel< - T, DistOp, TransformOp><<>>( - size, seed, offset, dist, trans, out_data, total_thread); -} - -#endif - -} // namespace distribution -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index cdcf683fb92c5a5ef56f61da15e5979fd1364945..dcdab033e8f8014214900727d53f329e5a7b4ab4 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -34,8 +34,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { @@ -86,8 +86,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, bool is_upscale_in_train, uint64_t increment) { using MT = typename details::MPTypeTrait::Type; - using LoadT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; #ifdef PADDLE_WITH_HIP int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -102,7 +102,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, MT factor = static_cast(1.0f / (1.0f - dropout_prob)); for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) { LoadT src_val; - platform::Load(&src[i], &src_val); + phi::Load(&src[i], &src_val); #ifdef PADDLE_WITH_HIP float4 rand = hiprand_uniform4(&state); @@ -126,8 +126,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, } } - platform::Store(dst_val, &dst[i]); - platform::Store(mask_val, &mask[i]); + phi::Store(dst_val, &dst[i]); + phi::Store(mask_val, &mask[i]); } } @@ -153,16 +153,16 @@ __global__ void DropoutGradCUDAKernel( const typename details::MPTypeTrait::Type factor, const int64_t size, T* dx) { using MT = typename details::MPTypeTrait::Type; - using LoadT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_val; - platform::Load(&dout[i], &dout_val); + phi::Load(&dout[i], &dout_val); MaskLoadT mask_val; - platform::Load(&mask[i], &mask_val); + phi::Load(&mask[i], &mask_val); LoadT dx_val; @@ -172,7 +172,7 @@ __global__ void DropoutGradCUDAKernel( static_cast(mask_val[j]) * factor); } - platform::Store(dx_val, &dx[i]); + phi::Store(dx_val, &dx[i]); } } @@ -219,7 +219,7 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, uint64_t increment; // VectorizedRandomGenerator use curand_uniform4, so we only support // vec_size is 4; - int vec_size = (platform::GetVectorizedSize(x_data) == 4) ? 4 : 1; + int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 
4 : 1; auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc index ee456dcdafbc51d547e7beacc4e4e79f98738b88..1a48a6767852e138e7725a68ca4ffc56de8234be 100644 --- a/paddle/fluid/operators/exponential_op.cc +++ b/paddle/fluid/operators/exponential_op.cc @@ -76,7 +76,7 @@ class ExponentialKernel auto engine = gen->GetCPUEngine(); std::uniform_real_distribution uniform(0.0, 1.0); - distribution::exponential_transform trans(lambda); + phi::funcs::exponential_transform trans(lambda); for (int64_t i = 0; i < size; ++i) { out_data[i] = trans(uniform(*engine)); } diff --git a/paddle/fluid/operators/exponential_op.cu b/paddle/fluid/operators/exponential_op.cu index 8b989501e4f4248b0c2e3b23e1e75a4865b08588..d5abbf9a26afe6bcbbd8549f59d632fc4e53fec2 100644 --- a/paddle/fluid/operators/exponential_op.cu +++ b/paddle/fluid/operators/exponential_op.cu @@ -26,9 +26,9 @@ class ExponentialKernel auto& dev_cxt = ctx.template device_context(); T lambda = static_cast(ctx.Attr("lambda")); - distribution::uniform_distribution dist; - distribution::exponential_transform trans(lambda); - distribution::distribution_and_transform(dev_cxt, out, dist, trans); + phi::funcs::uniform_distribution dist; + phi::funcs::exponential_transform trans(lambda); + phi::funcs::distribution_and_transform(dev_cxt, out, dist, trans); } }; diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h index fbcabc594db0814da1ec50934a0f02514dc208be..7ded174a9f47ede48a49b19b25539867ce344fb0 100644 --- a/paddle/fluid/operators/exponential_op.h +++ b/paddle/fluid/operators/exponential_op.h @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distribution_helper.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 20801d2243fb395b250f8416f1e2f5ba6a1423a4..3a2de0c4a093514a1c40321ab7dad61011709204 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -89,9 +89,9 @@ __global__ void BroadcastKernelBinary( template void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, const T* in0, const T* in1, T* out) { - int in_vec_size = std::min(platform::GetVectorizedSize(in0), - platform::GetVectorizedSize(in1)); - int out_vec_size = std::min(4, platform::GetVectorizedSize(out)); + int in_vec_size = + std::min(phi::GetVectorizedSize(in0), phi::GetVectorizedSize(in1)); + int out_vec_size = std::min(4, phi::GetVectorizedSize(out)); int vec_size = std::min(out_vec_size, in_vec_size); int numel = m * n; @@ -191,9 +191,9 @@ void SetConfigForColumnReduce(const int max_threads, const int reduce_num, int num_block = (max_threads / left_num); if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { - *blocking_size = phi::kernels::details::GetLastPow2(reduce_num / num_block); + *blocking_size = phi::funcs::details::GetLastPow2(reduce_num / num_block); if (*blocking_size <= 1) { - *blocking_size = phi::kernels::details::GetLastPow2(sqrt(reduce_num)); + *blocking_size = phi::funcs::details::GetLastPow2(sqrt(reduce_num)); 
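// [Editor's sketch] The exponential_op CPU kernel changed above feeds
// std::uniform_real_distribution samples through an inverse-CDF transform
// (now phi::funcs::exponential_transform, previously
// distribution::exponential_transform). A minimal standalone illustration of
// that transform, assuming the CPU form -log(1 - u) / lambda shown in the
// removed distribution_helper.h; the names below are illustrative only, not
// Paddle's actual API.
#include <cmath>
#include <cstdio>
#include <random>

template <typename T>
struct ExponentialTransformSketch {
  explicit ExponentialTransformSketch(T lambda) : lambda_(lambda) {}
  T operator()(T uniform_val) const {
    // Inverse CDF of Exp(lambda): x = -ln(1 - u) / lambda, with u in [0, 1).
    return static_cast<T>(-1.0) / lambda_ *
           std::log(static_cast<T>(1.0) - uniform_val);
  }

 private:
  T lambda_;
};

int main() {
  std::mt19937_64 engine(42);
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  ExponentialTransformSketch<double> trans(/*lambda=*/2.0);
  double sum = 0.0;
  constexpr int kSamples = 100000;
  for (int i = 0; i < kSamples; ++i) sum += trans(uniform(engine));
  // The sample mean should be close to 1 / lambda = 0.5.
  std::printf("sample mean = %f\n", sum / kSamples);
  return 0;
}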
} else if (*blocking_size * 2 < reduce_num) { *blocking_size *= 2; } diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 1864bdbb86667290474d297cc481f5d6352c8022..b3792a176fabeb8406fd2f1b83c6723207dad2f1 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; @@ -29,10 +30,10 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(conv2d); -USE_OP(conv2d_grad); -USE_OP_DEVICE_KERNEL(conv2d, CUDNN); -USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN); +USE_OP_ITSELF(conv2d); +USE_OP_ITSELF(conv2d_grad); +PD_DECLARE_KERNEL(conv2d, GPUDNN, ALL_LAYOUT); +PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT); template void InitRandomTensor(const std::vector &dims, diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 994601a2f0608b4fc04966c7549c421f395f3ec7..9f5a1bad047b44b715e11e74d92fdca1982c96f8 100755 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -130,17 +130,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, const T factor, const int64_t size, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; LoadT src_vec; MaskLoadT mask_vec; - platform::Load(&dout[i], &dout_vec); - platform::Load(&mask[i], &mask_vec); - platform::Load(&src[i], &src_vec); + phi::Load(&dout[i], &dout_vec); + phi::Load(&mask[i], &mask_vec); + phi::Load(&src[i], &src_vec); StoreT dx_vec; #pragma unroll @@ -148,7 +148,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, T tmp = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]); } - platform::Store(dx_vec, &dx[i]); + phi::Store(dx_vec, &dx[i]); } } @@ -167,9 +167,9 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, T *dx, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum if (col_id * VecSize < cols) { @@ -180,10 +180,10 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, LoadT bias_vec; MaskLoadT mask_vec; - platform::Load(&dout[index], &dout_vec); - platform::Load(&src[index], &src_vec); - platform::Load(&mask[index], &mask_vec); - platform::Load(&bias[col_id * VecSize], &bias_vec); + phi::Load(&dout[index], &dout_vec); + phi::Load(&src[index], &src_vec); + phi::Load(&mask[index], 
&mask_vec); + phi::Load(&bias[col_id * VecSize], &bias_vec); StoreT dx_vec; #pragma unroll @@ -194,7 +194,7 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, dx_vec[i] = val; tmp_sum[i] += val; } - platform::Store(dx_vec, &dx[index]); + phi::Store(dx_vec, &dx[index]); } } diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index f79277e4e8f0d22cedafc9f7b40b56ecd2d6a817..6bf3a7114f4ced3c7c6ecd1f1afeca60ff66528f 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -21,11 +21,11 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index ceba3accca7727b5e4f22951d87f9e91034e3403..d53a24a57e3cc1ede127f497a9be9e3b5fa1ab0b 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -42,12 +42,12 @@ __device__ void CalcLayernormY( const LayerNormScaleBiasT *bias, const T *x, T *y, const int row_id, const int col_id, const int cols, const LayerNormParamType mean_val, const LayerNormParamType invvar) { - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using LoadU = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using LoadU = phi::AlignedVector; using LoadScaleOrBias = - platform::AlignedVector, - VecSize>; + phi::AlignedVector, + VecSize>; for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { LoadScaleOrBias scale_vec; LoadScaleOrBias bias_vec; @@ -60,15 +60,15 @@ __device__ void CalcLayernormY( static_cast>(0); } // vectorize load data from global - platform::Load(&x[row_id * cols + i], &x_vec); + phi::Load(&x[row_id * cols + i], &x_vec); if (scale != nullptr) { - platform::Load, - VecSize>(&scale[i], &scale_vec); + phi::Load, VecSize>( + &scale[i], &scale_vec); } if (bias != nullptr) { - platform::Load, - VecSize>(&bias[i], &bias_vec); + phi::Load, VecSize>( + &bias[i], &bias_vec); } StoreT y_vec; @@ -78,7 +78,7 @@ __device__ void CalcLayernormY( (static_cast(x_vec[ii]) - mean_val) * invvar + static_cast(bias_vec[ii])); } - platform::Store(y_vec, &y[row_id * cols + i]); + phi::Store(y_vec, &y[row_id * cols + i]); } } @@ -190,9 +190,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; const int tidx = 
threadIdx.x; const int bidx = blockIdx.x; @@ -214,8 +214,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); - platform::Load(beta_ptr + col * VecSize, &beta[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -225,10 +225,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( Vec residual[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); - platform::Load( - residual_ptr + row * LN_NUM_COLS + col * VecSize, &residual[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); + phi::Load(residual_ptr + row * LN_NUM_COLS + col * VecSize, + &residual[it]); col += THREADS_PER_ROW; } @@ -270,9 +269,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( // store dropout_residual_out and mask_out #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store( + phi::Store( x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize); - platform::Store( + phi::Store( mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } @@ -333,8 +332,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store(x[it], - y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 1b135ad6098e58f457f5d21e73ac6d1a6a7c4074..1d3085a013f81ee9dca21468476df8f621bb26c2 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -32,9 +32,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const T *__restrict__ bias, T *dst, MaskType *mask, const bool is_test, typename details::MPTypeTrait::Type *mean_val, typename details::MPTypeTrait::Type *var_val, Functor act_func) { - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; using U = typename details::MPTypeTrait::Type; LoadT src_vec; @@ -46,14 +46,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( residual_vec[ii] = static_cast(0); } // vectorize load data from global - platform::Load(&src[row_id * cols + col_id], &src_vec); + phi::Load(&src[row_id * cols + col_id], &src_vec); if (residual) { - platform::Load(&residual[row_id * cols + col_id], - &residual_vec); + phi::Load(&residual[row_id * cols + col_id], &residual_vec); } if (bias) { - platform::Load(&bias[col_id], &bias_vec); + phi::Load(&bias[col_id], &bias_vec); } MaskStoreT mask_vec; @@ -89,9 +88,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( } // store result to global - platform::Store(dest_vec, &dst[row_id * cols + col_id]); + phi::Store(dest_vec, &dst[row_id * cols + col_id]); if (!is_test) { - platform::Store(mask_vec, &mask[row_id * cols + col_id]); + phi::Store(mask_vec, &mask[row_id * cols + col_id]); } } @@ -176,21 
+175,21 @@ __global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; MaskLoadT mask_vec; - platform::Load(&dout[i], &dout_vec); - platform::Load(&mask[i], &mask_vec); + phi::Load(&dout[i], &dout_vec); + phi::Load(&mask[i], &mask_vec); StoreT dx_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dx_vec[ii] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; } - platform::Store(dx_vec, &dx[i]); + phi::Store(dx_vec, &dx[i]); } } @@ -209,9 +208,9 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum @@ -221,8 +220,8 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, LoadT out_vec; MaskLoadT mask_vec; StoreT dx_vec; - platform::Load(&dout[index], &out_vec); - platform::Load(&mask[index], &mask_vec); + phi::Load(&dout[index], &out_vec); + phi::Load(&mask[index], &mask_vec); #pragma unroll for (int i = 0; i < VecSize; i++) { @@ -230,7 +229,7 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, tmp_sum[i] += out_vec[i]; } - platform::Store(dx_vec, &dx[index]); + phi::Store(dx_vec, &dx[index]); } } diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index d419bd70e67db27b49d9abccd3dba3227692337a..717ec774414bf892218b6e6df73dbcd57ca3066d 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -19,9 +19,10 @@ limitations under the License. 
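// [Editor's sketch] The fused dropout kernels above all follow the same
// vectorized load/compute/store pattern that this patch re-points from
// paddle::platform::AlignedVector/Load/Store to phi::AlignedVector/Load/Store.
// A minimal CPU-side illustration of the idea (an alignment-friendly
// fixed-size chunk type plus chunked load and store); the struct and helper
// names here are illustrative stand-ins, not Paddle's actual definitions.
#include <cstdio>

template <typename T, int VecSize>
struct AlignedChunk {
  alignas(sizeof(T) * VecSize) T val[VecSize];
  T& operator[](int i) { return val[i]; }
  const T& operator[](int i) const { return val[i]; }
};

template <typename T, int VecSize>
void LoadChunk(const T* src, AlignedChunk<T, VecSize>* dst) {
  *dst = *reinterpret_cast<const AlignedChunk<T, VecSize>*>(src);
}

template <typename T, int VecSize>
void StoreChunk(const AlignedChunk<T, VecSize>& src, T* dst) {
  *reinterpret_cast<AlignedChunk<T, VecSize>*>(dst) = src;
}

int main() {
  constexpr int kVecSize = 4;
  alignas(16) float dout[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  alignas(16) float mask[8] = {1, 0, 1, 1, 0, 1, 0, 1};
  alignas(16) float dx[8] = {};
  const float factor = 2.0f;  // e.g. 1 / (1 - dropout_prob)
  for (int i = 0; i < 8; i += kVecSize) {
    AlignedChunk<float, kVecSize> dout_vec, mask_vec, dx_vec;
    LoadChunk<float, kVecSize>(&dout[i], &dout_vec);
    LoadChunk<float, kVecSize>(&mask[i], &mask_vec);
    for (int j = 0; j < kVecSize; ++j) {
      dx_vec[j] = dout_vec[j] * mask_vec[j] * factor;
    }
    StoreChunk<float, kVecSize>(dx_vec, &dx[i]);
  }
  std::printf("dx[0..3] = %g %g %g %g\n", dx[0], dx[1], dx[2], dx[3]);
  return 0;
}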
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/operators/index_impl.cu.h" + +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" DECLARE_bool(use_curand); @@ -79,10 +80,10 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { int64_t gen_offset = size * seed_offset.second; auto func = GaussianGenerator(mean, std, seed_offset.first, seed_offset.second); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } else { auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } }; diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 6b778eee4345170a0288bc5741c6c1078615022f..ef836ab72f001a540e081d7e9975ca5ee28758be 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -58,7 +58,7 @@ static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; for (; offset < n; offset += stride) { - using ArrT = platform::AlignedVector<__half, VecSize>; + using ArrT = phi::AlignedVector<__half, VecSize>; ArrT in_arr = *reinterpret_cast(x + offset); #pragma unroll for (int i = 0; i < VecSize; ++i) { @@ -77,7 +77,7 @@ static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; for (; offset < n; offset += stride) { - using ArrT = platform::AlignedVector<__half, VecSize>; + using ArrT = phi::AlignedVector<__half, VecSize>; ArrT x_in_arr = *reinterpret_cast(x + offset); ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); #pragma unroll @@ -103,7 +103,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( #define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ do { \ constexpr auto kAlignment = \ - alignof(platform::AlignedVector<__half, __vec_size>); \ + alignof(phi::AlignedVector<__half, __vec_size>); \ if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ is_aligned(y, kAlignment)) { \ size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ @@ -138,7 +138,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( #define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ do { \ constexpr auto kAlignment = \ - alignof(platform::AlignedVector<__half, __vec_size>); \ + alignof(phi::AlignedVector<__half, __vec_size>); \ if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ is_aligned(x_g, kAlignment)) { \ diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index 2e3e6569ef5a88f8dfcb6646974b70bcc6c0c95f..bb26e2f445e7034b8f982594216eacfd3007a24f 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -19,11 +19,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" namespace paddle { @@ -58,7 +58,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { int numel = out->numel(); T *out_data = out->mutable_data(dev_ctx.GetPlace()); if (numel <= 0) return; - int vec_size = paddle::platform::GetVectorizedSize(out_data); + int vec_size = phi::GetVectorizedSize(out_data); #ifdef PADDLE_WITH_XPU_KP int block = 64; int grid = 8; diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 62c21dd2eee401e5f8a526870015c18cf13ee873..412ae3c49b5f3cc9fc2422aa220af324e6d99b69 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -22,10 +22,10 @@ limitations under the License. */ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace paddle { namespace operators { @@ -186,8 +186,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ y_ptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -203,8 +203,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); - platform::Load(beta_ptr + col * VecSize, &beta[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -213,8 +213,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( Vec x[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); col += THREADS_PER_ROW; } U xf[LDGS * VecSize]; @@ -276,8 +275,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store(x[it], - y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } } @@ -401,9 +399,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( U *__restrict__ dgamma_temp_ptr, U *__restrict__ dbeta_temp_ptr, T *__restrict__ dx_ptr, const MaskType *mask_ptr = nullptr, T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { - using Vec = platform::AlignedVector; - using Vec_scale = 
platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -439,7 +437,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); col += THREADS_PER_ROW; } @@ -452,12 +450,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, - &dout[it]); - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); + phi::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, + &dout[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); if (isFusedDropoutResidualLn) { - platform::Load( + phi::Load( mask_ptr + row * LN_NUM_COLS + col * VecSize, &mask_vec[it]); } @@ -552,10 +549,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Store(x[it], - dx_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], dx_ptr + row * LN_NUM_COLS + col * VecSize); if (isFusedDropoutResidualLn) { - platform::Store( + phi::Store( dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize); } col += THREADS_PER_ROW; @@ -641,7 +637,7 @@ template < __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_, ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) { - using Vec = platform::AlignedVector; + using Vec = phi::AlignedVector; static_assert(VEC_COLS == LN_NUM_COLS / VecSize, ""); const int tidx = threadIdx.x; @@ -669,8 +665,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( for (int row = r; row < rows; row += ROWS_PER_CTA) { Vec dg; Vec db; - platform::Load(dg_part_ptr, &dg); - platform::Load(db_part_ptr, &db); + phi::Load(dg_part_ptr, &dg); + phi::Load(db_part_ptr, &db); dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h deleted file mode 100644 index e41f0aedf39ef582b4533b1eeb6ccda1e8ed7e49..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -using DataLayout = framework::DataLayout; - -/* - * \brief Compute the depthwise convolution which include - * forward process and backpropagation process - */ -template -class DepthwiseConvFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvInputGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* input_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvFilterGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* filter_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 42bf1f471deb5238fdb34dcd9284972930305f58..bc5a589ed6fb137c5013253a65971dcf80d4ac72 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace platform { class CPUDeviceContext; @@ -141,6 +143,116 @@ class Vol2ColFunctor { } }; +template +class Vol2ColFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* col, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol.dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol.dims().size())); + + PADDLE_ENFORCE_EQ(col->dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col->dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]); + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + // changed + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? 
paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + const T* vol_data = vol.data(); + T* col_data = col->data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + c_in; + } + col_data[col_idx] = + (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) + ? static_cast(0) + : vol_data[vol_idx]; + } + } + } + } + } +}; + /* * vol = [input_channels,input_depth, input_height, input_width] * col = @@ -258,10 +370,125 @@ class Col2VolFunctor { } }; +template +class Col2VolFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* vol, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol->dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol->dims().size())); + + PADDLE_ENFORCE_EQ(col.dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col.dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? 
vol->dims()[2] : vol->dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? vol->dims()[3] : vol->dims()[2]); + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + T* vol_data = vol->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = + ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + cIm; + } + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + template class Vol2ColFunctor; template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; + template class Col2VolFunctor; template class Col2VolFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 
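// [Editor's sketch] The Vol2ColFunctor/Col2VolFunctor specializations added
// above validate each spatial dimension with the standard convolution
// output-size formula:
//   out = (in + pad_front + pad_back - (dilation * (k - 1) + 1)) / stride + 1
// A tiny standalone check of that arithmetic; the concrete numbers are made
// up for illustration.
#include <cassert>

int ConvOutSize(int in, int pad_front, int pad_back, int dilation, int k,
                int stride) {
  return (in + pad_front + pad_back - (dilation * (k - 1) + 1)) / stride + 1;
}

int main() {
  // depth = 5, symmetric padding 1, dilation 1, kernel 3, stride 1 -> 5
  assert(ConvOutSize(5, 1, 1, 1, 3, 1) == 5);
  // width = 7, no padding, dilation 2, kernel 3, stride 2 -> (7 - 5) / 2 + 1 = 2
  assert(ConvOutSize(7, 0, 0, 2, 3, 2) == 2);
  return 0;
}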
2fdeecf89346fcf15f38b291ed5af49b8a2c8fc0..05cd264cf3ec9ee6e47d822d7e4d79ab7cd64441 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -33,7 +33,7 @@ USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { @@ -55,7 +55,7 @@ class CacheTester { onednn_dev_ctx_->ResetBlobMap(nullptr); } - bool Analyze(unsigned short int num_entries) { + bool Analyze(uint16_t num_entries) { // Number of created objects in cache should be as expected (num_entries) return onednn_dev_ctx_->GetCachedObjectsNumber() == num_entries; } diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h index ab8b4f2b8f4d37d4be62c5e1dd040a1461d0bdee..a3fbb0e59e24e9be67da5048ebc644f08b385bbf 100644 --- a/paddle/fluid/operators/optimizers/cast_with_ptr.h +++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h @@ -57,8 +57,7 @@ static void LaunchCastKernel(const platform::CUDADeviceContext &ctx, PADDLE_ENFORCE_NE( static_cast(x), static_cast(y), platform::errors::InvalidArgument("Inplace cast is not supported yet.")); - int vec_size = - std::min(platform::GetVectorizedSize(x), platform::GetVectorizedSize(y)); + int vec_size = std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(y)); switch (vec_size) { case 4: return details::VecCastKernel(ctx, x, y, n); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 8bb4606ffff151c6f65606d8dce156f98589a6b4..5b60f65442b55dc89a845859f153048e89704f70 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -19,11 +19,11 @@ #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h" #include "paddle/fluid/operators/optimizers/multi_tensor_apply.h" #include "paddle/fluid/operators/tensor_to_string.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -66,8 +66,8 @@ struct L2NormFunctor { int i; for (i = threadIdx.x * VecSize; i + VecSize <= size; i += (BlockDim * VecSize)) { - platform::AlignedVector tmp_vec; - platform::Load(ptr + i, &tmp_vec); + phi::AlignedVector tmp_vec; + phi::Load(ptr + i, &tmp_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { auto tmp = static_cast(tmp_vec[j]); @@ -111,9 +111,9 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { constexpr int max_load_bits = 128; int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); auto address = reinterpret_cast(ptr); - constexpr int vec8 = alignof(platform::AlignedVector); - constexpr int vec4 = alignof(platform::AlignedVector); - constexpr int vec2 = alignof(platform::AlignedVector); + constexpr int vec8 = alignof(phi::AlignedVector); + constexpr int vec4 = alignof(phi::AlignedVector); + constexpr int vec2 = alignof(phi::AlignedVector); chunk_size *= sizeof(T); if (address % vec8 == 0 && chunk_size % vec8 == 0) { return std::min(8, valid_vec_size); @@ -316,15 +316,15 @@ static __global__ void ScaleCUDAKernel(const T1 
*__restrict__ x, int stride = blockDim.x * gridDim.x * VecSize; for (; i + VecSize <= num; i += stride) { - platform::AlignedVector x_vec; - platform::AlignedVector y_vec; + phi::AlignedVector x_vec; + phi::AlignedVector y_vec; - platform::Load(x + i, &x_vec); + phi::Load(x + i, &x_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { y_vec[j] = static_cast(static_cast(x_vec[j]) * s); } - platform::Store(y_vec, y + i); + phi::Store(y_vec, y + i); } for (; i < num; ++i) { @@ -410,24 +410,24 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( int stride = blockDim.x * gridDim.x * VecSize; for (; i + VecSize <= num; i += stride) { - platform::AlignedVector param_vec; - platform::AlignedVector grad_vec; - platform::AlignedVector mom1_vec; - platform::AlignedVector mom2_vec; - platform::AlignedVector trust_ratio_div_vec; + phi::AlignedVector param_vec; + phi::AlignedVector grad_vec; + phi::AlignedVector mom1_vec; + phi::AlignedVector mom2_vec; + phi::AlignedVector trust_ratio_div_vec; T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; if (cur_weight_decay != static_cast(0.0)) { - platform::Load(param_p + i, ¶m_vec); + phi::Load(param_p + i, ¶m_vec); } else { #pragma unroll for (int j = 0; j < VecSize; ++j) { param_vec[j] = static_cast(0); } } - platform::Load(grad_p + i, &grad_vec); - platform::Load(mom1_p + i, &mom1_vec); - platform::Load(mom2_p + i, &mom2_vec); + phi::Load(grad_p + i, &grad_vec); + phi::Load(mom1_p + i, &mom1_vec); + phi::Load(mom2_p + i, &mom2_vec); #define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \ __trust_ratio_div, __idx) \ @@ -450,9 +450,9 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( mom2_vec, trust_ratio_div_vec, j); } - platform::Store(mom1_vec, mom1_p + i); - platform::Store(mom2_vec, mom2_p + i); - platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i); + phi::Store(mom1_vec, mom1_p + i); + phi::Store(mom2_vec, mom2_p + i); + phi::Store(trust_ratio_div_vec, trust_ratio_div_p + i); } for (; i < num; ++i) { @@ -632,29 +632,29 @@ struct LambUpdateParamAndBetaPowsFunctor { trust_ratio_div += offset; for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) { - platform::AlignedVector trust_ratio_div_vec; - platform::Load(trust_ratio_div + i, &trust_ratio_div_vec); + phi::AlignedVector trust_ratio_div_vec; + phi::Load(trust_ratio_div + i, &trust_ratio_div_vec); if (HasMasterParam) { - platform::AlignedVector master_param_vec; - platform::Load(master_param + i, &master_param_vec); - platform::AlignedVector param_vec; + phi::AlignedVector master_param_vec; + phi::Load(master_param + i, &master_param_vec); + phi::AlignedVector param_vec; #pragma unroll for (int j = 0; j < VecSize; ++j) { MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j]; master_param_vec[j] = p; param_vec[j] = static_cast(p); } - platform::Store(master_param_vec, master_param + i); - platform::Store(param_vec, param + i); + phi::Store(master_param_vec, master_param + i); + phi::Store(param_vec, param + i); } else { - platform::AlignedVector param_vec; - platform::Load(param + i, ¶m_vec); + phi::AlignedVector param_vec; + phi::Load(param + i, ¶m_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { MT p = static_cast(param_vec[j]) - ratio * trust_ratio_div_vec[j]; param_vec[j] = static_cast(p); } - platform::Store(param_vec, param + i); + phi::Store(param_vec, param + i); } } diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu 
index df5da1b79535cc6f5e4a638e9d32c367ea7cdb9f..fe5cd066864b82c734614e33869dff1734bee6d0 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -88,8 +88,8 @@ __device__ inline void VectorizeLarsUpdate( T* param_out, MT* velocity_out, const MT mu, MT local_lr, const MT lars_weight_decay, const MT rescale_grad, const int tid, const int grid_stride, const int numel, MT* master_param_out = nullptr) { - using VecType = paddle::platform::AlignedVector; - using VecMType = paddle::platform::AlignedVector; + using VecType = phi::AlignedVector; + using VecMType = phi::AlignedVector; int main = numel >> (VecSize >> 1); int tail_offset = main * VecSize; diff --git a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu index e8e4ff7010d3df01cda514d51796b789ef5e1da6..a724524716be39e554c6046ca809624b7fbb053a 100644 --- a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu +++ b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu @@ -39,9 +39,9 @@ TEST(test_reduce_rank_check, all) { } if (is_valid) { - phi::kernels::details::CheckReduceRank(reduce_rank, rank); + phi::funcs::details::CheckReduceRank(reduce_rank, rank); } else { - ASSERT_THROW(phi::kernels::details::CheckReduceRank(reduce_rank, rank), + ASSERT_THROW(phi::funcs::details::CheckReduceRank(reduce_rank, rank), paddle::platform::EnforceNotMet); } } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 3aab906804f7adb95f80aa2675f01217b0b48d39..eb76eee104889042e470e65414a011afd0420d0f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -23,8 +23,7 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/gpu/reduce.h" - +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace paddle { namespace operators { @@ -37,7 +36,7 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::kernels::TensorReduceImpl( + phi::funcs::TensorReduceImpl( static_cast(dev_ctx), x, y, transform, origin_reduce_dims, stream); } diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc index 88ef1f3ea4aa4d8d827a810026575c20e596b4e7..0372a79b967a48c63ad7912df264b1e519485c03 100644 --- a/paddle/fluid/operators/selu_op.cc +++ b/paddle/fluid/operators/selu_op.cc @@ -16,7 +16,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,10 +31,6 @@ class SeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - return UnaryOpUnchangedInferShape(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -121,7 +120,12 @@ class SeluGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType, ops::SeluGradMaker, - ops::SeluGradMaker); + ops::SeluGradMaker, + SeluInferShapeFunctor); + REGISTER_OPERATOR(selu_grad, ops::SeluGradOp); diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index a864c48ad757411861b6d2b3be40361c347601f8..b941dc21c3ab213e5abc2c4c908413b2b6222c41 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -25,8 +25,9 @@ DECLARE_bool(use_curand); #include #include #include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/operators/index_impl.cu.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" #endif namespace paddle { @@ -206,21 +207,21 @@ void UniformRandom(const framework::ExecutionContext& context, if (gen_cuda->GetIsInitPy() && seed_flag) { if (FLAGS_use_curand) { using MT = typename details::MPTypeTrait::Type; - distribution::uniform_distribution dist; - distribution::uniform_transform trans(min, max); - distribution::distribution_and_transform(dev_cxt, tensor, dist, trans); + phi::funcs::uniform_distribution dist; + phi::funcs::uniform_real_transform trans(min, max); + phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans); } else { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; auto func = UniformGeneratorOffset(min, max, seed_offset.first, diag_num, diag_step, diag_val, gen_offset); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } else { auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } #endif diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h index f26c4fdd17ad7290c71eddf80874f7fa9e115e4f..39eefab774dbe84801bda98c9821d8c801e7fd25 100644 --- a/paddle/fluid/platform/fast_divmod.h +++ b/paddle/fluid/platform/fast_divmod.h @@ -15,7 +15,7 @@ limitations under the License. 
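// [Editor's sketch] UniformRandom above now builds
// phi::funcs::uniform_real_transform (formerly distribution::uniform_transform).
// The removed distribution_helper.h shows the transform maps a generator
// sample from (0, 1] into [min, max) and folds the val == 1 edge case back
// onto min. A standalone sketch of that mapping, assuming the same semantics
// carried over to the phi version; the type name below is illustrative.
#include <cassert>

template <typename T>
struct UniformTransformSketch {
  UniformTransformSketch(T min, T max) : range_(max - min), min_(min) {}
  T operator()(T val) const {
    // curand/hiprand uniform generators return values in (0, 1]; map 1 -> min
    // so the result stays inside [min, max).
    return (val == static_cast<T>(1.0)) ? min_ : val * range_ + min_;
  }

 private:
  T range_;
  T min_;
};

int main() {
  UniformTransformSketch<float> trans(-2.0f, 2.0f);
  assert(trans(0.5f) == 0.0f);
  assert(trans(1.0f) == -2.0f);  // the edge case folds back to min
  return 0;
}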
*/ #pragma once #include -#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #define INT_BITS 32 @@ -25,7 +25,7 @@ namespace platform { struct FastDivMod { // 1st value represents the result of input number divides by recorded divisor // 2nd value represents the result of input number modulo by recorded divisor - using DivModT = AlignedVector; + using DivModT = phi::AlignedVector; FastDivMod() {} HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) { diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 35dbc96874d3c8e09fec3ad9b440619b180647fe..46cbb3358c6c4d6b2b17cfc1e549db6376931389 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -95,7 +95,7 @@ std::unique_ptr Profiler::Stop() { collector.ThreadNames(); for (const auto& kv : thread_names) { extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first), - kv.second); + std::string("%s"), kv.second.c_str()); } return std::unique_ptr( new platform::ProfilerResult(std::move(tree), extrainfo)); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 7647930ef079a8f0863ee1e5ab9e487b6a12aacb..0cfb08345b69770659f1f2caa56b35c994bd1535 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -757,7 +757,7 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, if (obj == Py_None) { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "bool, but got %s", + "int, float, bool or Tensor, but got %s", op_type, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT } @@ -784,7 +784,7 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "bool, but got %s", + "int, float, bool or Tensor, but got %s", op_type, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT } @@ -801,7 +801,7 @@ paddle::experimental::ScalarArray CastPyArg2ScalarArray( if (obj == Py_None) { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "bool, but got %s", + "list or Tensor, but got %s", op_type, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT } @@ -821,7 +821,7 @@ paddle::experimental::ScalarArray CastPyArg2ScalarArray( } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "bool, but got %s", + "list or Tensor, but got %s", op_type, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT } @@ -830,5 +830,44 @@ paddle::experimental::ScalarArray CastPyArg2ScalarArray( return paddle::experimental::ScalarArray({1}); } +paddle::experimental::Backend CastPyArg2Backend(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos) { + if (obj == Py_None) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "int or place, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + PyTypeObject* type = obj->ob_type; + auto type_name = std::string(type->tp_name); + if (type_name == "int") { + int value = CastPyArg2Int(obj, op_type, arg_pos); + return static_cast(value); + } else { + platform::Place place = CastPyArg2Place(obj, arg_pos); + return phi::TransToPhiBackend(place); + } + + return paddle::experimental::Backend::CPU; +} + +paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, 
+ const std::string& op_type, + ssize_t arg_pos) { + if (obj == Py_None) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "data_type, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + framework::proto::VarType::Type type = CastPyArg2ProtoType(obj, arg_pos); + return framework::TransToPhiDataType(type); +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 6e990691776ef92d5cbdab41bf85c0c9be8ca658..c5da1bb37af733ea7512ff5061f26a6d03f827dd 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -11,6 +11,8 @@ limitations under the License. */ #pragma once #include +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" @@ -100,6 +102,14 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, paddle::experimental::ScalarArray CastPyArg2ScalarArray( PyObject* obj, const std::string& op_type, ssize_t arg_pos); +paddle::experimental::Backend CastPyArg2Backend(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos); + +paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos); + paddle::optional GetOptionalTensorFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/infrt_ops.td index 00f94805c7db22e170c7395598bfe647174339c1..ecd7093e72b8ad41a6385acb81dacfb6a0c4197b 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/infrt_ops.td @@ -17,3 +17,10 @@ def Infrt_KernelOp : Infrt_Op<"kernel", [NoSideEffect]> { OptionalAttr:$attrs); let results = (outs Variadic); } + +def Infrt_CvtTensorOp : Infrt_Op<"cvt_tensor", [NoSideEffect]> { + let summary = "convert tensor type op"; + let description = [{convert tensor type op!}]; + let arguments = (ins AnyType:$input); + let results = (outs AnyType:$output); +} diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index d477b6b9bdc278b2408794fa4235d9c8bca5850a..a2677a946cb7e8377b780fd79794ee84dec8eb3c 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -5,5 +5,8 @@ endif() add_subdirectory(ir) add_subdirectory(pass) +add_executable(phi-ir-exec phi_ir_exec.cc) +target_link_libraries(phi-ir-exec infrt) + add_executable(phi-exec phi_exec.cc) target_link_libraries(phi-exec infrt) diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td index e9591e7f6d7e7d3bffdbef3d1fa3b81e53d9fc57..671646b9259ccfd2399862d71d6860db93608eb8 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td @@ -3,6 +3,7 @@ include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" +include "mlir/Interfaces/InferTypeOpInterface.td" def PHI_Dialect : Dialect { let name = "phi"; diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 6c0f6df892100b5a4d3c429d1248eb7851db4609..12a6cfcc3e4a810d19b5023cf97a6d739d50fb1f 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ 
b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -16,8 +16,10 @@ #include #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/kernel_registry.h" -namespace infrt { +#include "paddle/phi/kernels/declarations.h" +namespace infrt { +namespace { phi::Backend cvtTarget2Phi(TargetType target) { switch (target) { case TargetType::CPU: @@ -124,19 +126,76 @@ Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg) { cvtLayoutFromPhi(tensor_arg.layout)); } +} // namespace + +std::string getPhiTargetPrefix(TargetType target) { + switch (target) { + case TargetType::CPU: + return "phi_cpu."; + case TargetType::GPU: + return "phi_gpu."; + default: + LOG(FATAL) << "UnSupported target type !"; + return std::string(); + } +} +std::string getPhiPrecisionSuffix(PrecisionType precision) { + switch (precision) { + case PrecisionType::FLOAT32: + return ".float32"; + case PrecisionType::FLOAT16: + return ".float16"; + case PrecisionType::FLOAT64: + return ".float64"; + case PrecisionType::UINT8: + return ".uint8"; + case PrecisionType::INT8: + return ".int8"; + case PrecisionType::INT16: + return ".int16"; + case PrecisionType::INT32: + return ".int32"; + case PrecisionType::INT64: + return ".int64"; + case PrecisionType::COMPLEX64: + return ".complex64"; + case PrecisionType::COMPLEX128: + return ".complex128"; + case PrecisionType::BOOL: + return ".bool"; + default: + LOG(FATAL) << "UnSupported precision type !"; + return std::string(); + } +} +std::string getPhiLayoutSuffix(LayoutType layout) { + switch (layout) { + case LayoutType::NCHW: + return ".nchw"; + case LayoutType::NHWC: + return ".nhwc"; + case LayoutType::ANY: + return ".any"; + default: + LOG(FATAL) << "UnSupported layout type !"; + return std::string(); + } +} + std::vector getCandidateKernels( std::string name, const std::vector& valid_palces) { std::vector candidate_kernels; PhiKernelDesc phi_kernel_desc; phi::KernelKeyMap kernel_key_map = phi::KernelFactory::Instance().SelectKernelMap(name); - for (const Place& place : valid_palces) { + for (Place place : valid_palces) { phi::KernelKey kernel_key = cvtPlace2Phi(place); if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) { kernel_key = phi::KernelKey(kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()); if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue; + place.layout = LayoutType::ANY; } phi_kernel_desc.kernelType = place; phi_kernel_desc.inputsType.clear(); diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h index b74107f674e51f6ca09c864d197d9334a08666ac..34fd2f0f62dcd9b793f9157003bfd3772d0e1307 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -26,6 +26,10 @@ struct PhiKernelDesc { Place kernelType; // kernel place }; +std::string getPhiTargetPrefix(TargetType target); +std::string getPhiPrecisionSuffix(PrecisionType precision); +std::string getPhiLayoutSuffix(LayoutType layout); + std::vector getCandidateKernels( std::string name, const std::vector& valid_palces); diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc index df3472aa01dfb8bfa0e7f6122410c1b4788cd359..376ab31938a97b8501f35ba29612859ce2130772 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc @@ -18,11 +18,14 @@ #include #include #include +#include +#include #include #include #include #include 
"paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" @@ -58,8 +61,8 @@ void phiOpCvtPass::convertStage() { continue; } - phi::KernelSignature kernel_sign = - phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( + ::phi::KernelSignature kernel_sign = + ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( ProtoArgumentMappingContext(op)); // resort input&output according to kernel_sign ::llvm::SmallVector inputs, ori_output; @@ -104,13 +107,92 @@ void phiOpCvtPass::diapatchStage() { infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null(&op); if (nullptr != kernel_op) worklist.push_back(kernel_op); } - // ToDo: implementation in the next PR - while (!worklist.empty()) { - // infrt::KernelOp kernel_op = worklist.back(); - worklist.pop_back(); - // std::string kernel_name = kernel_op.name().str(); - // std::vector candidates = - // getCandidateKernels(kernel_name, valid_places_); + + mlir::OpBuilder builder(&block, block.begin()); + std::map phi_context; + for (infrt::KernelOp kernel_op : worklist) { + std::string kernel_name = kernel_op.name().str(); + std::vector candidates = + getCandidateKernels(kernel_name, valid_places_); + if (candidates.empty()) { + LOG(FATAL) << "No candidate kernels for op:" << kernel_name; + continue; + } + builder.setInsertionPoint(kernel_op); + + // Todo: Implimentation the concrete pass pick strategy + const PhiKernelDesc &phi_kernel_desc = candidates.front(); + + kernel_name = getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + + kernel_name + + getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout) + + getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision); + + // mlir::OperationName operation_name = kernel_op.getOperation()->getName(); + + mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); + mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); + + if (phi_context.find(phi_kernel_desc.kernelType.target) == + phi_context.end()) { + switch (phi_kernel_desc.kernelType.target) { + case TargetType::CPU: { + auto alloctor_value = + builder + .create( + kernel_op.getLoc(), + phi::AllocatorType::get(kernel_op.getContext(), + TargetType::CPU)) + .output(); + auto context_value = + builder + .create( + kernel_op.getLoc(), + phi::ContextType::get(kernel_op.getContext(), + TargetType::CPU), + alloctor_value) + .output(); + phi_context[TargetType::CPU] = context_value; + } break; + case TargetType::GPU: + case TargetType::UNK: + default: + LOG(FATAL) << "Unsupported TargetType"; + break; + } + } + operation_state.addOperands( + phi_context.at(phi_kernel_desc.kernelType.target)); + for (size_t index = 0; index < phi_kernel_desc.inputsType.size(); ++index) { + mlir::Value input = kernel_op.getOperand(index); + auto cvt_tensor_type_op = builder.create( + kernel_op.getLoc(), + DenseTensorType::get(kernel_op.getContext(), + phi_kernel_desc.inputsType[index].target, + phi_kernel_desc.inputsType[index].precision, + phi_kernel_desc.inputsType[index].layout), + input); + operation_state.addOperands(cvt_tensor_type_op.output()); + } + for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); + ++index) { + operation_state.addTypes( + DenseTensorType::get(kernel_op.getContext(), + phi_kernel_desc.outputsType[index].target, + phi_kernel_desc.outputsType[index].precision, + 
phi_kernel_desc.outputsType[index].layout)); + } + operation_state.addAttributes(kernel_op.attrsAttr().getValue()); + mlir::Operation *phi_operation = builder.createOperation(operation_state); + for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); + ++index) { + mlir::Value input = phi_operation->getResult(index); + auto cvt_tensor_type_op = builder.create( + kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); + kernel_op.getResult(index).replaceAllUsesWith( + cvt_tensor_type_op.output()); + } + kernel_op.erase(); } } } // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index ca8a22a7e75d33de6e9f510aea5aab8c24255c36..e4e9b5c3ff8a15dbe00dc1bd57fdce1a087437d8 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/core/compat/arg_map_context.h" namespace infrt { -class ProtoArgumentMappingContext : public phi::ArgumentMappingContext { +class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext { public: // only support op in pd dialect explicit ProtoArgumentMappingContext(mlir::Operation* op) diff --git a/paddle/infrt/dialect/phi/phi_exec.cc b/paddle/infrt/dialect/phi/phi_exec.cc index 4e99661a6a20590e7d36c1cf3a0e1e5d334b2464..a2808a00cb67da582ce4fc8b995772725d79e47e 100644 --- a/paddle/infrt/dialect/phi/phi_exec.cc +++ b/paddle/infrt/dialect/phi/phi_exec.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,37 +11,46 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include -#include -#include -#include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/mlir_loader.h" -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" -int main(int argc, char** argv) { - static llvm::cl::opt input_file( - llvm::cl::Positional, - llvm::cl::desc("Specify input filename"), - llvm::cl::init("-")); - - llvm::cl::ParseCommandLineOptions(argc, argv); +#include "paddle/infrt/host_context/paddle_mlir.h" - mlir::MLIRContext* context = infrt::Global::getMLIRContext(); - auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); +void print_usage() { + std::cout << "Error inputs format, two kinds of inputs are supported:\n"; + std::cout << " [1] ./paddle-mlir-convert $path_to_model_file " + "$path_to_params_file\n"; + std::cout << " [2] ./paddle-mlir-convert $path_to_model_dir(__model__ + " + "params)\n"; +} - module->dump(); - mlir::PassManager pm(context); +bool parse_inputs(int argc, + char** argv, + std::string* model_file_name, + std::string* params_file_name) { + switch (argc) { + case 1: { + print_usage(); + return false; + } + case 2: { + *model_file_name = std::string(argv[1]) + std::string("/__model__"); + *params_file_name = std::string(argv[1]) + std::string("/params"); + return true; + } + case 3: { + *model_file_name = argv[1]; + *params_file_name = argv[2]; + return true; + } + default: { return false; } + } +} - mlir::OpPassManager& phi_pass_manager = pm.nest(); - std::vector valid_places = {{infrt::TargetType::CPU, - infrt::PrecisionType::FLOAT32, - infrt::LayoutType::NCHW}}; - phi_pass_manager.addPass(std::make_unique(valid_places)); - if (mlir::failed(pm.run(*module))) { - std::cout << "\npass failed!\n" << std::endl; - return 4; +int main(int argc, char** argv) { + std::string model_file_name; + std::string params_file_name; + if (parse_inputs(argc, argv, &model_file_name, ¶ms_file_name)) { + MLIRModelGenImpl myGen; + auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name); + module_.dump(); } - module->dump(); - return 0; } diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc new file mode 100644 index 0000000000000000000000000000000000000000..1df929895b1c704644a7ef136d939996249eba7f --- /dev/null +++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
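+// Usage sketch for this driver (see the code below): it loads the given
+// .mlir file, loads all available dialects, runs phiOpCvtPass with a
+// {CPU, FLOAT32, NCHW} place list, and dumps the module before and after
+// the pass pipeline, e.g.
+//
+//   phi-ir-exec paddle/infrt/tests/dialect/pten/pten_pass.mlir
+//
+// which is the same invocation the updated RUN line of that lit test uses.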
+#include +#include +#include +#include +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" + +int main(int argc, char** argv) { + static llvm::cl::opt input_file( + llvm::cl::Positional, + llvm::cl::desc("Specify input filename"), + llvm::cl::init("-")); + + llvm::cl::ParseCommandLineOptions(argc, argv); + + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); + context->loadAllAvailableDialects(); + module->dump(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest(); + std::vector valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(std::make_unique(valid_places)); + if (mlir::failed(pm.run(*module))) { + std::cout << "\npass failed!\n" << std::endl; + return 4; + } + module->dump(); + return 0; +} diff --git a/paddle/infrt/host_context/CMakeLists.txt b/paddle/infrt/host_context/CMakeLists.txt index 11304742ecd41345aa45bb4e35e064d9745bc42f..14cbea70ca8415a2c53b0bbfb76750d8ad7354eb 100644 --- a/paddle/infrt/host_context/CMakeLists.txt +++ b/paddle/infrt/host_context/CMakeLists.txt @@ -12,6 +12,7 @@ gather_srcs(infrt_src SRCS function.cc mlir_function_executable.cc mlir_program_executor.cc + paddle_mlir.cc ) cc_test_tiny(test_infrt_host_context_value SRCS value_test.cc DEPS infrt ${MLIR_IR_LIBS}) @@ -21,7 +22,7 @@ cc_test_tiny(test_infrt_op_executable SRCS op_executable_test.cc DEPS infrt ${ML cc_test_tiny(test_infrt_core_runtime SRCS core_runtime_test.cc DEPS infrt ${MLIR_IR_LIBS}) cc_test_tiny(test_infrt_mlir_to_runtime_translate SRCS mlir_to_runtime_translate_test.cc DEPS infrt ${MLIR_IR_LIBS}) -add_executable(paddle-mlir-convert paddle_mlir.cc paddle_mlir_converter.cc) +add_executable(paddle-mlir-convert paddle_mlir_converter.cc) target_link_libraries(paddle-mlir-convert infrt ${MLIR_IR_LIBS}) add_executable(infrtexec mlir_exec.cc) target_link_libraries(infrtexec infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 7e7d77d3af741443d490dcfdd5b9ee6677b557ef..0ae482349cd07eafda1de5f634ed31a0c7310973 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -70,7 +70,7 @@ using ValueVariantType = backends::CpuPhiAllocator, backends::CpuPhiContext, ::phi::CPUContext, - std::vector, + std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, std::vector, diff --git a/paddle/infrt/pass/CMakeLists.txt b/paddle/infrt/pass/CMakeLists.txt deleted file mode 100755 index 51fecdf907798eb7280a17b294a263fe40993fe2..0000000000000000000000000000000000000000 --- a/paddle/infrt/pass/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(phi) diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index e5cc1ec1121fb7bbff2fad7856151916d8ea0924..5ce6d8673421ba3c53c9dad6d2fd1f20298f837a 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -1,6 +1,6 @@ configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" - DEPENDS infrtopt infrtexec) + DEPENDS infrtopt infrtexec phi-ir-exec) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in 
${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) diff --git a/paddle/infrt/tests/dialect/pten/pten_pass.mlir b/paddle/infrt/tests/dialect/pten/pten_pass.mlir index 30ff2636ae5a41674883e63ff931629a0d140b84..61a66cb3d71a372bcd67cb96362abcb033768e4d 100644 --- a/paddle/infrt/tests/dialect/pten/pten_pass.mlir +++ b/paddle/infrt/tests/dialect/pten/pten_pass.mlir @@ -1,4 +1,4 @@ -// RUN: infrtopt %s | FileCheck %s +// RUN: phi-ir-exec %s // CHECK-LABEL: @ops func @ops() { %a = pd.feed() {name="input0"} : !infrt.lod_tensor diff --git a/paddle/infrt/tests/lit.cfg.py.in b/paddle/infrt/tests/lit.cfg.py.in index d47957dac928409ad4b49884db9c70310b38d9ca..fe35dc4b8b3d436de5ec6893a8927eb49f1ab67d 100644 --- a/paddle/infrt/tests/lit.cfg.py.in +++ b/paddle/infrt/tests/lit.cfg.py.in @@ -23,9 +23,10 @@ config.llvm_tools_dir = os.path.join(build_dir, "/third_party/install/llvm/lib") infrtopt_bin = os.path.join(build_dir, "paddle/infrt/dialect/") trtexec_bin = os.path.join(build_dir, "paddle/infrt/dialect/tensorrt/") infrtexec_bin = os.path.join(build_dir, "paddle/infrt/host_context/") +phi_ir_exec_bin = os.path.join(build_dir, "paddle/infrt/dialect/phi") llvm_bin = os.path.join(build_dir, "third_party/install/llvm/bin/") config.environment['PATH'] = os.path.pathsep.join( - (infrtopt_bin, infrtexec_bin, trtexec_bin, llvm_bin, config.environment['PATH'])) + (infrtopt_bin, infrtexec_bin, trtexec_bin, phi_ir_exec_bin, llvm_bin, config.environment['PATH'])) config.suffixes = ['.mlir'] diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index f04e74b45fcd42cfeee860b05f52855ec15ef8f6..e1ebe8c6465cfdd7f8213c0a31416bc77412221c 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -71,11 +71,11 @@ paddle::optional MakeMetaTensor( } std::vector MakeMetaTensor( - const std::vector& tensors) { + const std::vector& tensors) { std::vector meta_tensors; meta_tensors.reserve(tensors.size()); - for (const auto& t : tensors) { - meta_tensors.emplace_back(t); + for (const auto* t : tensors) { + meta_tensors.emplace_back(*t); } return meta_tensors; } diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index 109c6e7ab71f5f889e63c410ee84aaad6c6b8110..01625f651c3bd1deaae43f735ac03fb2bc3f4e25 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -51,7 +51,7 @@ paddle::optional MakeMetaTensor( const paddle::optional& tensor); std::vector MakeMetaTensor( - const std::vector& tensors); + const std::vector& tensors); phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor); diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index f625d57df2ef2dc2f9505853dc5e07e5d9e0022e..688a0e54a0cf4f0f041704b03c5d256a7c17d1ec 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/phi/common/place.h" #include "paddle/utils/any.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 57e2db60c24caea8cbac323d9c47bdb53acc8a8c..213ac47d30bfdd28541bd1b9cb24bf2053b1c939 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -82,12 +82,11 @@ class KernelContext { } template - std::vector MoveInputsBetween(size_t start, size_t end) { - std::vector v; + std::vector InputsBetween(size_t start, size_t end) { + std::vector v; for (size_t i = start; i < end; ++i) { - auto t = static_cast(inputs_.at(i)); - v.emplace_back(*t); - inputs_[i] = nullptr; + auto* t = static_cast(inputs_.at(i)); + v.emplace_back(t); } return v; } diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 2b04d173af06990bbbec91d9a3c6e0929dd814cc..35e170a3fce5636f8d9ef904f6b78cacb66b3792 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -87,8 +87,8 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == - std::type_index(typeid(const std::vector&))) { + } else if (arg_type == std::type_index(typeid( + const std::vector&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index b582375155a1878c52fd8fe9fb13f6e715df7067..f7fa27b0744b6a23f9f2d208e2d0c56ec092baed 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -102,26 +102,26 @@ namespace phi { } \ } -#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ - template \ - struct KernelCallHelper&, Tail...> { \ - template \ - static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - const std::pair range = ctx->InputRangeAt(in_idx); \ - std::vector arg = std::move( \ - ctx->MoveInputsBetween(range.first, range.second)); \ - KernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg); \ - } \ +#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ + template \ + struct KernelCallHelper&, Tail...> { \ + template \ + static void Compute(KernelContext* ctx, PreviousArgs&... 
pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const std::pair range = ctx->InputRangeAt(in_idx); \ + std::vector arg = std::move( \ + ctx->InputsBetween(range.first, range.second)); \ + KernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ } #define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index ef51d6daf6a0052f39c2cf6253c208412cbb6904..4ffa1826a29fa3904b959a1e8f2fd9ceb27511b4 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -10,7 +10,7 @@ add_subdirectory(funcs) set_property(GLOBAL PROPERTY PHI_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor softmax) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h index 5ec2e35cc9b0cfe09fd281605984e72a603b8f5e..5d24f6684a48f2fb4a65673e0bb888a9f37b1246 100644 --- a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h +++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h @@ -21,7 +21,7 @@ namespace phi { template void BroadcastTensorsGradKernel(const Context& ctx, - const std::vector& dout, + const std::vector& dout, std::vector dx); } // namespace phi diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h index fb2a6f1136c26cb1bee1ca26ae7d214566862709..22b5201b6900dd18b645c2d7645adb96a6f11e91 100644 --- a/paddle/phi/kernels/broadcast_tensors_kernel.h +++ b/paddle/phi/kernels/broadcast_tensors_kernel.h @@ -21,7 +21,7 @@ namespace phi { template void BroadcastTensorsKernel(const Context& ctx, - const std::vector& x, + const std::vector& x, std::vector out); } // namespace phi diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index f13667881468e15183c3d770df638641f1dc6ed0..ed969e963ec0e4d9a7fe3b2ebc3df2253747df27 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -22,19 +22,19 @@ namespace phi { template void ConcatKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, const Scalar& axis, DenseTensor* out); template DenseTensor Concat(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, const Scalar& axis) { std::vector meta_x; meta_x.reserve(x.size()); std::vector meta_x_ptr; - for (const auto& t : x) { - meta_x.emplace_back(t); + for (const auto* t : x) { + meta_x.emplace_back(*t); meta_x_ptr.push_back(&meta_x.back()); } diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..339f1c00eaa505cdcf8976abae92a6c93cfd50eb --- /dev/null +++ b/paddle/phi/kernels/conv_grad_grad_kernel.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ConvGradGradKernel(const Context& dev_ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad); + +template +void Conv3DGradGradKernel(const Context& dev_ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/conv_grad_kernel.h b/paddle/phi/kernels/conv_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bad30989ac90d8b46fee99039c2772d00c7d939a --- /dev/null +++ b/paddle/phi/kernels/conv_grad_kernel.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
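+// The backward kernels declared below share one attribute list with the
+// forward kernels in conv_kernel.h. A direct call would look roughly like
+// the following sketch; the argument values are hypothetical and the usual
+// <T, Context> template order is assumed (real call sites are generated):
+//
+//   phi::ConvGradKernel<float>(dev_ctx, out_grad, input, filter,
+//                              /*strides=*/{1, 1}, /*paddings=*/{0, 0},
+//                              /*padding_algorithm=*/"EXPLICIT",
+//                              /*groups=*/1, /*dilations=*/{1, 1},
+//                              /*data_format=*/"NCHW", /*use_addto=*/false,
+//                              /*workspace_size_MB=*/512,
+//                              /*exhaustive_search=*/false,
+//                              &input_grad, &filter_grad);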
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ConvGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad); + +template +void Conv3DGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad); + +template +void DepthwiseConvGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* input_grad, + DenseTensor* filter_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/conv_kernel.h b/paddle/phi/kernels/conv_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..eb0bfdd0275b5050054c620e722b0e7653fd678a --- /dev/null +++ b/paddle/phi/kernels/conv_kernel.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
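+// Forward counterparts of the kernels in conv_grad_kernel.h: ConvKernel and
+// Conv3DKernel take the same attributes and produce a single `out` tensor,
+// while DepthwiseConvKernel adds an extra `fuse_relu` flag. Once the CPU
+// registrations in cpu/conv_kernel.cc are linked in, the registered
+// instances can be looked up by name, e.g. (illustrative):
+//
+//   auto kernels = phi::KernelFactory::Instance().SelectKernelMap("conv2d");
+//   // one entry per registered (backend, layout, dtype) combination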
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out); + +template +void Conv3DKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out); + +template +void DepthwiseConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc index 7a97f8c2189736452a722882f8d86a6cfaeae0f5..0869cd62024dc9e11ce1e1a1fc5349c0c966ef9e 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -59,7 +59,7 @@ namespace phi { template void BroadcastTensorsGradKernel(const Context& ctx, - const std::vector& dout, + const std::vector& dout, std::vector dx) { // Find reduce dimensions const auto& in_tensors = dout; @@ -85,7 +85,7 @@ void BroadcastTensorsGradKernel(const Context& ctx, // For each In-Out tensor pair, // Prepare and apply broadcast dims array for (size_t i = 0; i < num_ins; i++) { - const auto* input_tensor = &in_tensors[i]; + const auto* input_tensor = in_tensors[i]; auto* output_tensor = out_tensors[i]; const auto& input_dims = input_tensor->dims(); diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 5c4202837c4487361f33b849df7d975e85f8490d..6be825d4ef14e8e9aabf9c1b5b804c3ff5a18347 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -29,17 +29,17 @@ namespace phi { template void ConcatKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, const Scalar& axis_scalar, DenseTensor* out) { int64_t axis = axis_scalar.to(); - axis = phi::funcs::ComputeAxis(axis, x[0].dims().size()); + axis = phi::funcs::ComputeAxis(axis, x[0]->dims().size()); std::vector x_dims; x_dims.reserve(x.size()); for (size_t i = 0; i < x.size(); ++i) { - x_dims.push_back(x[i].dims()); + x_dims.push_back(x[i]->dims()); } phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis); @@ -47,13 +47,13 @@ void ConcatKernel(const Context& dev_ctx, out->mutable_data(dev_ctx.GetPlace()); // If axis is 0, the lod of the output is not the same as inputs. 
- if (axis == 0 && x[0].lod().size() > 0) { - size_t lod_size_0 = x[0].lod().size(); + if (axis == 0 && x[0]->lod().size() > 0) { + size_t lod_size_0 = x[0]->lod().size(); size_t lod_size = lod_size_0; for (size_t i = 1; i < x.size(); ++i) { - if (x[i].lod().size() > 0) { + if (x[i]->lod().size() > 0) { PADDLE_ENFORCE_EQ( - x[i].lod().size(), + x[i]->lod().size(), lod_size_0, phi::errors::Unimplemented( "The lod level of all input LoDTensors should be same. " @@ -61,7 +61,7 @@ void ConcatKernel(const Context& dev_ctx, "it is not supported currently. The lod level of %dth input " "is %d and first input is %d.", i, - x[i].lod().size(), + x[i]->lod().size(), lod_size_0)); } else { lod_size = 0; @@ -71,7 +71,7 @@ void ConcatKernel(const Context& dev_ctx, if (lod_size) { auto* out_lod = out->mutable_lod(); for (size_t i = 1; i < x.size(); ++i) { - auto in_lod = phi::ConvertToLengthBasedLoD(x[i].lod()); + auto in_lod = phi::ConvertToLengthBasedLoD(x[i]->lod()); phi::AppendLoD(out_lod, in_lod); } } @@ -80,28 +80,29 @@ void ConcatKernel(const Context& dev_ctx, // Sometimes direct copies will be faster, this maybe need deeply analysis. if (axis == 0 && x.size() < 10) { size_t output_offset = 0; - for (auto& in : x) { - if (in.numel() == 0UL) { + for (const auto* in : x) { + if (in->numel() == 0UL) { continue; } - auto in_stride = phi::stride_numel(in.dims()); + auto in_stride = phi::stride_numel(in->dims()); auto out_stride = phi::stride_numel(out->dims()); paddle::operators::StridedNumelCopyWithAxis( dev_ctx, axis, out->data() + output_offset, out_stride, - in.data(), + in->data(), in_stride, in_stride[axis]); output_offset += in_stride[axis]; } } else { + // TODO(chenweihang): concat functor support vector input std::vector inputs; inputs.reserve(x.size()); for (size_t j = 0; j < x.size(); ++j) { - if (x[j].numel() > 0) { - inputs.emplace_back(x[j]); + if (x[j]->numel() > 0) { + inputs.emplace_back(*x[j]); } else { continue; } diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f157bb017f81c5636a87ddcecb82e977e4fd18ba --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
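+// The CPU double-backward kernels in this file are thin wrappers: the
+// generic ConvGradGradKernel comes from impl/conv_grad_grad_kernel_impl.h,
+// and Conv3DGradGradKernel below simply forwards its arguments to it.
+// Roughly speaking, a registration such as
+//
+//   PD_REGISTER_KERNEL(conv3d_grad_grad, CPU, ALL_LAYOUT,
+//                      phi::Conv3DGradGradKernel, float, double) {}
+//
+// instantiates the kernel for float and double with phi::CPUContext and
+// makes it discoverable through phi::KernelFactory under that name.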
+ +#include "paddle/phi/kernels/conv_grad_grad_kernel.h" +#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void Conv3DGradGradKernel(const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvGradGradKernel(ctx, + input_grad_grad, + filter_grad_grad, + out_grad, + input, + filter, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + out_grad_grad, + input_grad, + filter_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { +} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + CPU, + ALL_LAYOUT, + phi::Conv3DGradGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..994ad861bd15b747524d7e0f47a0ed5b8ee465cd --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
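+// Same wrapper pattern as conv_grad_grad_kernel.cc: the shared
+// ConvGradKernel implementation comes from impl/conv_grad_kernel_impl.h,
+// Conv3DGradKernel forwards to it unchanged, and DepthwiseConvGradKernel
+// forwards to it as well, ignoring its extra `fuse_relu` flag on this CPU
+// path. Each of conv2d_grad, depthwise_conv2d_grad and conv3d_grad is then
+// registered for float and double below.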
+ +#include "paddle/phi/kernels/conv_grad_kernel.h" +#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void DepthwiseConvGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvGradKernel(dev_ctx, + out_grad, + input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + +template +void Conv3DGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvGradKernel(dev_ctx, + out_grad, + input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + conv2d_grad, CPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad, + CPU, + ALL_LAYOUT, + phi::DepthwiseConvGradKernel, + float, + double) {} + +PD_REGISTER_KERNEL( + conv3d_grad, CPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/conv_kernel.cc b/paddle/phi/kernels/cpu/conv_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0b4ee7d5776fdaf51955b2d35bb339735411a28 --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_kernel.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
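+// Forward CPU kernels, mirroring the grad files above: ConvKernel comes
+// from impl/conv_kernel_impl.h, Conv3DKernel forwards to it unchanged, and
+// DepthwiseConvKernel drops its `fuse_relu` flag before forwarding. The
+// registrations at the end of the file expose them as conv2d,
+// depthwise_conv2d and conv3d for float and double.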
+ +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/impl/conv_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void DepthwiseConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* out) { + ConvKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +template +void Conv3DKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out) { + ConvKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d, CPU, ALL_LAYOUT, phi::ConvKernel, float, double) {} + +PD_REGISTER_KERNEL(depthwise_conv2d, + CPU, + ALL_LAYOUT, + phi::DepthwiseConvKernel, + float, + double) {} + +PD_REGISTER_KERNEL(conv3d, CPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h new file mode 100644 index 0000000000000000000000000000000000000000..d26d89086b27e3db8ccfcf339c51c6a2fdf1988a --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -0,0 +1,91 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/ddim.h" + +namespace phi { + +template +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilation, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& strides, + const std::vector& ksize) { + // set padding size == data_dims.size() * 2 + auto data_shape = vectorize(data_dims); + if (static_cast(paddings->size()) == data_dims.size()) { + for (int i = 0; i < data_dims.size(); ++i) { + T copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + PADDLE_ENFORCE_EQ( + data_dims.size() * 2, + paddings->size(), + phi::errors::InvalidArgument( + "Attribute padding's size should be the same or twice as the " + "input's dimension. 
" + "But recieved: padding's size is %d, padding is [%s]; input's " + "dimension is %d, input's shape is [%s].", + paddings->size(), + make_ddim(*paddings), + data_dims.size(), + data_dims)); + } + + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < data_dims.size(); ++i) { + T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; + T pad_sum = + std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], + static_cast(0)); + T pad_0 = pad_sum / 2; + T pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + + // dilation + *(dilation->begin() + i) = 1; + } + + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +inline bool IsExpand(const std::vector& filter_dim, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + if (paddings.size() != strides.size()) { + for (size_t j = 0; j < paddings.size(); ++j) { + padding_0 = padding_0 && (paddings[j] == 0); + } + } + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +} // namespace phi diff --git a/paddle/phi/kernels/depthwise_conv_grad_kernel.h b/paddle/phi/kernels/depthwise_conv_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b5eff76e90c472f6deda08ebb560b4763337ab53 --- /dev/null +++ b/paddle/phi/kernels/depthwise_conv_grad_kernel.h @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi {} // namespace phi diff --git a/paddle/phi/kernels/depthwise_conv_kernel.h b/paddle/phi/kernels/depthwise_conv_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b5eff76e90c472f6deda08ebb560b4763337ab53 --- /dev/null +++ b/paddle/phi/kernels/depthwise_conv_kernel.h @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi {} // namespace phi diff --git a/paddle/phi/kernels/funcs/batch_norm_utils.h b/paddle/phi/kernels/funcs/batch_norm_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..21ebae8487ffc3588034a8ea5feeab8ac1c47fa8 --- /dev/null +++ b/paddle/phi/kernels/funcs/batch_norm_utils.h @@ -0,0 +1,143 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using Tensor = DenseTensor; + +template +inline void ResizeToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } +} + +template +inline void ResizeToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[4]; + in_dims_vec[4] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + 
transformed_input->mutable_data(context.GetPlace()); + } +} + +template +inline void TransToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + VLOG(5) << "Why am I called?"; + int dim = input->dims().size() - 2; + if (dim == 3) { + std::vector axis{0, 4, 1, 2, 3}; + phi::funcs::Transpose trans5; + trans5(context, *input, transformed_input, axis); + + } else if (dim == 2) { + std::vector axis{0, 3, 1, 2}; + phi::funcs::Transpose trans4; + trans4(context, *input, transformed_input, axis); + } else if (dim == 1) { + std::vector axis{0, 2, 1}; + phi::funcs::Transpose trans3; + trans3(context, *input, transformed_input, axis); + } +} + +template +inline void TransToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + std::vector axis{0, 2, 3, 4, 1}; + phi::funcs::Transpose trans5; + trans5(context, *input, transformed_input, axis); + + } else if (dim == 2) { + std::vector axis{0, 2, 3, 1}; + phi::funcs::Transpose trans4; + trans4(context, *input, transformed_input, axis); + } else if (dim == 1) { + std::vector axis{0, 2, 1}; + phi::funcs::Transpose trans3; + trans3(context, *input, transformed_input, axis); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index e9fd4cf47b834775c03e9b48ff1e3a5096228fb2..aab31cfbd55b64a957ca75840cc6c0bb41e3f8c0 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -493,16 +493,14 @@ void BroadcastKernelForDifferentVecSize( "%d-th output tensor`s shape is not.", i)); out_vec_size = std::min( - paddle::platform::GetVectorizedSize((*outs)[i]->data()), - out_vec_size); + phi::GetVectorizedSize((*outs)[i]->data()), out_vec_size); } } else { - out_vec_size = - paddle::platform::GetVectorizedSize((*outs)[0]->data()); + out_vec_size = phi::GetVectorizedSize((*outs)[0]->data()); } for (auto *in : ins) { - auto temp_size = paddle::platform::GetVectorizedSize(in->data()); + auto temp_size = phi::GetVectorizedSize(in->data()); in_vec_size = in->dims() == (*outs)[0]->dims() ? std::min(temp_size, in_vec_size) : in_vec_size; diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index f0793fb9d27db68f22bc2bc27978844072c61616..3ef39dc55d124b0fca30e44c2e07c5ce4c644a30 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/phi/core/hostdevice.h" #if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 235dbdd40f6b7db5524251aec80b92cdc22aa819..332ec0b0312da96ca21b2c616440afc57a62edc2 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -23,9 +23,9 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/function_traits.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #define HOSTDEVICE __host__ __device__ @@ -546,9 +546,8 @@ struct VecSizeGetter { const ArgsT &args, int *vec_size) { using Type = std::tuple_element_t; - *vec_size = std::min( - *vec_size, - paddle::platform::GetVectorizedSize(ins[Index]->data())); + *vec_size = std::min(*vec_size, + phi::GetVectorizedSize(ins[Index]->data())); } }; @@ -563,8 +562,8 @@ int GetVectorizedSizeForTensors(const std::vector &ins, // The Arg VecSize=1 is to match the Unroller template. Unroller::step(ins, arg, &vec_size); for (auto iter = outs.begin(); iter != outs.end(); ++iter) { - vec_size = std::min( - vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); + vec_size = + std::min(vec_size, phi::GetVectorizedSize((*iter)->data())); } return vec_size; } diff --git a/paddle/phi/kernels/funcs/padding.h b/paddle/phi/kernels/funcs/padding.h index 6d10ff2dfcf39c6b57084e99eb31fc1d888f5f75..e2c4e766b605b570463da12c39c456923c439916 100644 --- a/paddle/phi/kernels/funcs/padding.h +++ b/paddle/phi/kernels/funcs/padding.h @@ -15,10 +15,10 @@ limitations under the License. */ #pragma once #include #include -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h new file mode 100644 index 0000000000000000000000000000000000000000..7df772682ecf9d05f77edccdc38d93d4220c6496 --- /dev/null +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -0,0 +1,1240 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +// CUDA and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#include +#include +#include +#include +#include + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif + +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/utils/array.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" +#include "paddle/utils/string/string_helper.h" + +// Reduce split or not, Whether to use ReduceHigherDim +#define REDUCE_SPLIT_BOUNDARY 512 +#define REDUCE_VEC_SIZE 4 + +namespace kps = phi::kps; + +namespace phi { +namespace funcs { + +namespace details { + +static inline int GetLastPow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } + +// get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny +static inline std::vector GetDimStrides(const std::vector& dims, + const std::vector& idx) { + int n = static_cast(idx.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[idx[i + 1]]; + } + return strides; +} + +// get blockDim for reduceLastDim and reduceAny +static inline int GetBlockDim(int block_dim) { + return block_dim >= kps::details::kReduceMaxThread + ? kps::details::kReduceMaxThread + : GetLastPow2(block_dim); +} + +// check reduce rand is valid +static inline void CheckReduceRank(int reduce_rank, int rank) { + if (rank % 2 == 0) { + PADDLE_ENFORCE_EQ(reduce_rank, + rank / 2, + phi::errors::InvalidArgument( + "ReduceOp: invalid reduce rank. When rank = %d, " + "reduce_rank must be %d, but got %d.", + rank, + rank / 2, + reduce_rank)); + } else { + auto lower_rank = (rank - 1) / 2; + auto upper_rank = (rank + 1) / 2; + PADDLE_ENFORCE_EQ( + reduce_rank == lower_rank || reduce_rank == upper_rank, + true, + phi::errors::InvalidArgument( + "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank " + "must be %d or %d, but got %d.", + rank, + lower_rank, + upper_rank, + reduce_rank)); + } +} + +// convert dims from vector to array +template +static inline phi::Array VectorToArray( + const VectorLikeType& vec) { + PADDLE_ENFORCE_LE( + vec.size(), + ElementCount, + phi::errors::InvalidArgument("Cub reduce Array: size not match. 
Received " + "vec.size() %d > ElementCount %d.", + vec.size(), + ElementCount)); + size_t n = static_cast(vec.size()); + phi::Array ret; + for (size_t i = 0; i < n; ++i) { + ret[i] = vec[i]; + } + return ret; +} + +static inline std::vector GetReduceDim(const std::vector& dims, + int dim_size, + bool reduce_all) { + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(dim_size); + int reduce_size = reduce_dims.size(); + for (int i = 0; i < reduce_size; ++i) { + reduce_dims[i] = i; + } + } else { + for (auto e : dims) { + PADDLE_ENFORCE_LT(e, + dim_size, + phi::errors::InvalidArgument( + "ReduceOp: invalid axis, when x_dims is %d, " + "axis[i] should less than x_dims, but got %d.", + dim_size, + e)); + reduce_dims.push_back(e >= 0 ? e : e + dim_size); + } + } + return reduce_dims; +} + +} // namespace details + +constexpr int kMaxRank = phi::DDim::kMaxRank; + +enum ReduceType { + kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; + kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim + kReduceAny = 0x03, // when reduce_dim.size() > 1 +}; + +struct IndexCalculator { + IndexCalculator(int dim, + const std::vector& cal_dims, + const std::vector& cal_strides, + const std::vector& full_strides) + : dim(dim) { + dims = details::VectorToArray(cal_dims); + strides = details::VectorToArray(full_strides); + reduce_strides = details::VectorToArray(cal_strides); +#ifndef PADDLE_WITH_XPU_KP + std::vector cal_divmoders; + // fast divmod + for (auto i : cal_strides) { + cal_divmoders.push_back(paddle::platform::FastDivMod(i)); + } + divmoders = details::VectorToArray( + cal_divmoders); +#endif + } + + __device__ inline int operator()(int offset) const { +#ifdef PADDLE_WITH_XPU_KP + int index = 0; +#pragma unroll + for (int i = 0; i < kMaxRank; ++i) { + if (i == dim) { + break; + } + index += (offset / reduce_strides[i]) * strides[dims[i]]; + offset = offset % reduce_strides[i]; + } + return index; +#else + int index = 0; +#pragma unroll + for (int i = 0; i < kMaxRank; ++i) { + if (i == dim) { + break; + } + auto divmod = divmoders[i].Divmod(offset); + index += (divmod.val[0] * strides[dims[i]]); + offset = divmod.val[1]; + } + return index; +#endif + } + + int dim; + phi::Array dims; + phi::Array strides; + phi::Array reduce_strides; +#ifndef PADDLE_WITH_XPU2 + phi::Array divmoders; +#endif +}; + +template +struct ReduceIndexMapping { + const kps::DimConfig dim; + HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) + : dim(dims) {} + + __device__ __forceinline__ int BlockIdX() { +#ifdef PADDLE_WITH_XPU2 + if (ReduceLastDim) { + return (cluster_id() / dim.split_num_x % dim.split_num_y); + } else { + return cluster_id() % dim.split_num_x; + } +#else + return blockIdx.x; +#endif + } + + __device__ __forceinline__ int BlockIdY() { +#ifdef PADDLE_WITH_XPU2 + if (ReduceLastDim) { + return (cluster_id() % dim.split_num_x); + } else { + return (cluster_id() / dim.split_num_x % dim.split_num_y); + } +#else + return blockIdx.y; +#endif + } + + __device__ __forceinline__ int BlockDimX() { +#ifdef PADDLE_WITH_XPU2 + return dim.deal_size_x; +#else + return blockDim.x; +#endif + } + + __device__ __forceinline__ int BlockDimY() { +#ifdef PADDLE_WITH_XPU2 + return 1; +#else + return blockDim.y; +#endif + } + + __device__ __forceinline__ int GridDimX() { +#ifdef PADDLE_WITH_XPU2 + if (ReduceLastDim) { + return dim.split_num_y; + } else { + return dim.split_num_x; + } +#else + return gridDim.x; +#endif + } + + __device__ __forceinline__ int GridDimY() { +#ifdef 
PADDLE_WITH_XPU2 + if (ReduceLastDim) { + return dim.split_num_x; + } else { + return dim.split_num_y; + } +#else + return gridDim.y; +#endif + } + + __device__ __forceinline__ int GetLoopSize() { +#ifdef PADDLE_WITH_XPU2 + if (ReduceLastDim) { + return dim.deal_size_y; + } else { + return dim.deal_size_x; + } +#else + return 1; +#endif + } +}; + +// when reduce_type == kReduceLastDim this struct will be used +// for higher performance +struct OneDimIndexCal { + explicit OneDimIndexCal(int num) : stride(num) {} + + __device__ inline int operator()(int index) const { return index * stride; } + int stride; +}; + +// reduce config +template +struct ReduceConfig { + ReduceConfig(const std::vector& origin_reduce_dims, + const std::vector& origin_x_dim) + : reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {} + + // get the parameters of reduceKernel + void Run() { + // step1: update the reduce_dim left_dim and x_dim + SetReduceDim(); + + // step2: get the strides of dim for reduceAny and reduceLastDim + SetStrides(); + + // step3: get the type of reduce + SetReduceType(); + + // step4: set the block and grid for launch kernel + SetBlockDim(); + } + + // when should_reduce_again is true, we need malloc temp space for temp data + void SetOutputData(Ty* y_data, + const phi::GPUContext& dev_ctx, + phi::DenseTensor* tmp) { + if (should_reduce_again) { + tmp->ResizeAndAllocate(phi::make_ddim( + {static_cast(left_num * grid.z * grid.y * sizeof(Ty))})); + + output_data = dev_ctx.Alloc(tmp); + } else { + output_data = y_data; + } + } + + private: + // set reduce_dim, left_dim and update x_dim + // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1] + // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] + void SetReduceDim() { + std::set reduce_set; + for (auto e : reduce_dims_origin) { + auto pos = e >= 0 ? 
e : e + x_dim.size(); + reduce_set.insert(pos); + } + + std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); + std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); + + // update reduce_dim and x_dim + std::vector x_new_dim; + + reduce_dim.push_back(reduce_dim_temp[0]); + x_new_dim.push_back(x_dim[0]); + + int idx_reduce = 1; + int num = 0; + + if (reduce_dim_temp.size() > 1) { + for (int i = 1; i < x_dim.size(); i++) { + if ((idx_reduce < reduce_dim_temp.size()) && + (i == reduce_dim_temp[idx_reduce])) { + int result = + reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1]; + bool is_equal = ((result - num) == 1); + if (is_equal) { + x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; + num++; + } else { + reduce_dim.push_back(reduce_dim_temp[idx_reduce] - num); + x_new_dim.push_back(x_dim[i]); + } + idx_reduce++; + } else { + x_new_dim.push_back(x_dim[i]); + } + } + } else { + x_new_dim = x_dim; + } + + // update x_dim + x_dim = x_new_dim; + std::vector().swap(x_new_dim); + + std::vector reduce_dim_new; + int is_reduced = 0; + for (auto e : reduce_dim) { + is_reduced |= 1 << e; + } + + std::vector().swap(reduce_dim); + + for (int i = 0; i < x_dim.size(); i++) { + if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { + x_new_dim.push_back(x_dim[i]); + if ((is_reduced >> i) & 1) + reduce_dim_new.push_back(x_new_dim.size() - 1); + } else { + x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; + } + } + + x_dim = x_new_dim; + reduce_dim = reduce_dim_new; + + int x_rank = static_cast(x_dim.size()); + std::set left_set; + + for (int i = 0; i < x_rank; ++i) { + left_set.insert(i); + } + + for (auto e : reduce_dim) { + left_set.erase(e); + } + + left_dim.assign(left_set.begin(), left_set.end()); + + // if the last dim gets involved in reduction + reduce_last_dim = (reduce_dim.back() == x_dim.size() - 1); + } + + // set x_strides, reduce_strides, left_strides for reduceLastDim and reduceAny + // eg: x_dim = [8, 6], reduce_dim = [0], left_dim = [1] + // --SetStrides--> x_strides= [6,1], reduce_strides = [1], + // left_strides = [1] + void SetStrides() { + std::vector idx_dim; + for (int i = 0; i < x_dim.size(); i++) { + idx_dim.push_back(i); + } + + x_strides = details::GetDimStrides(x_dim, idx_dim); + reduce_strides = details::GetDimStrides(x_dim, reduce_dim); + left_strides = details::GetDimStrides(x_dim, left_dim); + reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; + + left_num = 1; + if (left_dim.size()) { + left_num = left_strides[0] * x_dim[left_dim[0]]; + } + } + + // get the reduceType + // eg: x_dim = [8, 6] reduce_dim = [0] --> ReduceHigherDim -->reduceFirstDim + // x_dim = [8, 6] reduce_dim = [1] --> reduceLastDim + // x_dim = [8] reduce_dim = [0] --> reduceAll + // x_dim = [8, 6, 4, 2] reduce_dim = [0, 2] --> reduceAny + void SetReduceType() { + int rank = x_dim.size(); + int reduce_rank = reduce_dim.size(); + bool is_last_dim = + (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); + if (rank == reduce_rank || is_last_dim) { +#ifdef PADDLE_WITH_XPU_KP + reduce_type = static_cast(ReduceType::kReduceAny); +#else + reduce_type = static_cast(ReduceType::kReduceLastDim); +#endif + } else if (reduce_rank == 1) { +// ReduceFirstDim and reduceSecondDim +#ifdef PADDLE_WITH_XPU_KP + if (reduce_dim[0] == 0) { + reduce_type = static_cast(ReduceType::kReduceHigherDim); + } else { + reduce_type = static_cast(ReduceType::kReduceAny); + } +#else + reduce_type = static_cast(ReduceType::kReduceHigherDim); +#endif + } else { + reduce_type = 
static_cast(ReduceType::kReduceAny); + } + } + +#ifndef PADDLE_WITH_XPU_KP + void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) { + constexpr int min_reduce_num_per_thread = 16; + constexpr int max_reduce_num_per_thread = 256; + constexpr int max_num_threads = kps::details::kReduceMaxThread; + + // set block size. + // 1. If reduce_last_dim == true, all the threads whose threadIdx.y are same + // will process the reduction for one output. + // The number of output for one block is blockDim.y; + // 2. If reduce_last_dim == false, different threadIdx.x will process + // different reduction and gets the output separately. If it is + // necessary, it should reduce in block y. + // The number of output for one block is blockDim.x; + int block_x, block_y; + int grid_num, reduce_num_per_thread; + if (reduce_last_dim) { + block_x = details::GetBlockDim(reduce_num); + block_y = details::GetBlockDim(left_num); + block_dim->x = block_x; + block_dim->y = + std::min(block_y, static_cast(max_num_threads / block_dim->x)); + grid_num = details::AlignUp(left_num, block_dim->y); + reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->x); + } else { + block_x = details::GetBlockDim(left_num); + block_y = details::GetBlockDim(reduce_num); + block_dim->x = std::min(block_x, 32); + block_dim->y = + std::min(block_y, static_cast(max_num_threads / block_dim->x)); + block_dim->x = + std::min(block_x, static_cast(max_num_threads / block_dim->y)); + grid_num = details::AlignUp(left_num, block_dim->x); + reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y); + } + int device_id = paddle::platform::GetCurrentDeviceId(); + int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); + int max_threads_per_mp = + paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); + int max_threads = max_threads_per_mp * max_mp; + int num_threads = block_dim->x * block_dim->y; + int max_num_blocks = max_threads / num_threads; + + // set grid size. + // Whether to set grid.y larger than 1, there are 3 following rules: + // 1. The number that each thread process should no less than + // min_reduce_num_per_threadbut no more than max_reduce_num_per_thread; + // 2. It should maximize the utilization of SM. + // So we choose the minimum between input_split_num_1 and input_split_num_3 + // to make each thread process as mush data as possible. Meanwhile, + // the number cannot be larger than max_reduce_num_per_thread, so we + // choose the maximum between the result above and input_split_num_2. + int input_split_num_1 = + details::AlignUp(reduce_num_per_thread, min_reduce_num_per_thread); + int input_split_num_2 = + details::AlignUp(reduce_num_per_thread, max_reduce_num_per_thread); + int input_split_num_3 = details::AlignUp(max_num_blocks, grid_num); + + grid_dim->x = grid_num; + grid_dim->y = std::max(std::min(input_split_num_1, input_split_num_3), + input_split_num_2); + // if grid.y > 1, we need launch reduce kernel again. 
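    // Worked example of the grid.y rule above (numbers assumed for
    // illustration, not taken from the patch): with
    // reduce_num_per_thread = 100, max_num_blocks = 80, grid_num = 8,
    //   input_split_num_1 = AlignUp(100, 16)  = 7   // most splits the >=16-per-thread rule allows
    //   input_split_num_2 = AlignUp(100, 256) = 1   // fewest splits the <=256-per-thread rule requires
    //   input_split_num_3 = AlignUp(80, 8)    = 10  // splits the SM budget can absorb
    // so grid_dim->y = max(min(7, 10), 1) = 7, and a second reduction pass is needed.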
+ if (grid_dim->y > 1) { + should_reduce_again = true; + } + } + + // set block and grid for launch kernel + // for ReduceHigherDim: if block is enough -> splite reduce_num + // else init block(32, 1) grid(block_num, 1) + // for others: block(block_num, 1) , grid(left_num, 1) + void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) { + int last_dim_num = x_dim.back(); + // update left_num + int grid_z = left_num / last_dim_num; + left_num = last_dim_num; + grid_dim->z = grid_z; + int device_id = paddle::platform::GetCurrentDeviceId(); + int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); + int max_threads_per_mp = + paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); + int max_threads = max_threads_per_mp * max_mp; + // init + int num_block = (max_threads / left_num); + block_dim->x = details::GetBlockDim(left_num); + grid_dim->x = details::AlignUp(left_num, block_dim->x); + blocking_size = reduce_num; + + if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { + blocking_size = details::GetLastPow2(reduce_num / num_block); + if (blocking_size <= 1) { + blocking_size = details::GetLastPow2(sqrt(reduce_num)); + } else if (blocking_size * 2 < reduce_num) { + blocking_size *= 2; + } + should_reduce_again = true; + grid_dim->y = details::AlignUp(reduce_num, blocking_size); + } + } +#endif + + void SetBlockDim() { + // init + int block_num = details::GetBlockDim(reduce_num); + should_reduce_again = false; + dim3 block_dim(block_num, 1, 1); + dim3 grid_dim(left_num, 1, 1); + blocking_size = reduce_num; +#ifdef PADDLE_WITH_XPU_KP + if (reduce_last_dim) { + block_dim.x = 64; + block_dim.y = reduce_num; + grid_dim.x = 1; + grid_dim.y = 8; + } else { + block_dim.x = 64; + block_dim.y = left_num; + grid_dim.x = 8; + grid_dim.y = 1; + } +#else + if (reduce_type == ReduceType::kReduceHigherDim) { + SetBlockDimForHigher(&block_dim, &grid_dim); + } else { + SetBlockDimForReduceAny(&block_dim, &grid_dim); + } +#endif + + block = block_dim; + grid = grid_dim; + } + + public: + std::vector reduce_dims_origin; + std::vector reduce_dim; + std::vector x_dim; + std::vector left_dim; + std::vector x_strides; + std::vector left_strides; + std::vector reduce_strides; + + int reduce_type; + int reduce_num; + int left_num; + int blocking_size; + bool should_reduce_again; + bool reduce_last_dim; + + Ty* output_data; + + dim3 block; + dim3 grid; +}; + +// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or +// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this +// function will be used +template +__global__ void ReduceAnyKernel(const Tx* x, + Ty* y, + ReduceOp reducer, + TransformOp transformer, + MPType init, + int reduce_num, + int left_num, + bool reduce_last_dim, + const Calculator reduce_index_calculator, + const Calculator left_index_calculator, + const kps::DimConfig dim) { + int input_idx, left_idx, stride; + int block_size = 0; + bool need_store = true; + int loop_left = 0; + int tid = 0; + // the last dim gets involved in reduction + int store_offset = 0; + int stride_left = 0; + if (reduce_last_dim) { + auto block = ReduceIndexMapping(dim); + input_idx = block.BlockIdY() * block.BlockDimX(); + left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y; + stride = block.GridDimY() * block.BlockDimX(); + block_size = block.BlockDimX(); + need_store = (THREAD_ID_X == 0) && (left_idx < left_num); + store_offset = block.BlockIdY() * left_num + left_idx; + loop_left = min(block.GetLoopSize(), left_num - left_idx); + stride_left = 1; + tid 
= THREAD_ID_X; + } else { + auto block = ReduceIndexMapping(dim); + input_idx = block.BlockIdY() * block.BlockDimY(); + left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X; + stride = block.GridDimY() * block.BlockDimY(); + block_size = block.BlockDimY(); + need_store = (THREAD_ID_Y == 0) && (left_idx < left_num); + loop_left = min(block.GetLoopSize(), left_num - left_idx); + stride_left = block.BlockDimX() * block.GridDimX(); + store_offset = block.BlockIdY() * left_num + left_idx; + tid = THREAD_ID_Y; + } + // calculate the offset, means the addr where each thread really start. + // 1. reduce for each thread + MPType input_compute[REDUCE_VEC_SIZE]; + Tx input_reg[REDUCE_VEC_SIZE]; + int input_idx_tmp = input_idx; + for (int i = 0; i < loop_left; i += stride_left) { + int input_offset = left_index_calculator(left_idx + i); + const _ptr_ Tx* input = x + input_offset; + MPType reduce_var = init; + // load REDUCE_VEC_SIZE data once, and then compute + int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; + input_idx = input_idx_tmp; + for (; input_idx + block_size < bound; + input_idx += REDUCE_VEC_SIZE * stride) { + kps::ReadDataReduce, + false>(&input_reg[0], + input, + input_idx, + reduce_index_calculator, + 1, + reduce_num, + 1, + stride, + kps::IdentityFunctor(), + reduce_last_dim); + kps::ElementwiseUnary( + &input_compute[0], &input_reg[0], transformer); + kps::Reduce( + &reduce_var, &input_compute[0], reducer, reduce_last_dim); + } + + kps::Init(&input_compute[0], init); + kps::ReadDataReduce(&input_compute[0], + input, + input_idx, + reduce_index_calculator, + 1, + reduce_num - input_idx, + 1, + stride, + transformer, + reduce_last_dim); + kps::Reduce( + &reduce_var, &input_compute[0], reducer, reduce_last_dim); + + kps::Reduce( + &reduce_var, &reduce_var, reducer, reduce_last_dim); + if (need_store) { + y[store_offset + i] = static_cast(reduce_var); + } + } +} + +template +__global__ void ReduceHigherDimKernel(const Tx* x, + Ty* y, + ReduceOp reducer, + TransformOp transformer, + MPType init, + int reduce_num, + int left_num, + int blocking_size, + const kps::DimConfig dim) { + // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this + // function will be used + auto block = ReduceIndexMapping(dim); + int idy = block.BlockIdY() * blocking_size; + int idx = block.BlockIdX() * block.BlockDimX(); + int idz = BLOCK_ID_Z * left_num; + int stride = dim.split_num_x * dim.deal_size_x; + int size = left_num - dim.rem_x; + int loop_size = min(reduce_num - idy, blocking_size); + int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY(); + int block_offset = idy * left_num + idz * reduce_num; + const _ptr_ Tx* input = x + block_offset; + Tx reduce_input; + for (; idx < size; idx += stride) { + MPType reduce_var = init; + MPType reduce_compute = init; + for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { + kps::ReadData(&reduce_input, + input + loop_idx * left_num + idx, + block.BlockDimX(), + 1, + 1, + left_num); + kps::ElementwiseUnary( + &reduce_compute, &reduce_input, transformer); + kps::Reduce( + &reduce_var, &reduce_compute, reducer, false); + } + Ty result = static_cast(reduce_var); + kps::WriteData( + y + store_offset + idx, &result, block.BlockDimX()); + } + + if (idx < left_num) { + MPType reduce_var = init; + MPType reduce_compute = init; + for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { + kps::ReadData(&reduce_input, + input + loop_idx * left_num + idx, + dim.rem_x, + 1, + 1, + left_num); + kps::ElementwiseUnary( + 
&reduce_compute, &reduce_input, transformer); + kps::Reduce( + &reduce_var, &reduce_compute, reducer, false); + } + Ty result = static_cast(reduce_var); + kps::WriteData( + y + store_offset + idx, &result, dim.rem_x); + } +} + +template +static void LaunchReduceKernel(const Tx* x_data, + Ty* y_data, + const ReduceOp& reducer, + const TransformOp& transform, + MPType init, + KPStream stream, + ReduceConfig config) { + if (config.reduce_type == kReduceLastDim) { + int stride_reduce = 1; + int stride_left = config.reduce_num; + // for higher performance + auto reduce_index_calculator = OneDimIndexCal(stride_reduce); + auto left_index_calculator = OneDimIndexCal(stride_left); + + kps::DimConfig dim = kps::DimConfig(config.grid.x, + config.grid.y, + config.grid.z, + config.block.x, + config.block.y, + 0); + dim.SetRem(config.reduce_num % config.block.x, 0, 0); + +#ifdef PADDLE_WITH_XPU_KP + ReduceAnyKernel<<<8, 64, 0, stream>>>( + x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); +#else + ReduceAnyKernel<<>>( + x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); +#endif + + } else { + int reduce_rank = config.reduce_strides.size(); + int left_rank = config.left_strides.size(); + auto reduce_index_calculator = IndexCalculator(reduce_rank, + config.reduce_dim, + config.reduce_strides, + config.x_strides); + auto left_index_calculator = IndexCalculator( + left_rank, config.left_dim, config.left_strides, config.x_strides); + + kps::DimConfig dim = kps::DimConfig(config.grid.x, + config.grid.y, + config.grid.z, + config.block.x, + config.block.y, + 0); + dim.SetRem(config.reduce_num % config.block.x, 0, 0); + +#ifdef PADDLE_WITH_XPU_KP + ReduceAnyKernel<<<8, 64, 0, stream>>>( + x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); +#else + ReduceAnyKernel<<>>( + x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); +#endif + } + + if (config.should_reduce_again) { + dim3 block; + dim3 grid; + if (config.reduce_last_dim) { + block = dim3(32, 1, 1); + grid = dim3(details::AlignUp(config.left_num, 32), 1, 1); + } else { + block = dim3(config.block.x, 1, 1); + grid = dim3(config.grid.x, 1, config.grid.z); + } + + auto last_index = OneDimIndexCal(1); + auto first_index = OneDimIndexCal(config.left_num); + kps::DimConfig dim = + kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); + dim.SetRem(config.left_num % block.x, 0, 0); +#ifdef PADDLE_WITH_XPU_KP + ReduceHigherDimKernel< + Ty, + Ty, + MPType, + ReduceOp, + kps::IdentityFunctor><<<8, 64, 0, stream>>>( + config.output_data, + y_data, + reducer, + kps::IdentityFunctor(), + init, + config.grid.y, + config.left_num, + config.grid.y, + dim); +#else + ReduceHigherDimKernel< + Ty, + Ty, + MPType, + ReduceOp, + kps::IdentityFunctor><<>>( + config.output_data, + y_data, + reducer, + kps::IdentityFunctor(), + init, + config.grid.y, + config.left_num, + config.grid.y, + dim); +#endif + } +} + +template class ReduceOp, + typename TransformOp> +static typename std::enable_if::value, + void>::type 
+CubTensorReduceImpl(const Tx* x_data, + Ty* y_data, + const TransformOp& transform, + int reduce_num, + const phi::GPUContext& dev_ctx, + KPStream stream) { + auto reducer = ReduceOp(); + cub::TransformInputIterator trans_x(x_data, + transform); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, + temp_storage_bytes, + trans_x, + y_data, + reduce_num, + reducer, + reducer.initial(), + stream); + phi::DenseTensor tmp = + phi::Empty(dev_ctx, {static_cast(temp_storage_bytes)}); + + auto* temp_storage = dev_ctx.Alloc(&tmp); + + cub::DeviceReduce::Reduce(temp_storage, + temp_storage_bytes, + trans_x, + y_data, + reduce_num, + reducer, + reducer.initial(), + stream); +} + +template class ReduceOp, + typename TransformOp> +static typename std::enable_if::value, + void>::type +CubTensorReduceImpl(const Tx* x_data, + Ty* y_data, + const TransformOp& transform, + int reduce_num, + const phi::GPUContext& dev_ctx, + KPStream stream) { + PADDLE_THROW(phi::errors::InvalidArgument( + "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); +} + +template class ReduceOp, + typename TransformOp> +void TensorReduceImpl(const phi::GPUContext& dev_ctx, + const phi::DenseTensor& x, + phi::DenseTensor* y, + const TransformOp& transform, + const std::vector& origin_reduce_dims, + KPStream stream) { + dev_ctx.Alloc(y); + + auto x_dim = phi::vectorize(x.dims()); + auto config = ReduceConfig(origin_reduce_dims, x_dim); + config.Run(); + int numel = x.numel(); + // after config.run() + // SetOutputData for ReduceHigherDim when should_reduce_again is true, + // temp_output should be stored temp_data in output_data space or stored in + // y_data; + + phi::DDim tmp_ddim; + phi::DenseTensor tmp = phi::Empty(dev_ctx); + + auto x_data = x.data(); + auto y_data = y->data(); + + if (config.reduce_num == 1) { + std::vector inputs = {&x}; + std::vector outputs = {y}; + funcs::ElementwiseKernel(dev_ctx, inputs, &outputs, transform); + return; + } + + config.SetOutputData(y_data, dev_ctx, &tmp); + constexpr bool kIsTxFP16 = std::is_same::value; + bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; +#ifndef PADDLE_WITH_XPU_KP + if (use_cub_reduce) { + CubTensorReduceImpl( + x_data, y_data, transform, config.reduce_num, dev_ctx, stream); + return; + } +#endif + + using MPType = typename kps::details::MPTypeTrait::Type; + auto reducer = ReduceOp(); + // launch ReduceHigherDimKernel + // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this + // function will be used + // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 + // if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / + // 32 + // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 + if (config.reduce_type == ReduceType::kReduceHigherDim) { + kps::DimConfig dim = kps::DimConfig(config.grid.x, + config.grid.y, + config.grid.z, + config.block.x, + config.blocking_size, + 0); + dim.SetRem(config.left_num % config.block.x, + config.reduce_num % config.blocking_size, + 0); + +#ifdef PADDLE_WITH_XPU_KP + ReduceHigherDimKernel, + TransformOp><<<8, 64, 0, stream>>>( + x_data, + config.output_data, + reducer, + transform, + reducer.initial(), + config.reduce_num, + config.left_num, + config.blocking_size, + dim); +#else + ReduceHigherDimKernel< + Tx, + Ty, + MPType, + ReduceOp, + TransformOp><<>>( + x_data, + config.output_data, + reducer, + transform, + reducer.initial(), + config.reduce_num, + config.left_num, + config.blocking_size, + dim); +#endif + + if (config.should_reduce_again) { + 
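      // Annotation (inferred from the surrounding code, not stated in the
      // patch): the first ReduceHigherDimKernel launch above has already
      // applied the transform and left grid.y partial results per output
      // element in config.output_data (sized left_num * grid.y * grid.z by
      // SetOutputData), so the second launch below only folds those partials
      // into y_data, which is why it runs with kps::IdentityFunctor instead
      // of the user-supplied transform.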
dim3 block = dim3(config.block.x, 1, 1); + dim3 grid = dim3(config.grid.x, 1, config.grid.z); + kps::DimConfig dim2 = + kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); + dim2.SetRem(config.left_num % config.block.x, 0, 0); + +#ifdef PADDLE_WITH_XPU_KP + ReduceHigherDimKernel< + Ty, + Ty, + MPType, + ReduceOp, + kps::IdentityFunctor><<<8, 64, 0, stream>>>( + config.output_data, + y_data, + reducer, + kps::IdentityFunctor(config.grid.y), + reducer.initial(), + config.grid.y, + config.left_num, + config.grid.y, + dim2); +#else + ReduceHigherDimKernel< + Ty, + Ty, + MPType, + ReduceOp, + kps::IdentityFunctor><<>>( + config.output_data, + y_data, + reducer, + kps::IdentityFunctor(config.grid.y), + reducer.initial(), + config.grid.y, + config.left_num, + config.grid.y, + dim2); +#endif + } + return; + } + + // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or + // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this + // function will be used + LaunchReduceKernel, TransformOp>( + x_data, y_data, reducer, transform, reducer.initial(), stream, config); +} + +} // namespace funcs + +} // namespace phi + +#endif diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index 2b6140d2fde0d3bcef3f15c4414444f1d2099b2e..79d8a7b0f3444b4272d1affd67bd5ac943f2c1cc 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/bernoulli_kernel.h" + #include #include #ifdef __NVCC__ @@ -28,7 +30,6 @@ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/bernoulli_kernel.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" // See Note [ Why still include the fluid headers? 
] diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 6fb24d72145c67be2ad1d25620e7886326e8cd6f..926dffc7450dc6db0ff9d2384e92a9ece374026c 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -20,14 +20,14 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" namespace phi { template void BroadcastTensorsGradKernel(const Context& ctx, - const std::vector& dout, + const std::vector& dout, std::vector dx) { // Find reduce dimensions const auto& in_tensors = dout; @@ -54,7 +54,7 @@ void BroadcastTensorsGradKernel(const Context& ctx, // For each In-Out tensor pair, // Prepare and apply broadcast dims array for (size_t i = 0; i < num_ins; i++) { - auto* input_tensor = &in_tensors[i]; + auto* input_tensor = in_tensors[i]; auto* output_tensor = out_tensors[i]; const DDim& input_dims = input_tensor->dims(); @@ -87,7 +87,7 @@ void BroadcastTensorsGradKernel(const Context& ctx, *input_tensor, ctx.GetPlace(), ctx, output_tensor); } else { // reduce_sum implementation on CUDA - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, *input_tensor, output_tensor, diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index 569a46f56d5638584262c0d1c8002459fa8ffd70..542234c80b5a1e945aec7c8342d31ef9b676cce8 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -20,11 +20,11 @@ #include "paddle/phi/kernels/funcs/elementwise_base.h" // See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/compare_kernel.cu b/paddle/phi/kernels/gpu/compare_kernel.cu index 272448504acd7a1c8248b67e0560cc359516ba6a..9c02627e5463b125076d86daef1b52fe4502a7e0 100644 --- a/paddle/phi/kernels/gpu/compare_kernel.cu +++ b/paddle/phi/kernels/gpu/compare_kernel.cu @@ -80,7 +80,7 @@ inline void CompareAllKernelImpl(const Context& ctx, for (int i = 0; i < reduce_dims.size(); ++i) { reduce_dims[i] = i; } - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, tmp, out, kps::IdentityFunctor(), reduce_dims, ctx.stream()); } diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index 2b04b979c20aa71cc723610d013cd12fb5537a29..accb1cc3d77e3ccd14b4d7808b781cf255eddd06 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -29,16 +29,16 @@ namespace phi { template void ConcatKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, const Scalar& axis_scalar, DenseTensor* out) { int64_t axis = axis_scalar.to(); - axis = phi::funcs::ComputeAxis(axis, x[0].dims().size()); + axis = phi::funcs::ComputeAxis(axis, x[0]->dims().size()); std::vector x_dims; for (size_t i = 0; i < x.size(); ++i) { - x_dims.push_back(x[i].dims()); + x_dims.push_back(x[i]->dims()); } phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis); @@ -46,13 +46,13 @@ void ConcatKernel(const Context& dev_ctx, out->mutable_data(dev_ctx.GetPlace()); // If axis is 0, the lod of the output is not the same as inputs. - if (axis == 0 && x[0].lod().size() > 0) { - size_t lod_size_0 = x[0].lod().size(); + if (axis == 0 && x[0]->lod().size() > 0) { + size_t lod_size_0 = x[0]->lod().size(); size_t lod_size = lod_size_0; for (size_t i = 1; i < x.size(); ++i) { - if (x[i].lod().size() > 0) { + if (x[i]->lod().size() > 0) { PADDLE_ENFORCE_EQ( - x[i].lod().size(), + x[i]->lod().size(), lod_size_0, phi::errors::Unimplemented( "The lod level of all input LoDTensors should be same. " @@ -60,7 +60,7 @@ void ConcatKernel(const Context& dev_ctx, "it is not supported currently. The lod level of %dth input " "is %d and first input is %d.", i, - x[i].lod().size(), + x[i]->lod().size(), lod_size_0)); } else { lod_size = 0; @@ -70,7 +70,7 @@ void ConcatKernel(const Context& dev_ctx, if (lod_size) { auto* out_lod = out->mutable_lod(); for (size_t i = 1; i < x.size(); ++i) { - auto in_lod = phi::ConvertToLengthBasedLoD(x[i].lod()); + auto in_lod = phi::ConvertToLengthBasedLoD(x[i]->lod()); phi::AppendLoD(out_lod, in_lod); } } @@ -79,18 +79,18 @@ void ConcatKernel(const Context& dev_ctx, // Sometimes direct copies will be faster, this maybe need deeply analysis. 
if (axis == 0 && x.size() < 10) { size_t output_offset = 0; - for (auto& in : x) { - if (in.numel() == 0UL) { + for (auto* in : x) { + if (in->numel() == 0UL) { continue; } - auto in_stride = phi::stride_numel(in.dims()); + auto in_stride = phi::stride_numel(in->dims()); auto out_stride = phi::stride_numel(out->dims()); paddle::operators::StridedNumelCopyWithAxis( dev_ctx, axis, out->data() + output_offset, out_stride, - in.data(), + in->data(), in_stride, in_stride[axis]); output_offset += in_stride[axis]; @@ -98,8 +98,8 @@ void ConcatKernel(const Context& dev_ctx, } else { std::vector inputs; for (size_t j = 0; j < x.size(); ++j) { - if (x[j].numel() > 0) { - inputs.push_back(x[j]); + if (x[j]->numel() > 0) { + inputs.push_back(*x[j]); } else { continue; } diff --git a/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6449a193a082e5d17926de9252a79e4c069be224 --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_grad_grad_kernel.h" +#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4df7bb26adf845b4a8f52f1c92beb8621002b3da --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_grad_kernel.cu @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
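Each of the new GPU conv files in this patch follows the same shape: a kernel template (sometimes a thin wrapper) in namespace phi, then a PD_REGISTER_KERNEL entry whose arguments, judging from the usages shown here, are the op name, backend, layout tag, kernel template, and the dtypes to instantiate. A hedged sketch with a made-up kernel, only to spell that pattern out:

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"

namespace phi {

// Hypothetical kernel used only to illustrate the registration arguments.
template <typename T, typename Context>
void MyCopyKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {
  dev_ctx.template Alloc<T>(out);
  // ... fill *out from x ...
}

}  // namespace phi

PD_REGISTER_KERNEL(my_copy,            // op name used for kernel lookup
                   GPU,                // backend
                   ALL_LAYOUT,         // accepted tensor layout
                   phi::MyCopyKernel,  // kernel function template
                   float,              // dtype instantiations follow
                   double) {}

The trailing {} appears in every registration in this patch and is left empty here as well.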
+ +#include "paddle/phi/kernels/conv_grad_kernel.h" +#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void Conv3DGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvGradKernel(dev_ctx, + out_grad, + input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + conv2d_grad, GPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {} + +PD_REGISTER_KERNEL( + conv3d_grad, GPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/conv_kernel.cu b/paddle/phi/kernels/gpu/conv_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..680ee4426af0661d39d4c7bd0abf9e52c4594995 --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_kernel.cu @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/impl/conv_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void Conv3DKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out) { + ConvKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d, GPU, ALL_LAYOUT, phi::ConvKernel, float, double) {} + +PD_REGISTER_KERNEL(conv3d, GPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/conv_test_kernel.cu b/paddle/phi/kernels/gpu/conv_test_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0544a1e298b8e7dc871d13f546398a5c28308b0e --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_test_kernel.cu @@ -0,0 +1,13 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/phi/kernels/gpu/depthwise_conv.h similarity index 62% rename from paddle/fluid/operators/math/depthwise_conv.cu rename to paddle/phi/kernels/gpu/depthwise_conv.h index a4665a8f9a62dde6bfdbad3b05d7065e05f0a92f..5270a4b2fdb8d77aa1dfb20a166a9676b007c93f 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#pragma once #include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/hostdevice.h" + #ifdef __NVCC__ #include #endif @@ -21,7 +25,7 @@ limitations under the License. */ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/math/depthwise_conv.h" + #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -30,6 +34,58 @@ namespace paddle { namespace operators { namespace math { +using DataLayout = framework::DataLayout; + +/* + * \brief Compute the depthwise convolution which include + * forward process and backpropagation process + */ +template +class DepthwiseConvFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* output, + const DataLayout data_layout = DataLayout::kNCHW); +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* input_grad, + const DataLayout data_layout = DataLayout::kNCHW); +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* filter_grad, + const DataLayout data_layout = DataLayout::kNCHW); +}; + template static __forceinline__ __device__ T WarpReduceSum(T val, int warp_size) { typedef cub::WarpReduce WarpReduce; @@ -293,8 +349,12 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( } } -template +template __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { int final_filter_multiplier = filter_multiplier; 
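  // Annotation on the "Sp" (specialized) kernels (inferred from the branches
  // below, not stated in the patch): c_filter_multiplier, c_stride and
  // c_filter are compile-time mirrors of the runtime arguments, with 0 / -1
  // meaning "not specialized"; in that case final_filter_multiplier and the
  // strides keep their runtime values and the generic kernel bodies are used.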
int h_stride = stride_height; @@ -306,34 +366,88 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { } if (c_filter == -1) { if (data_layout != DataLayout::kNHWC) { - KernelDepthwiseConvNCHW( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data); + KernelDepthwiseConvNCHW(input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + output_data); } else { - KernelDepthwiseConvNHWC( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data); + KernelDepthwiseConvNHWC(input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + output_data); } } else { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvCFilterNCHW( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, output_data); } else { KernelDepthwiseConvCFilterNHWC( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, output_data); } } @@ -464,7 +578,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC( } } -template __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( ARG_DEFINE_KernelDepthwiseConvInputGrad) { @@ -525,7 +641,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( } } -template __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( ARG_DEFINE_KernelDepthwiseConvInputGrad) { @@ -595,8 +713,12 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( } } -template +template __global__ void KernelDepthwiseConvInputGradSp( ARG_DEFINE_KernelDepthwiseConvInputGrad) { int final_filter_multiplier = filter_multiplier; @@ -611,36 +733,100 @@ __global__ void KernelDepthwiseConvInputGradSp( if (c_filter_multiplier == 0 || 
c_filter == -1) { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvInputGradNCHW( - input_data, output_grad_data, filter_data, batch_size, - output_channels, output_height, output_width, input_channels, - input_height, input_width, final_filter_multiplier, filter_height, - filter_width, h_stride, w_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } else { KernelDepthwiseConvInputGradNHWC( - input_data, output_grad_data, filter_data, batch_size, - output_channels, output_height, output_width, input_channels, - input_height, input_width, final_filter_multiplier, filter_height, - filter_width, h_stride, w_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } } else { if (data_layout != DataLayout::kNHWC) { - KernelDepthwiseConvInputGradCFilterNCHW( - input_data, output_grad_data, filter_data, batch_size, - output_channels, output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, filter_height, - filter_width, c_stride, c_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + c_filter_multiplier, + filter_height, + filter_width, + c_stride, + c_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } else { - KernelDepthwiseConvInputGradCFilterNHWC( - input_data, output_grad_data, filter_data, batch_size, - output_channels, output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, filter_height, - filter_width, c_stride, c_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + c_filter_multiplier, + filter_height, + filter_width, + c_stride, + c_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } } } @@ -648,13 +834,25 @@ __global__ void KernelDepthwiseConvInputGradSp( // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 
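// Annotation (not part of the patch): the filter-grad kernels below linearize
// their 3-D launch grid as
//   gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x
// i.e. blockIdx.x varies fastest, so gbid walks the blocks in the same
// row-major order as a nested (z, y, x) loop. Each thread then accumulates
// into the local partial sum `T s` declared just below.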
template __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { + const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { T s = 0; int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; @@ -697,13 +895,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( template __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { + const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { int bid = blockIdx.z; int image_h = blockIdx.y; int kernel_iw = blockIdx.x % filter_width; @@ -743,13 +953,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( template __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { + const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { const int bid = blockIdx.z; int image_h = blockIdx.x * 
dilate_height + blockIdx.y; if (image_h >= output_height) { @@ -804,16 +1026,31 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( } } -template -__global__ void KernelDepthwiseConvFilterGradSp( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { +template +__global__ void KernelDepthwiseConvFilterGradSp(const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { int final_filter_multiplier = filter_multiplier; int h_stride = stride_height; int w_stride = stride_width; @@ -825,34 +1062,91 @@ __global__ void KernelDepthwiseConvFilterGradSp( if (c_filter_multiplier == 0 || c_filter == -1) { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvFilterGradNCHW( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } else { KernelDepthwiseConvFilterGradNHWC( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } } else { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvFilterGradNCHW( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } else { - KernelDepthwiseConvFilterGradCFilterNHWC( - output_grad_data, input_data, num, output_channels, output_height, - 
output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } } @@ -864,15 +1158,15 @@ __global__ void KernelDepthwiseConvFilterGradSp( * height and width, respectively. */ template -class DepthwiseConvFunctor { +class DepthwiseConvFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output, + const std::vector& dilations, + framework::Tensor* output, const DataLayout data_layout = DataLayout::kNCHW) { const int batch_size = input.dims()[0]; const int input_channels = @@ -905,12 +1199,14 @@ class DepthwiseConvFunctor(context.GetPlace()); std::vector perm_axis({2, 3, 0, 1}); - phi::funcs::TransposeNormal trans; + phi::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); filter_data = filter_hwc.data(); } @@ -940,7 +1236,8 @@ class DepthwiseConvFunctor<<>>( \ - input_data, filter_data, batch_size, output_channels, output_height, \ - output_width, input_channels, input_height, input_width, \ - filter_multiplier, ksize_height, ksize_width, stride_height, \ - stride_width, padding_height, padding_width, dilate_height, \ - dilate_width, output_data); \ - } else { \ - KernelDepthwiseConvSp< \ - T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ - fuse_relu_before_conv><<>>( \ - input_data, filter_data, batch_size, output_channels, output_height, \ - output_width, input_channels, input_height, input_width, \ - filter_multiplier, ksize_height, ksize_width, stride_height, \ - stride_width, padding_height, padding_width, dilate_height, \ - dilate_width, output_data); \ - } \ - return; \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + if (c_filter == -1) { \ + threads.x = block_size; \ + grid.x = grid_size; \ + threads.y = threads.z = grid.y = grid.z = 1; \ + } \ + if (data_layout != DataLayout::kNHWC) { \ + KernelDepthwiseConvSp< \ + T, \ + c_filter_multiplier, \ + c_stride, \ + c_filter, \ + DataLayout::kNCHW, \ + fuse_relu_before_conv><<>>( \ + input_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + output_data); \ + } else { \ + KernelDepthwiseConvSp< \ + T, \ + c_filter_multiplier, \ + c_stride, \ + c_filter, \ + DataLayout::kNHWC, \ + fuse_relu_before_conv><<>>( \ + input_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + 
ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + output_data); \ + } \ + return; \ } check_case(1, 1, 3); check_case(1, 1, 5); @@ -1004,10 +1337,9 @@ class DepthwiseConvFunctor -class DepthwiseConvInputGradFunctor { +class DepthwiseConvInputGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& filter, const framework::Tensor& output_grad, @@ -1048,12 +1380,14 @@ class DepthwiseConvInputGradFunctor(context.GetPlace()); std::vector perm_axis({2, 3, 0, 1}); - phi::funcs::TransposeNormal trans; + phi::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); filter_data = filter_hwc.data(); } @@ -1078,7 +1412,8 @@ class DepthwiseConvInputGradFunctor<<>>( \ - input_data, output_grad_data, filter_data, batch_size, \ - output_channels, output_height, output_width, input_channels, \ - input_height, input_width, filter_multiplier, ksize_height, \ - ksize_width, stride_height, stride_width, padding_height, \ - padding_width, dilate_height, dilate_width, input_grad_data); \ + input_data, \ + output_grad_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + input_grad_data); \ } else { \ KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ + T, \ + c_filter_multiplier, \ + c_stride, \ + c_filter, \ + DataLayout::kNHWC, \ fuse_relu_before_conv><<>>( \ - input_data, output_grad_data, filter_data, batch_size, \ - output_channels, output_height, output_width, input_channels, \ - input_height, input_width, filter_multiplier, ksize_height, \ - ksize_width, stride_height, stride_width, padding_height, \ - padding_width, dilate_height, dilate_width, input_grad_data); \ + input_data, \ + output_grad_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + input_grad_data); \ } \ return; \ } @@ -1129,10 +1502,11 @@ class DepthwiseConvInputGradFunctor -class DepthwiseConvFilterGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& output_grad, const std::vector& strides, @@ -1187,7 +1561,8 @@ class DepthwiseConvFilterGradFunctor<<>>( \ - output_grad_data, input_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, filter_grad_data); \ + output_grad_data, \ + input_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ 
+ dilate_width, \ + filter_grad_data); \ } else { \ framework::Tensor filter_grad_hwc; \ if (c_filter != -1) { \ - framework::DDim filter_grad_hwc_dims( \ - {filter_grad->dims()[2], filter_grad->dims()[3], \ - filter_grad->dims()[0], filter_grad->dims()[1]}); \ + framework::DDim filter_grad_hwc_dims({filter_grad->dims()[2], \ + filter_grad->dims()[3], \ + filter_grad->dims()[0], \ + filter_grad->dims()[1]}); \ filter_grad_hwc.Resize(filter_grad_hwc_dims); \ filter_grad_hwc.mutable_data(context.GetPlace()); \ - phi::funcs::SetConstant set_zero; \ + phi::funcs::SetConstant set_zero; \ set_zero(context, &filter_grad_hwc, static_cast(0)); \ filter_grad_data = filter_grad_hwc.data(); \ } else { \ @@ -1231,16 +1625,34 @@ class DepthwiseConvFilterGradFunctor<<>>( \ - output_grad_data, input_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, filter_grad_data); \ + output_grad_data, \ + input_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + filter_grad_data); \ if (c_filter != -1) { \ std::vector perm_axis({2, 3, 0, 1}); \ - phi::funcs::TransposeNormal trans; \ + phi::funcs::TransposeNormal trans; \ trans(context, filter_grad_hwc, filter_grad, perm_axis); \ } \ } \ @@ -1263,31 +1675,23 @@ class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; -template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFunctor; -template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; -template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4f27b6fde99ffa652c734d363ad7731f75b495f4 --- /dev/null +++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu @@ -0,0 +1,142 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" + +namespace phi { + +template +void DepthwiseConvGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + const DenseTensor* output_grad = &out_grad; + + if (!input_grad && !filter_grad) return; + + std::vector strides = strides_t; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + // update padding and dilation + auto in_dims = input.dims(); + auto filter_dims = filter.dims(); + + DDim in_data_dims; + const paddle::framework::DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format); + if (data_layout != paddle::framework::DataLayout::kNHWC) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + } + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + bool is_sys_pad = strides.size() * 2 == paddings.size() ? 
false : true; + if (!is_sys_pad) { + for (size_t i = 0; i < strides.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + phi::funcs::SetConstant set_zero; + + if (input_grad) { + input_grad->mutable_data(dev_ctx.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + + if (fuse_relu) { + paddle::operators::math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, + input, + filter, + *output_grad, + strides, + paddings, + dilations, + input_grad, + data_layout); + } else { + paddle::operators::math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, + input, + filter, + *output_grad, + strides, + paddings, + dilations, + input_grad, + data_layout); + } + } + + if (filter_grad) { + filter_grad->mutable_data(dev_ctx.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + if (fuse_relu) { + paddle::operators::math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, + input, + *output_grad, + strides, + paddings, + dilations, + filter_grad, + data_layout); + } else { + paddle::operators::math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, + input, + *output_grad, + strides, + paddings, + dilations, + filter_grad, + data_layout); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(depthwise_conv2d_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c50ceae33fc790a763e02bed9b6bed4879b2c547 --- /dev/null +++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu @@ -0,0 +1,130 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
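For reference, the padding normalization used by DepthwiseConvGradKernel above (and repeated in the forward DepthwiseConvKernel below) works as follows: UpdatePaddingAndDilation may expand paddings to explicit (before, after) pairs per spatial dimension; when that happens (is_sys_pad == false), the erase loop keeps only the first entry of each pair. Below is a minimal standalone sketch of just that collapse step, with hypothetical values.

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> strides = {1, 1};
  std::vector<int> paddings = {1, 1, 2, 2};  // {top, bottom, left, right}

  // Same test and loop as in the kernels above: two entries per spatial dim
  // means explicit (before, after) pairs, so drop the second entry of each pair.
  const bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
  if (!is_sys_pad) {
    for (size_t i = 0; i < strides.size(); ++i) {
      paddings.erase(paddings.begin() + i + 1);
    }
  }

  // Prints: paddings = 1 2   (i.e. {top, left})
  std::printf("paddings =");
  for (int p : paddings) std::printf(" %d", p);
  std::printf("\n");
  return 0;
}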
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/operators/conv_op.h" + +#include "paddle/phi/kernels/gpu/depthwise_conv.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +namespace phi { + +template +void DepthwiseConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* out) { + DenseTensor* output = out; + output->mutable_data(dev_ctx.GetPlace()); + + const std::vector strides = strides_t; + std::vector dilations = dilations_t; + std::vector paddings = paddings_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + if (channel_last) { + PADDLE_ENFORCE_EQ( + output->dims()[output->dims().size() - 1] % + input.dims()[input.dims().size() - 1], + 0, + phi::errors::InvalidArgument( + "ShapeError: The output channels must be a multiple of the " + "input channels. But receivced output channel number is %d " + "and input channel number is %d", + output->dims()[output->dims().size() - 1], + input.dims()[input.dims().size() - 1])); + } else { + PADDLE_ENFORCE_EQ( + output->dims()[1] % input.dims()[1], + 0, + phi::errors::InvalidArgument( + "ShapeError: The output channels must be a multiple of the " + "input channels. But receivced output channel number is %d " + "and input channel number is %d", + output->dims()[1], + input.dims()[1])); + } + + // update padding and dilation + auto in_dims = input.dims(); + auto filter_dims = filter.dims(); + + DDim in_data_dims; + const paddle::framework::DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format); + if (data_layout != paddle::framework::DataLayout::kNHWC) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + } + + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true; + if (!is_sys_pad) { + for (size_t i = 0; i < strides.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + + if (fuse_relu) { + paddle::operators::math::DepthwiseConvFunctor + depthwiseConv; + depthwiseConv(dev_ctx, + input, + filter, + strides, + paddings, + dilations, + output, + data_layout); + } else { + paddle::operators::math::DepthwiseConvFunctor + depthwiseConv; + depthwiseConv(dev_ctx, + input, + filter, + strides, + paddings, + dilations, + output, + data_layout); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(depthwise_conv2d, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index b17196b6b11566927a02b81aa7447ba77c1884ff..20799f4e37b3bda787f54ed3687120696e0d4537 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -16,7 +16,7 @@ limitations under the License. 
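The reduce-related hunks that follow move TensorReduceImpl out of paddle/phi/kernels/gpu/reduce.h (namespace phi::kernels) into paddle/phi/kernels/funcs/reduce_function.h (namespace phi::funcs); call sites switch the include and the namespace but keep the same argument list. A hedged sketch of a migrated call site is below, in the style of DefaultElementwiseAddGrad; the exact template arguments and the wrapper name SumOverBroadcastDims are assumptions for illustration, based on the signature shown in the removed gpu/reduce.h.

#include <vector>

#include "paddle/phi/kernels/funcs/reduce_function.h"  // was paddle/phi/kernels/gpu/reduce.h

// Sketch of a migrated call site, not a new API: sum-reduce `dout` into `dx`
// over `reduce_dims`. Only the include and the namespace change
// (kernels:: -> funcs::); the argument list stays as before.
template <typename T>
void SumOverBroadcastDims(const phi::GPUContext& ctx,
                          const phi::DenseTensor& dout,
                          phi::DenseTensor* dx,
                          const std::vector<int>& reduce_dims) {
  gpuStream_t stream = ctx.stream();
  phi::funcs::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
      ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}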
*/ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_grad_base.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace phi { @@ -84,7 +84,7 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } @@ -99,7 +99,7 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); } } @@ -197,7 +197,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } @@ -218,7 +218,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); } } diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index 94c2e980e36a1c6e1f7af3a92da6a7c0f0ed291c..0319de7558e824926e37469a3c222c1f9a9673fc 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -17,1229 +17,9 @@ // CUDA and HIP use same api #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include -#include -#include -#include - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif - -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/fast_divmod.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/utils/array.h" -#include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" -#include "paddle/phi/kernels/primitive/kernel_primitives.h" - -// Reduce split or not, Whether to use ReduceHigherDim -#define REDUCE_SPLIT_BOUNDARY 512 -#define REDUCE_VEC_SIZE 4 - -namespace kps = phi::kps; +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace phi { -namespace kernels { - -namespace details { - -static inline int GetLastPow2(int n) { - n |= (n >> 1); - n |= (n >> 2); - n |= (n >> 4); - n |= (n >> 8); - n |= (n >> 16); - return std::max(1, n - (n >> 1)); -} - -static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } - -// get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny -static inline std::vector GetDimStrides(const std::vector& dims, - const std::vector& idx) { - int n = static_cast(idx.size()); - if (n == 0) return std::vector(); - std::vector strides(n); - strides.back() = 1; - for (int i = n - 2; i >= 0; --i) { - strides[i] = strides[i + 
1] * dims[idx[i + 1]]; - } - return strides; -} - -// get blockDim for reduceLastDim and reduceAny -static inline int GetBlockDim(int block_dim) { - return block_dim >= kps::details::kReduceMaxThread - ? kps::details::kReduceMaxThread - : GetLastPow2(block_dim); -} - -// check reduce rand is valid -static inline void CheckReduceRank(int reduce_rank, int rank) { - if (rank % 2 == 0) { - PADDLE_ENFORCE_EQ(reduce_rank, - rank / 2, - phi::errors::InvalidArgument( - "ReduceOp: invalid reduce rank. When rank = %d, " - "reduce_rank must be %d, but got %d.", - rank, - rank / 2, - reduce_rank)); - } else { - auto lower_rank = (rank - 1) / 2; - auto upper_rank = (rank + 1) / 2; - PADDLE_ENFORCE_EQ( - reduce_rank == lower_rank || reduce_rank == upper_rank, - true, - phi::errors::InvalidArgument( - "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank " - "must be %d or %d, but got %d.", - rank, - lower_rank, - upper_rank, - reduce_rank)); - } -} - -// convert dims from vector to array -template -static inline phi::Array VectorToArray( - const VectorLikeType& vec) { - PADDLE_ENFORCE_LE( - vec.size(), - ElementCount, - phi::errors::InvalidArgument("Cub reduce Array: size not match. Received " - "vec.size() %d > ElementCount %d.", - vec.size(), - ElementCount)); - size_t n = static_cast(vec.size()); - phi::Array ret; - for (size_t i = 0; i < n; ++i) { - ret[i] = vec[i]; - } - return ret; -} - -static inline std::vector GetReduceDim(const std::vector& dims, - int dim_size, - bool reduce_all) { - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(dim_size); - int reduce_size = reduce_dims.size(); - for (int i = 0; i < reduce_size; ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - PADDLE_ENFORCE_LT(e, - dim_size, - phi::errors::InvalidArgument( - "ReduceOp: invalid axis, when x_dims is %d, " - "axis[i] should less than x_dims, but got %d.", - dim_size, - e)); - reduce_dims.push_back(e >= 0 ? 
e : e + dim_size); - } - } - return reduce_dims; -} - -} // namespace details - -constexpr int kMaxRank = phi::DDim::kMaxRank; - -enum ReduceType { - kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; - kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim - kReduceAny = 0x03, // when reduce_dim.size() > 1 -}; - -struct IndexCalculator { - IndexCalculator(int dim, - const std::vector& cal_dims, - const std::vector& cal_strides, - const std::vector& full_strides) - : dim(dim) { - dims = details::VectorToArray(cal_dims); - strides = details::VectorToArray(full_strides); - reduce_strides = details::VectorToArray(cal_strides); -#ifndef PADDLE_WITH_XPU_KP - std::vector cal_divmoders; - // fast divmod - for (auto i : cal_strides) { - cal_divmoders.push_back(paddle::platform::FastDivMod(i)); - } - divmoders = details::VectorToArray( - cal_divmoders); -#endif - } - - __device__ inline int operator()(int offset) const { -#ifdef PADDLE_WITH_XPU_KP - int index = 0; -#pragma unroll - for (int i = 0; i < kMaxRank; ++i) { - if (i == dim) { - break; - } - index += (offset / reduce_strides[i]) * strides[dims[i]]; - offset = offset % reduce_strides[i]; - } - return index; -#else - int index = 0; -#pragma unroll - for (int i = 0; i < kMaxRank; ++i) { - if (i == dim) { - break; - } - auto divmod = divmoders[i].Divmod(offset); - index += (divmod.val[0] * strides[dims[i]]); - offset = divmod.val[1]; - } - return index; -#endif - } - - int dim; - phi::Array dims; - phi::Array strides; - phi::Array reduce_strides; -#ifndef PADDLE_WITH_XPU2 - phi::Array divmoders; -#endif -}; - -template -struct ReduceIndexMapping { - const kps::DimConfig dim; - HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) - : dim(dims) {} - - __device__ __forceinline__ int BlockIdX() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return (cluster_id() / dim.split_num_x % dim.split_num_y); - } else { - return cluster_id() % dim.split_num_x; - } -#else - return blockIdx.x; -#endif - } - - __device__ __forceinline__ int BlockIdY() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return (cluster_id() % dim.split_num_x); - } else { - return (cluster_id() / dim.split_num_x % dim.split_num_y); - } -#else - return blockIdx.y; -#endif - } - - __device__ __forceinline__ int BlockDimX() { -#ifdef PADDLE_WITH_XPU2 - return dim.deal_size_x; -#else - return blockDim.x; -#endif - } - - __device__ __forceinline__ int BlockDimY() { -#ifdef PADDLE_WITH_XPU2 - return 1; -#else - return blockDim.y; -#endif - } - - __device__ __forceinline__ int GridDimX() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return dim.split_num_y; - } else { - return dim.split_num_x; - } -#else - return gridDim.x; -#endif - } - - __device__ __forceinline__ int GridDimY() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return dim.split_num_x; - } else { - return dim.split_num_y; - } -#else - return gridDim.y; -#endif - } - - __device__ __forceinline__ int GetLoopSize() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return dim.deal_size_y; - } else { - return dim.deal_size_x; - } -#else - return 1; -#endif - } -}; - -// when reduce_type == kReduceLastDim this struct will be used -// for higher performance -struct OneDimIndexCal { - explicit OneDimIndexCal(int num) : stride(num) {} - - __device__ inline int operator()(int index) const { return index * stride; } - int stride; -}; - -// reduce config -template -struct ReduceConfig { - ReduceConfig(const std::vector& origin_reduce_dims, - const std::vector& origin_x_dim) - : 
reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {} - - // get the parameters of reduceKernel - void Run() { - // step1: update the reduce_dim left_dim and x_dim - SetReduceDim(); - - // step2: get the strides of dim for reduceAny and reduceLastDim - SetStrides(); - - // step3: get the type of reduce - SetReduceType(); - - // step4: set the block and grid for launch kernel - SetBlockDim(); - } - - // when should_reduce_again is true, we need malloc temp space for temp data - void SetOutputData(Ty* y_data, - const paddle::platform::Place& place, - phi::DenseTensor* tmp) { - if (should_reduce_again) { - tmp->ResizeAndAllocate(phi::make_ddim( - {static_cast(left_num * grid.z * grid.y * sizeof(Ty))})); - output_data = tmp->mutable_data(place); - } else { - output_data = y_data; - } - } - - private: - // set reduce_dim, left_dim and update x_dim - // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1] - // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] - void SetReduceDim() { - std::set reduce_set; - for (auto e : reduce_dims_origin) { - auto pos = e >= 0 ? e : e + x_dim.size(); - reduce_set.insert(pos); - } - - std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); - std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); - - // update reduce_dim and x_dim - std::vector x_new_dim; - - reduce_dim.push_back(reduce_dim_temp[0]); - x_new_dim.push_back(x_dim[0]); - - int idx_reduce = 1; - int num = 0; - - if (reduce_dim_temp.size() > 1) { - for (int i = 1; i < x_dim.size(); i++) { - if ((idx_reduce < reduce_dim_temp.size()) && - (i == reduce_dim_temp[idx_reduce])) { - int result = - reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1]; - bool is_equal = ((result - num) == 1); - if (is_equal) { - x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; - num++; - } else { - reduce_dim.push_back(reduce_dim_temp[idx_reduce] - num); - x_new_dim.push_back(x_dim[i]); - } - idx_reduce++; - } else { - x_new_dim.push_back(x_dim[i]); - } - } - } else { - x_new_dim = x_dim; - } - - // update x_dim - x_dim = x_new_dim; - std::vector().swap(x_new_dim); - - std::vector reduce_dim_new; - int is_reduced = 0; - for (auto e : reduce_dim) { - is_reduced |= 1 << e; - } - - std::vector().swap(reduce_dim); - - for (int i = 0; i < x_dim.size(); i++) { - if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { - x_new_dim.push_back(x_dim[i]); - if ((is_reduced >> i) & 1) - reduce_dim_new.push_back(x_new_dim.size() - 1); - } else { - x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; - } - } - - x_dim = x_new_dim; - reduce_dim = reduce_dim_new; - - int x_rank = static_cast(x_dim.size()); - std::set left_set; - - for (int i = 0; i < x_rank; ++i) { - left_set.insert(i); - } - - for (auto e : reduce_dim) { - left_set.erase(e); - } - - left_dim.assign(left_set.begin(), left_set.end()); - - // if the last dim gets involved in reduction - reduce_last_dim = (reduce_dim.back() == x_dim.size() - 1); - } - - // set x_strides, reduce_strides, left_strides for reduceLastDim and reduceAny - // eg: x_dim = [8, 6], reduce_dim = [0], left_dim = [1] - // --SetStrides--> x_strides= [6,1], reduce_strides = [1], - // left_strides = [1] - void SetStrides() { - std::vector idx_dim; - for (int i = 0; i < x_dim.size(); i++) { - idx_dim.push_back(i); - } - - x_strides = details::GetDimStrides(x_dim, idx_dim); - reduce_strides = details::GetDimStrides(x_dim, reduce_dim); - left_strides = details::GetDimStrides(x_dim, left_dim); - reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; - - left_num = 
1; - if (left_dim.size()) { - left_num = left_strides[0] * x_dim[left_dim[0]]; - } - } - - // get the reduceType - // eg: x_dim = [8, 6] reduce_dim = [0] --> ReduceHigherDim -->reduceFirstDim - // x_dim = [8, 6] reduce_dim = [1] --> reduceLastDim - // x_dim = [8] reduce_dim = [0] --> reduceAll - // x_dim = [8, 6, 4, 2] reduce_dim = [0, 2] --> reduceAny - void SetReduceType() { - int rank = x_dim.size(); - int reduce_rank = reduce_dim.size(); - bool is_last_dim = - (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); - if (rank == reduce_rank || is_last_dim) { -#ifdef PADDLE_WITH_XPU_KP - reduce_type = static_cast(ReduceType::kReduceAny); -#else - reduce_type = static_cast(ReduceType::kReduceLastDim); -#endif - } else if (reduce_rank == 1) { -// ReduceFirstDim and reduceSecondDim -#ifdef PADDLE_WITH_XPU_KP - if (reduce_dim[0] == 0) { - reduce_type = static_cast(ReduceType::kReduceHigherDim); - } else { - reduce_type = static_cast(ReduceType::kReduceAny); - } -#else - reduce_type = static_cast(ReduceType::kReduceHigherDim); -#endif - } else { - reduce_type = static_cast(ReduceType::kReduceAny); - } - } - -#ifndef PADDLE_WITH_XPU_KP - void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) { - constexpr int min_reduce_num_per_thread = 16; - constexpr int max_reduce_num_per_thread = 256; - constexpr int max_num_threads = kps::details::kReduceMaxThread; - - // set block size. - // 1. If reduce_last_dim == true, all the threads whose threadIdx.y are same - // will process the reduction for one output. - // The number of output for one block is blockDim.y; - // 2. If reduce_last_dim == false, different threadIdx.x will process - // different reduction and gets the output separately. If it is - // necessary, it should reduce in block y. - // The number of output for one block is blockDim.x; - int block_x, block_y; - int grid_num, reduce_num_per_thread; - if (reduce_last_dim) { - block_x = details::GetBlockDim(reduce_num); - block_y = details::GetBlockDim(left_num); - block_dim->x = block_x; - block_dim->y = - std::min(block_y, static_cast(max_num_threads / block_dim->x)); - grid_num = details::AlignUp(left_num, block_dim->y); - reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->x); - } else { - block_x = details::GetBlockDim(left_num); - block_y = details::GetBlockDim(reduce_num); - block_dim->x = std::min(block_x, 32); - block_dim->y = - std::min(block_y, static_cast(max_num_threads / block_dim->x)); - block_dim->x = - std::min(block_x, static_cast(max_num_threads / block_dim->y)); - grid_num = details::AlignUp(left_num, block_dim->x); - reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y); - } - int device_id = paddle::platform::GetCurrentDeviceId(); - int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); - int max_threads_per_mp = - paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); - int max_threads = max_threads_per_mp * max_mp; - int num_threads = block_dim->x * block_dim->y; - int max_num_blocks = max_threads / num_threads; - - // set grid size. - // Whether to set grid.y larger than 1, there are 3 following rules: - // 1. The number that each thread process should no less than - // min_reduce_num_per_threadbut no more than max_reduce_num_per_thread; - // 2. It should maximize the utilization of SM. - // So we choose the minimum between input_split_num_1 and input_split_num_3 - // to make each thread process as mush data as possible. 
Meanwhile, - // the number cannot be larger than max_reduce_num_per_thread, so we - // choose the maximum between the result above and input_split_num_2. - int input_split_num_1 = - details::AlignUp(reduce_num_per_thread, min_reduce_num_per_thread); - int input_split_num_2 = - details::AlignUp(reduce_num_per_thread, max_reduce_num_per_thread); - int input_split_num_3 = details::AlignUp(max_num_blocks, grid_num); - - grid_dim->x = grid_num; - grid_dim->y = std::max(std::min(input_split_num_1, input_split_num_3), - input_split_num_2); - // if grid.y > 1, we need launch reduce kernel again. - if (grid_dim->y > 1) { - should_reduce_again = true; - } - } - - // set block and grid for launch kernel - // for ReduceHigherDim: if block is enough -> splite reduce_num - // else init block(32, 1) grid(block_num, 1) - // for others: block(block_num, 1) , grid(left_num, 1) - void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) { - int last_dim_num = x_dim.back(); - // update left_num - int grid_z = left_num / last_dim_num; - left_num = last_dim_num; - grid_dim->z = grid_z; - int device_id = paddle::platform::GetCurrentDeviceId(); - int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); - int max_threads_per_mp = - paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); - int max_threads = max_threads_per_mp * max_mp; - // init - int num_block = (max_threads / left_num); - block_dim->x = details::GetBlockDim(left_num); - grid_dim->x = details::AlignUp(left_num, block_dim->x); - blocking_size = reduce_num; - - if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { - blocking_size = details::GetLastPow2(reduce_num / num_block); - if (blocking_size <= 1) { - blocking_size = details::GetLastPow2(sqrt(reduce_num)); - } else if (blocking_size * 2 < reduce_num) { - blocking_size *= 2; - } - should_reduce_again = true; - grid_dim->y = details::AlignUp(reduce_num, blocking_size); - } - } -#endif - - void SetBlockDim() { - // init - int block_num = details::GetBlockDim(reduce_num); - should_reduce_again = false; - dim3 block_dim(block_num, 1, 1); - dim3 grid_dim(left_num, 1, 1); - blocking_size = reduce_num; -#ifdef PADDLE_WITH_XPU_KP - if (reduce_last_dim) { - block_dim.x = 64; - block_dim.y = reduce_num; - grid_dim.x = 1; - grid_dim.y = 8; - } else { - block_dim.x = 64; - block_dim.y = left_num; - grid_dim.x = 8; - grid_dim.y = 1; - } -#else - if (reduce_type == ReduceType::kReduceHigherDim) { - SetBlockDimForHigher(&block_dim, &grid_dim); - } else { - SetBlockDimForReduceAny(&block_dim, &grid_dim); - } -#endif - - block = block_dim; - grid = grid_dim; - } - - public: - std::vector reduce_dims_origin; - std::vector reduce_dim; - std::vector x_dim; - std::vector left_dim; - std::vector x_strides; - std::vector left_strides; - std::vector reduce_strides; - - int reduce_type; - int reduce_num; - int left_num; - int blocking_size; - bool should_reduce_again; - bool reduce_last_dim; - - Ty* output_data; - - dim3 block; - dim3 grid; -}; - -// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or -// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this -// function will be used -template -__global__ void ReduceAnyKernel(const Tx* x, - Ty* y, - ReduceOp reducer, - TransformOp transformer, - MPType init, - int reduce_num, - int left_num, - bool reduce_last_dim, - const Calculator reduce_index_calculator, - const Calculator left_index_calculator, - const kps::DimConfig dim) { - int input_idx, left_idx, stride; - int block_size = 0; - bool need_store = true; - 
int loop_left = 0; - int tid = 0; - // the last dim gets involved in reduction - int store_offset = 0; - int stride_left = 0; - if (reduce_last_dim) { - auto block = ReduceIndexMapping(dim); - input_idx = block.BlockIdY() * block.BlockDimX(); - left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y; - stride = block.GridDimY() * block.BlockDimX(); - block_size = block.BlockDimX(); - need_store = (THREAD_ID_X == 0) && (left_idx < left_num); - store_offset = block.BlockIdY() * left_num + left_idx; - loop_left = min(block.GetLoopSize(), left_num - left_idx); - stride_left = 1; - tid = THREAD_ID_X; - } else { - auto block = ReduceIndexMapping(dim); - input_idx = block.BlockIdY() * block.BlockDimY(); - left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X; - stride = block.GridDimY() * block.BlockDimY(); - block_size = block.BlockDimY(); - need_store = (THREAD_ID_Y == 0) && (left_idx < left_num); - loop_left = min(block.GetLoopSize(), left_num - left_idx); - stride_left = block.BlockDimX() * block.GridDimX(); - store_offset = block.BlockIdY() * left_num + left_idx; - tid = THREAD_ID_Y; - } - // calculate the offset, means the addr where each thread really start. - // 1. reduce for each thread - MPType input_compute[REDUCE_VEC_SIZE]; - Tx input_reg[REDUCE_VEC_SIZE]; - int input_idx_tmp = input_idx; - for (int i = 0; i < loop_left; i += stride_left) { - int input_offset = left_index_calculator(left_idx + i); - const _ptr_ Tx* input = x + input_offset; - MPType reduce_var = init; - // load REDUCE_VEC_SIZE data once, and then compute - int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; - input_idx = input_idx_tmp; - for (; input_idx + block_size < bound; - input_idx += REDUCE_VEC_SIZE * stride) { - kps::ReadDataReduce, - false>(&input_reg[0], - input, - input_idx, - reduce_index_calculator, - 1, - reduce_num, - 1, - stride, - kps::IdentityFunctor(), - reduce_last_dim); - kps::ElementwiseUnary( - &input_compute[0], &input_reg[0], transformer); - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); - } - - kps::Init(&input_compute[0], init); - kps::ReadDataReduce(&input_compute[0], - input, - input_idx, - reduce_index_calculator, - 1, - reduce_num - input_idx, - 1, - stride, - transformer, - reduce_last_dim); - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); - - kps::Reduce( - &reduce_var, &reduce_var, reducer, reduce_last_dim); - if (need_store) { - y[store_offset + i] = static_cast(reduce_var); - } - } -} - -template -__global__ void ReduceHigherDimKernel(const Tx* x, - Ty* y, - ReduceOp reducer, - TransformOp transformer, - MPType init, - int reduce_num, - int left_num, - int blocking_size, - const kps::DimConfig dim) { - // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this - // function will be used - auto block = ReduceIndexMapping(dim); - int idy = block.BlockIdY() * blocking_size; - int idx = block.BlockIdX() * block.BlockDimX(); - int idz = BLOCK_ID_Z * left_num; - int stride = dim.split_num_x * dim.deal_size_x; - int size = left_num - dim.rem_x; - int loop_size = min(reduce_num - idy, blocking_size); - int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY(); - int block_offset = idy * left_num + idz * reduce_num; - const _ptr_ Tx* input = x + block_offset; - Tx reduce_input; - for (; idx < size; idx += stride) { - MPType reduce_var = init; - MPType reduce_compute = init; - for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { - kps::ReadData(&reduce_input, - input + loop_idx * 
left_num + idx, - block.BlockDimX(), - 1, - 1, - left_num); - kps::ElementwiseUnary( - &reduce_compute, &reduce_input, transformer); - kps::Reduce( - &reduce_var, &reduce_compute, reducer, false); - } - Ty result = static_cast(reduce_var); - kps::WriteData( - y + store_offset + idx, &result, block.BlockDimX()); - } - - if (idx < left_num) { - MPType reduce_var = init; - MPType reduce_compute = init; - for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { - kps::ReadData(&reduce_input, - input + loop_idx * left_num + idx, - dim.rem_x, - 1, - 1, - left_num); - kps::ElementwiseUnary( - &reduce_compute, &reduce_input, transformer); - kps::Reduce( - &reduce_var, &reduce_compute, reducer, false); - } - Ty result = static_cast(reduce_var); - kps::WriteData( - y + store_offset + idx, &result, dim.rem_x); - } -} - -template -static void LaunchReduceKernel(const Tx* x_data, - Ty* y_data, - const ReduceOp& reducer, - const TransformOp& transform, - MPType init, - KPStream stream, - ReduceConfig config) { - if (config.reduce_type == kReduceLastDim) { - int stride_reduce = 1; - int stride_left = config.reduce_num; - // for higher performance - auto reduce_index_calculator = OneDimIndexCal(stride_reduce); - auto left_index_calculator = OneDimIndexCal(stride_left); - - kps::DimConfig dim = kps::DimConfig(config.grid.x, - config.grid.y, - config.grid.z, - config.block.x, - config.block.y, - 0); - dim.SetRem(config.reduce_num % config.block.x, 0, 0); - -#ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); -#else - ReduceAnyKernel<<>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); -#endif - - } else { - int reduce_rank = config.reduce_strides.size(); - int left_rank = config.left_strides.size(); - auto reduce_index_calculator = IndexCalculator(reduce_rank, - config.reduce_dim, - config.reduce_strides, - config.x_strides); - auto left_index_calculator = IndexCalculator( - left_rank, config.left_dim, config.left_strides, config.x_strides); - - kps::DimConfig dim = kps::DimConfig(config.grid.x, - config.grid.y, - config.grid.z, - config.block.x, - config.block.y, - 0); - dim.SetRem(config.reduce_num % config.block.x, 0, 0); - -#ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); -#else - ReduceAnyKernel<<>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); -#endif - } - - if (config.should_reduce_again) { - dim3 block; - dim3 grid; - if (config.reduce_last_dim) { - block = dim3(32, 1, 1); - grid = dim3(details::AlignUp(config.left_num, 32), 1, 1); - } else { - block = dim3(config.block.x, 1, 1); - grid = dim3(config.grid.x, 1, config.grid.z); - } - - auto last_index = OneDimIndexCal(1); - auto first_index = OneDimIndexCal(config.left_num); - kps::DimConfig dim = - kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); - dim.SetRem(config.left_num % block.x, 0, 0); -#ifdef PADDLE_WITH_XPU_KP - 
ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(), - init, - config.grid.y, - config.left_num, - config.grid.y, - dim); -#else - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(), - init, - config.grid.y, - config.left_num, - config.grid.y, - dim); -#endif - } -} - -template class ReduceOp, - typename TransformOp> -static typename std::enable_if::value, - void>::type -CubTensorReduceImpl(const Tx* x_data, - Ty* y_data, - const TransformOp& transform, - int reduce_num, - const paddle::platform::Place& place, - KPStream stream) { - auto reducer = ReduceOp(); - cub::TransformInputIterator trans_x(x_data, - transform); - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, - temp_storage_bytes, - trans_x, - y_data, - reduce_num, - reducer, - reducer.initial(), - stream); - - phi::DenseTensor tmp = phi::DenseTensor( - phi::make_intrusive(place), - phi::DenseTensorMeta( - phi::DataType::UINT8, - phi::make_ddim({static_cast(temp_storage_bytes)}))); - - auto* temp_storage = tmp.mutable_data(place); - - cub::DeviceReduce::Reduce(temp_storage, - temp_storage_bytes, - trans_x, - y_data, - reduce_num, - reducer, - reducer.initial(), - stream); -} - -template class ReduceOp, - typename TransformOp> -static typename std::enable_if::value, - void>::type -CubTensorReduceImpl(const Tx* x_data, - Ty* y_data, - const TransformOp& transform, - int reduce_num, - const paddle::platform::Place& place, - KPStream stream) { - PADDLE_THROW(phi::errors::InvalidArgument( - "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); -} - -template class ReduceOp, - typename TransformOp> -void TensorReduceImpl(const phi::GPUContext& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* y, - const TransformOp& transform, - const std::vector& origin_reduce_dims, - KPStream stream) { - y->mutable_data(x.place()); - - auto x_dim = phi::vectorize(x.dims()); - auto config = ReduceConfig(origin_reduce_dims, x_dim); - config.Run(); - int numel = x.numel(); - // after config.run() - // SetOutputData for ReduceHigherDim when should_reduce_again is true, - // temp_output should be stored temp_data in output_data space or stored in - // y_data; - - phi::DDim tmp_ddim; - phi::DenseTensor tmp = phi::DenseTensor( - phi::make_intrusive(y->place()), - phi::DenseTensorMeta(y->dtype(), tmp_ddim, y->layout())); - - auto x_data = x.data(); - auto y_data = y->data(); - - if (config.reduce_num == 1) { - std::vector inputs = {&x}; - std::vector outputs = {y}; - funcs::ElementwiseKernel(dev_ctx, inputs, &outputs, transform); - return; - } - - config.SetOutputData(y_data, x.place(), &tmp); - constexpr bool kIsTxFP16 = std::is_same::value; - bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; -#ifndef PADDLE_WITH_XPU_KP - if (use_cub_reduce) { - CubTensorReduceImpl( - x_data, y_data, transform, config.reduce_num, x.place(), stream); - return; - } -#endif - - using MPType = typename kps::details::MPTypeTrait::Type; - auto reducer = ReduceOp(); - // launch ReduceHigherDimKernel - // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this - // function will be used - // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 - // if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / - // 32 - // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 - 
if (config.reduce_type == ReduceType::kReduceHigherDim) { - kps::DimConfig dim = kps::DimConfig(config.grid.x, - config.grid.y, - config.grid.z, - config.block.x, - config.blocking_size, - 0); - dim.SetRem(config.left_num % config.block.x, - config.reduce_num % config.blocking_size, - 0); - -#ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel, - TransformOp><<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); -#else - ReduceHigherDimKernel< - Tx, - Ty, - MPType, - ReduceOp, - TransformOp><<>>( - x_data, - config.output_data, - reducer, - transform, - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); -#endif - - if (config.should_reduce_again) { - dim3 block = dim3(config.block.x, 1, 1); - dim3 grid = dim3(config.grid.x, 1, config.grid.z); - kps::DimConfig dim2 = - kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); - dim2.SetRem(config.left_num % config.block.x, 0, 0); - -#ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(config.grid.y), - reducer.initial(), - config.grid.y, - config.left_num, - config.grid.y, - dim2); -#else - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(config.grid.y), - reducer.initial(), - config.grid.y, - config.left_num, - config.grid.y, - dim2); -#endif - } - return; - } - - // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or - // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this - // function will be used - LaunchReduceKernel, TransformOp>( - x_data, y_data, reducer, transform, reducer.initial(), stream, config); -} - -} // namespace kernels template class ReduceOp, @@ -1252,7 +32,7 @@ void Reduce(const KPDevice& dev_ctx, DataType out_dtype, DenseTensor* out) { std::vector reduce_dims = - phi::kernels::details::GetReduceDim(dims, x.dims().size(), reduce_all); + phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all); int reduce_num = 1; for (auto i : reduce_dims) { @@ -1271,10 +51,10 @@ void Reduce(const KPDevice& dev_ctx, "TensorReduceImpl", ([&] { using MPType = typename kps::details::MPTypeTrait::Type; - phi::kernels::TensorReduceImpl>( + phi::funcs::TensorReduceImpl>( dev_ctx, tmp_tensor, out, @@ -1284,7 +64,7 @@ void Reduce(const KPDevice& dev_ctx, })); } else { using MPType = typename kps::details::MPTypeTrait::Type; - phi::kernels::TensorReduceImpl>( + phi::funcs::TensorReduceImpl>( dev_ctx, x, out, diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu index 7ac7c451b00542c3e0511692dc7cad470374f2ae..4266f0174ff6c17b2146576544ff090c7a272872 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -17,7 +17,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diagonal.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace phi { @@ -34,7 +34,7 @@ void TraceKernel(const Context& ctx, auto stream = ctx.stream(); std::vector reduce_dims; reduce_dims.push_back(out->dims().size()); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, diag, out, 
kps::IdentityFunctor(), reduce_dims, stream); } else { phi::funcs::SetConstant functor; diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu new file mode 100644 index 0000000000000000000000000000000000000000..b4a6fe337c8d21e37beb0d6e5219e1a5edf1f9e8 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu @@ -0,0 +1,834 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/conv_grad_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/eigen.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#endif + +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/padding.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvCudnnGradGradKernel( + const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + auto X = &input; + auto W = &filter; + auto dO = &out_grad; + auto ddX = input_grad_grad.get_ptr(); + auto ddW = filter_grad_grad.get_ptr(); + + auto ddO = out_grad_grad; + auto dW = filter_grad; + auto dX = input_grad; + if (ddO) { + ddO->mutable_data(ctx.GetPlace()); + phi::funcs::SetConstant set_zero; + set_zero(ctx, ddO, static_cast(0)); + } + if (dW) { + dW->mutable_data(ctx.GetPlace()); + } + if (dX) { + dX->mutable_data(ctx.GetPlace()); + } + + // const T* x = X->data(); + const T* dy = dO->data(); + const T* w = W->data(); + + const T* ddx = nullptr; + const T* ddw = nullptr; + T *dw, *dx, *ddy; + dw = dx = ddy = nullptr; + T* transformed_dx = nullptr; + std::vector dilations = dilations_t; + + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, + false, + phi::errors::InvalidArgument( + "Cann't set 
exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + + std::vector paddings = paddings_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform Tensors to channel first----------- + DenseTensor transformed_X_channel(X->type()); + DenseTensor transformed_dO_channel(dO->type()); + DenseTensor transformed_ddX_channel(X->type()); + + DenseTensor transformed_ddO_channel(dO->type()); + DenseTensor transformed_dX_channel(X->type()); + + if (channel_last) { + ResizeToChannelFirst(ctx, X, &transformed_X_channel); + TransToChannelFirst(ctx, X, &transformed_X_channel); + + ResizeToChannelFirst(ctx, dO, &transformed_dO_channel); + TransToChannelFirst(ctx, dO, &transformed_dO_channel); + + if (ddX) { + ResizeToChannelFirst(ctx, ddX, &transformed_ddX_channel); + TransToChannelFirst(ctx, ddX, &transformed_ddX_channel); + } + + if (ddO) { + ResizeToChannelFirst(ctx, ddO, &transformed_ddO_channel); + } + if (dX) { + ResizeToChannelFirst(ctx, dX, &transformed_dX_channel); + transformed_dX_channel.mutable_data(ctx.GetPlace()); + } + + } else { + transformed_X_channel = *X; + transformed_dO_channel = *dO; + if (ddX) { + transformed_ddX_channel = *ddX; + } + if (ddO) { + transformed_ddO_channel.ShareDataWith(*ddO); + } + if (dX) { + transformed_dX_channel.ShareDataWith(*dX); + } + } + + auto in_dims = transformed_X_channel.dims(); + auto filter_dims = W->dims(); + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); + DenseTensor transformed_X(X->type()); + DenseTensor transformed_ddX(X->type()); + + DenseTensor transformed_dX(X->type()); + + std::vector padding_common(data_dim, 0); + std::vector input_pad(X->dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + new_input_shape_vec[0] = transformed_X_channel.dims()[0]; + new_input_shape_vec[1] = transformed_X_channel.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); + padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); + new_input_shape_vec[i + 2] = + transformed_X_channel.dims()[i + 2] + padding_diff[i]; + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_X.Resize(new_input_shape); + transformed_ddX.Resize(new_input_shape); + transformed_dX.Resize(new_input_shape); + + transformed_X.mutable_data(ctx.GetPlace()); + + if (ddX) { + transformed_ddX.mutable_data(ctx.GetPlace()); + } + if (dX) { + transformed_dX.mutable_data(ctx.GetPlace()); + } + + // pad for input + const int rank = X->dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + if (ddX) { + funcs::PadFunction(ctx, + input_pad, + transformed_ddX_channel, + pad_value, + &transformed_ddX); + } + } break; + case 5: { + funcs::PadFunction( + ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + if (ddX) { + funcs::PadFunction(ctx, + input_pad, 
+ transformed_ddX_channel, + pad_value, + &transformed_ddX); + } + } break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + + } else { + transformed_X.ShareDataWith(transformed_X_channel); + if (ddX) { + transformed_ddX.ShareDataWith(transformed_ddX_channel); + } + if (dX) { + transformed_dX.ShareDataWith(transformed_dX_channel); + } + + if (paddings.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[2 * i]; + } + } + } + + const T* x = transformed_X.data(); + + int iwo_group = groups; + int c_group = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; + groups = 1; +#endif + auto dtype = paddle::platform::CudnnDataType::type; + + auto handle = ctx.cudnn_handle(); + + paddle::operators::ConvArgs args1{&transformed_ddX, + W, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args2{&transformed_X, + ddW, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args3{&transformed_ddX, + dW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args4{&transformed_dX, + ddW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype}; + +#ifdef PADDLE_WITH_HIP + miopenConvFwdAlgorithm_t fwd_algo1 = static_cast(0); + miopenConvFwdAlgorithm_t fwd_algo2 = static_cast(0); + miopenConvBwdDataAlgorithm_t data_algo = + static_cast(0); + miopenConvBwdWeightsAlgorithm_t filter_algo = + static_cast(0); +#else + cudnnConvolutionFwdAlgo_t fwd_algo1 = + static_cast(0); + cudnnConvolutionFwdAlgo_t fwd_algo2 = + static_cast(0); + cudnnConvolutionBwdDataAlgo_t data_algo = + static_cast(0); + cudnnConvolutionBwdFilterAlgo_t filter_algo = + static_cast(0); +#endif + + auto layout = paddle::platform::GetCudnnTensorFormat( + paddle::platform::DataLayout::kNCHW); + + // ddo = conv(ddI, W) + conv(I, ddW) + size_t workspace_size = 0; + + T* transformed_ddy_channel = nullptr; + if (ddO) { + ddy = ddO->data(); + transformed_ddy_channel = transformed_ddO_channel.data(); + if (ddX) { + args1.handle = handle; + args1.idesc.set(transformed_ddX, iwo_group); + args1.wdesc.set(*W, layout, iwo_group); + args1.odesc.set(transformed_ddO_channel, iwo_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search1 = + paddle::operators::SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + fwd_algo1 = search1::Find( + args1, exhaustive_search, false, workspace_size, ctx); +#else + using search1 = + paddle::operators::SearchAlgorithm; + fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); + workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); +#endif + } + + if (ddW) { + ddw = ddW->data(); + args2.handle = handle; + args2.idesc.set(transformed_X, iwo_group); + args2.wdesc.set(*ddW, layout, iwo_group); + args2.odesc.set(transformed_ddO_channel, iwo_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search2 = + paddle::operators::SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + fwd_algo2 = search2::Find( 
+ args2, exhaustive_search, false, workspace_size, ctx); +#else + using search2 = + paddle::operators::SearchAlgorithm; + fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2)); +#endif + } + } + + if (dW && ddX) { + dw = dW->data(); + args3.handle = handle; + args3.idesc.set(transformed_ddX, iwo_group); + args3.wdesc.set(*dW, layout, iwo_group); + args3.odesc.set(transformed_dO_channel, iwo_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search3 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_algo = search3::Find( + args3, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search3 = + paddle::operators::SearchAlgorithm; + filter_algo = + search3::Find(args3, exhaustive_search, deterministic, ctx); + workspace_size = + std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); +#endif + } + + if (ddW && dX) { + transformed_dx = transformed_dX.data(); + + args4.handle = handle; + args4.idesc.set(transformed_dX, iwo_group); + args4.wdesc.set(*ddW, layout, iwo_group); + args4.odesc.set(transformed_dO_channel, iwo_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search4 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); + data_algo = search4::Find( + args4, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search4 = + paddle::operators::SearchAlgorithm; + data_algo = search4::Find(args4, exhaustive_search, deterministic, ctx); + workspace_size = + std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); +#endif + } + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW( + transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); + + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(transformed_dO_channel.dims(), + DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = W->numel() / groups; + + paddle::operators::ScalingParamType alpha = 1.0f; + paddle::operators::ScalingParamType beta = 0.0f; + + // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. + // ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : + // 0.0f; + // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); + auto wkspace_handle = ctx.cudnn_workspace_handle(); + + if (ddO) { + if (ddX) { + ddx = transformed_ddX.data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx, + args1.wdesc.desc(), + w, + args1.cdesc.desc(), + fwd_algo1, + &beta, + args1.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx + i * group_offset_in, + args1.wdesc.desc(), + w + i * group_offset_filter, + args1.cdesc.desc(), + fwd_algo1, + workspace_ptr, + workspace_size, + &beta, + args1.odesc.desc(), + transformed_ddy_channel + i * group_offset_out)); + }, + workspace_size); + } +#endif + } + if (ddW) { +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x, + args2.wdesc.desc(), + ddw, + args2.cdesc.desc(), + fwd_algo2, + &beta, + args2.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x + i * group_offset_in, + args2.wdesc.desc(), + ddw + i * group_offset_filter, + args2.cdesc.desc(), + fwd_algo2, + workspace_ptr, + workspace_size, + &alpha, + args2.odesc.desc(), + transformed_ddy_channel + i * group_offset_out)); + }, + workspace_size); + } +#endif + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_ddO_channel, ddO); + } + } + T* transformed_dy_channel = transformed_dO_channel.data(); + if (dW && ddX) { + ddx = transformed_ddX.data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args3.odesc.desc(), + transformed_dy_channel, + args3.idesc.desc(), + ddx, + args3.cdesc.desc(), + filter_algo, + &beta, + args3.wdesc.desc(), + dw, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardFilter( + handle, + &alpha, + args3.idesc.desc(), + ddx + i * group_offset_in, + args3.odesc.desc(), + transformed_dy_channel + i * group_offset_out, + args3.cdesc.desc(), + filter_algo, + workspace_ptr, + workspace_size, + &beta, + args3.wdesc.desc(), + dw + i * group_offset_filter)); + }, + workspace_size); + } +#endif + } + + if (dX && ddW) { + ddw = ddW->data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args4.odesc.desc(), + transformed_dy_channel, + args4.wdesc.desc(), + ddw, + args4.cdesc.desc(), + data_algo, + &beta, + args4.idesc.desc(), + transformed_dx, + 
workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args4.wdesc.desc(), + ddw + i * group_offset_filter, + args4.odesc.desc(), + transformed_dy_channel + i * group_offset_out, + args4.cdesc.desc(), + data_algo, + workspace_ptr, + workspace_size, + &beta, + args4.idesc.desc(), + transformed_dx + i * group_offset_in)); + }, + workspace_size); + } +#endif + + if (!is_sys_pad) { + // reverse padded input + std::vector starts(X->dims().size(), 0); + std::vector axes(X->dims().size(), 0); + + for (size_t i = 0; i < X->dims().size(); ++i) { + starts[i] = input_pad[2 * i]; + axes[i] = i; + } + if (X->dims().size() == 4) { + paddle::operators::RemovePaddingSlice( + ctx, &transformed_dX, &transformed_dX_channel, starts, axes); + } else { + paddle::operators::RemovePaddingSlice( + ctx, &transformed_dX, &transformed_dX_channel, starts, axes); + } + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_dX_channel, dX); + } + } +} + +template +void DepthwiseConvCudnnGradGradKernel( + const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + bool fuse_relu, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradGradKernel(ctx, + input_grad_grad, + filter_grad_grad, + out_grad, + input, + filter, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + out_grad_grad, + input_grad, + filter_grad); +} + +template +void Conv3DCudnnGradGradKernel( + const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradGradKernel(ctx, + input_grad_grad, + filter_grad_grad, + out_grad, + input, + filter, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + out_grad_grad, + input_grad, + filter_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradGradKernel, + float, + phi::dtype::float16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + 
phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +#else + +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +#endif + +#endif diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu new file mode 100644 index 0000000000000000000000000000000000000000..64148e902fdb2123aa3f81846999b5d90f356cd6 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu @@ -0,0 +1,683 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
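Note on the registration blocks that close the double-grad file above: bfloat16 kernels are registered only when the cuDNN headers report version 8.1.0 or newer. The `CUDNN_VERSION_MIN(8, 1, 0)` gate relies on cuDNN 8.x packing its version as `MAJOR * 1000 + MINOR * 100 + PATCHLEVEL`; the sketch below shows that convention and is an assumption about the macro, not a quote of Paddle's definition.

```cpp
// Sketch of the version gate assumed by the registrations above.
// For cuDNN 8.x, CUDNN_VERSION = CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100
// + CUDNN_PATCHLEVEL, so 8.1.0 encodes as 8100.
#include <cudnn.h>

#ifndef CUDNN_VERSION_MIN
#define CUDNN_VERSION_MIN(major, minor, patch) \
  (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))
#endif

#if CUDNN_VERSION_MIN(8, 1, 0)
// float, double, float16 and bfloat16 kernels are registered
#else
// bfloat16 is omitted on older cuDNN
#endif
```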
+ +#include "paddle/phi/kernels/conv_grad_kernel.h" + +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/eigen.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#endif + +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/padding.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + +namespace phi { + +template +void ConvCudnnGradKernel(const Context& ctx, + const DenseTensor& output_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + } + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + } + + std::vector dilations = dilations_t; + std::vector strides = strides_t; + std::vector paddings = paddings_t; + + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, + false, + phi::errors::InvalidArgument( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + auto dtype = paddle::platform::CudnnDataType::type; + +#ifdef PADDLE_WITH_HIP + // HIP MIOPEN ONLY SUPPORT NCHW format + auto compute_format = paddle::platform::DataLayout::kNCHW; +#else + const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx); + auto compute_format = compute_in_nhwc && channel_last + ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; +#endif + VLOG(3) << "Compute ConvGradOp with cuDNN:" + << " data_format=" << data_format << " compute_format=" + << (compute_format == paddle::platform::DataLayout::kNHWC ? 
"NHWC" + : "NCHW"); + + // transform Tensor + DenseTensor transformed_input_channel(input.type()); + DenseTensor transformed_output_grad_channel(output_grad.type()); + DenseTensor transformed_input_grad_channel(input.type()); + DenseTensor transformed_filter_channel(filter.type()); + DenseTensor transformed_filter_grad_channel(filter.type()); + + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { + VLOG(3) << "Transform input, output_grad, input_grad and tensor from " + "NHWC to NCHW."; + ResizeToChannelFirst(ctx, &input, &transformed_input_channel); + TransToChannelFirst(ctx, &input, &transformed_input_channel); + + ResizeToChannelFirst( + ctx, &output_grad, &transformed_output_grad_channel); + TransToChannelFirst( + ctx, &output_grad, &transformed_output_grad_channel); + + if (input_grad) { + ResizeToChannelFirst( + ctx, input_grad, &transformed_input_grad_channel); + // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy + // the data of input_grad to transformed_input_grad_channel. + if (use_addto) { + TransToChannelFirst( + ctx, input_grad, &transformed_input_grad_channel); + } + } + } else { + transformed_input_channel.ShareDataWith(input); + transformed_output_grad_channel.ShareDataWith(output_grad); + if (input_grad) { + transformed_input_grad_channel.ShareDataWith(*input_grad); + } + } + + if (compute_format == paddle::platform::DataLayout::kNHWC) { + VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; + ResizeToChannelLast(ctx, &filter, &transformed_filter_channel); + TransToChannelLast(ctx, &filter, &transformed_filter_channel); + + if (filter_grad) { + ResizeToChannelLast( + ctx, filter_grad, &transformed_filter_grad_channel); + } + } else { + transformed_filter_channel.ShareDataWith(filter); + if (filter_grad) { + transformed_filter_grad_channel.ShareDataWith(*filter_grad); + } + } + + // update paddings + auto in_dims = transformed_input_channel.dims(); + auto filter_dims = transformed_filter_channel.dims(); + DDim in_data_dims; + DDim filter_data_dims; + if (compute_format == paddle::platform::DataLayout::kNCHW) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1); + } + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + // cuDNN only supports padding the same amount on every dimension. + // So we create a new padded input tensor. 
+ int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); + Tensor transformed_input(input.type()); + Tensor transformed_input_grad(input.type()); + std::vector padding_common(data_dim, 0); + std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + new_input_shape_vec[0] = transformed_input_channel.dims()[0]; + if (compute_format == paddle::platform::DataLayout::kNCHW) { + new_input_shape_vec[1] = transformed_input_channel.dims()[1]; + } else { + new_input_shape_vec[data_dim + 1] = + transformed_input_channel.dims()[data_dim + 1]; + } + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); + padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); + if (compute_format == paddle::platform::DataLayout::kNCHW) { + new_input_shape_vec[i + 2] = + transformed_input_channel.dims()[i + 2] + padding_diff[i]; + } else { + new_input_shape_vec[i + 1] = + transformed_input_channel.dims()[i + 1] + padding_diff[i]; + } + if (compute_format == paddle::platform::DataLayout::kNCHW) { + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } else { + input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; + } + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_input.Resize(new_input_shape); + transformed_input.mutable_data(ctx.GetPlace()); + + transformed_input_grad.Resize(new_input_shape); + + if (input_grad) { + transformed_input_grad.mutable_data(ctx.GetPlace()); + } + // pad for input + const int rank = transformed_input_channel.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction(ctx, + input_pad, + transformed_input_channel, + pad_value, + &transformed_input); + } break; + case 5: { + funcs::PadFunction(ctx, + input_pad, + transformed_input_channel, + pad_value, + &transformed_input); + } break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + } else { + transformed_input.ShareDataWith(transformed_input_channel); + if (input_grad) { + transformed_input_grad.ShareDataWith(transformed_input_grad_channel); + } + if (paddings.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[2 * i]; + } + } + } + + const T* input_data = transformed_input.data(); + const T* output_grad_data = transformed_output_grad_channel.data(); + const T* filter_data = transformed_filter_channel.data(); + T* filter_grad_data = nullptr; + T* input_grad_data = nullptr; + T* transformed_input_grad_data = nullptr; + + paddle::operators::ConvArgs args1{&transformed_input_grad, + &transformed_filter_channel, + &transformed_output_grad_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args2{&transformed_input, + &transformed_filter_grad_channel, + &transformed_output_grad_channel, + strides, + padding_common, + dilations, + dtype}; + + auto handle = ctx.cudnn_handle(); + // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout + paddle::platform::DataLayout layout = + compute_format == paddle::platform::DataLayout::kNHWC 
+ ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; + if (transformed_input.dims().size() == 5) { + layout = compute_format == paddle::platform::DataLayout::kNHWC + ? paddle::platform::DataLayout::kNDHWC + : paddle::platform::DataLayout::kNCDHW; + } + auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); + auto workspace_handle = ctx.cudnn_workspace_handle(); + + int i_n, i_c, i_d, i_h, i_w; + int o_n, o_c, o_d, o_h, o_w; + if (compute_format == paddle::platform::DataLayout::kNHWC) { + paddle::operators::GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNHWC, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(), + paddle::platform::DataLayout::kNHWC, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } else { + paddle::operators::GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(), + paddle::platform::DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = transformed_filter_channel.numel() / groups; +// ------------------- cudnn backward algorithm --------------------- +#ifdef PADDLE_WITH_HIP + miopenConvBwdDataAlgorithm_t data_algo = + static_cast(0); + miopenConvBwdWeightsAlgorithm_t filter_algo = + static_cast(0); +#else + cudnnConvolutionBwdDataAlgo_t data_algo = + static_cast(0); + cudnnConvolutionBwdFilterAlgo_t filter_algo = + static_cast(0); +#endif + // input data workspace_size + size_t workspace_size_d = 0; + // weight workspace_size + size_t workspace_size_w = 0; + int iwo_groups = groups; + int c_groups = 1; + +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + if (input_grad) { + // ------------------- cudnn descriptors --------------------- + input_grad_data = input_grad->data(); + transformed_input_grad_data = transformed_input_grad.data(); + + args1.handle = handle; + args1.idesc.set(transformed_input_grad, layout_tensor); + args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); + args1.odesc.set(transformed_output_grad_channel, layout_tensor); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_groups); + +#ifdef PADDLE_WITH_HIP + using search1 = + paddle::operators::SearchAlgorithm; + workspace_size_d = + std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); + data_algo = search1::Find( + args1, exhaustive_search, deterministic, workspace_size_d, ctx); +#else + using search1 = + paddle::operators::SearchAlgorithm; + data_algo = search1::Find(args1, exhaustive_search, deterministic, ctx); + workspace_size_d = + std::max(workspace_size_d, search1::GetWorkspaceSize(args1, data_algo)); +#endif + } + + if (filter_grad) { + // ------------------- cudnn descriptors --------------------- + filter_grad_data = transformed_filter_grad_channel.data(); + args2.handle = handle; + args2.idesc.set(transformed_input, layout_tensor); + args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups); + args2.odesc.set(transformed_output_grad_channel, layout_tensor); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search2 = + 
paddle::operators::SearchAlgorithm; + workspace_size_w = + std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); + filter_algo = search2::Find( + args2, exhaustive_search, deterministic, workspace_size_w, ctx); +#else + using search2 = + paddle::operators::SearchAlgorithm; + filter_algo = + search2::Find(args2, exhaustive_search, deterministic, ctx); + workspace_size_w = std::max(workspace_size_w, + search2::GetWorkspaceSize(args2, filter_algo)); +#endif + } + + // ------------------- cudnn conv backward data --------------------- + paddle::operators::ScalingParamType alpha = 1.0f; +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + paddle::operators::ScalingParamType beta = 0.0f; +#else + paddle::operators::ScalingParamType beta = use_addto ? 1.0f : 0.0f; + +#endif + VLOG(4) << "Conv_grad: use_addto = " << use_addto; + + if (input_grad) { +// When beta is 0, it is unnecessary to reset input_grad. +// When beta is 1, the output cannot be reset since addt strategy used. +#ifdef PADDLE_WITH_HIP + if (use_addto) { + DenseTensor temp_tensor(transformed_input_grad.type()); + temp_tensor.Resize(transformed_input_grad.dims()); + T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + output_grad_data, + args1.wdesc.desc(), + filter_data, + args1.cdesc.desc(), + data_algo, + &beta, + args1.idesc.desc(), + temp_tensor_data, + cudnn_workspace_ptr, + workspace_size_d)); + }, + workspace_size_d); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor( + handle, + miopenTensorOpAdd, + &alpha, + args1.idesc.desc(), + transformed_input_grad_data, + &alpha, + args1.idesc.desc(), + temp_tensor_data, + &beta, + args1.idesc.desc(), + transformed_input_grad_data)); + } else { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + output_grad_data, + args1.wdesc.desc(), + filter_data, + args1.cdesc.desc(), + data_algo, + &beta, + args1.idesc.desc(), + transformed_input_grad_data, + cudnn_workspace_ptr, + workspace_size_d)); + }, + workspace_size_d); + } + +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args1.wdesc.desc(), + filter_data + i * group_offset_filter, + args1.odesc.desc(), + output_grad_data + i * group_offset_out, + args1.cdesc.desc(), + data_algo, + cudnn_workspace_ptr, + workspace_size_d, + &beta, + args1.idesc.desc(), + transformed_input_grad_data + i * group_offset_in)); + }, + workspace_size_d); + } +#endif + if (!is_sys_pad) { + std::vector starts(transformed_input_channel.dims().size(), 0); + std::vector axes(transformed_input_channel.dims().size(), 0); + + for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) { + starts[i] = input_pad[2 * i]; + axes[i] = i; + } + + transformed_input_grad_channel.mutable_data(ctx.GetPlace()); + if (transformed_input_channel.dims().size() == 4) { + paddle::operators::RemovePaddingSlice( + ctx, + &transformed_input_grad, + &transformed_input_grad_channel, + starts, + axes); + } else { + paddle::operators::RemovePaddingSlice( + ctx, + &transformed_input_grad, + &transformed_input_grad_channel, + 
starts, + axes); + } + } + + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { + TransToChannelLast( + ctx, &transformed_input_grad_channel, input_grad); + } + } + + // filter_grad do not use inplace addto. + paddle::operators::ScalingParamType beta_filter = 0.0f; + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { +// Because beta is zero, it is unnecessary to reset filter_grad. +#ifdef PADDLE_WITH_HIP + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args2.odesc.desc(), + output_grad_data, + args2.idesc.desc(), + input_data, + args2.cdesc.desc(), + filter_algo, + &beta, + args2.wdesc.desc(), + filter_grad_data, + cudnn_workspace_ptr, + workspace_size_w)); + }, + workspace_size_w); +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardFilter( + handle, + &alpha, + args2.idesc.desc(), + input_data + i * group_offset_in, + args2.odesc.desc(), + output_grad_data + i * group_offset_out, + args2.cdesc.desc(), + filter_algo, + cudnn_workspace_ptr, + workspace_size_w, + &beta_filter, + args2.wdesc.desc(), + filter_grad_data + i * group_offset_filter)); + }, + workspace_size_w); + } +#endif + + if (compute_format == paddle::platform::DataLayout::kNHWC) { + TransToChannelFirst( + ctx, &transformed_filter_grad_channel, filter_grad); + } + } +} + +template +void Conv3DCudnnGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradKernel(dev_ctx, + out_grad, + input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradKernel, + float, + phi::dtype::float16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(conv3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradKernel, + float, + double, + phi::dtype::float16) {} + +#endif + +#endif diff --git a/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu new file mode 100644 index 0000000000000000000000000000000000000000..931b6d68845e27297784603c2427178eae6b6f7d --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu @@ -0,0 +1,476 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/conv_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/eigen.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#endif + +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/padding.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + +namespace phi { + +template +void ConvCudnnKernel(const Context& ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* output) { + output->mutable_data(ctx.GetPlace()); + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, + false, + phi::errors::InvalidArgument( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + auto dtype = paddle::platform::CudnnDataType::type; + +#ifdef PADDLE_WITH_HIP + // HIP MIOPEN ONLY SUPPORT NCHW format + auto compute_format = paddle::platform::DataLayout::kNCHW; +#else + // Tensor Core introduced from Volta GPUs supports more faster conv op + // with FP16 in NHWC data format. + const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx); + // We will only do data format conversion from NHWC to NCHW. + // cudnn will convert NCHW to NHWC automatically on Tensor Core. + auto compute_format = compute_in_nhwc && channel_last + ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; +#endif + VLOG(3) << "Compute ConvOp with cuDNN:" + << " data_format=" << data_format << " compute_format=" + << (compute_format == paddle::platform::DataLayout::kNHWC ? 
"NHWC" + : "NCHW"); + + // ------------ transformed tensor ----------- + DenseTensor transformed_input_channel(input.type()); + DenseTensor transformed_output(output->type()); + DenseTensor transformed_filter_channel(filter.type()); + T* output_data = nullptr; + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { + VLOG(3) << "Transform input tensor from NHWC to NCHW."; + ResizeToChannelFirst(ctx, &input, &transformed_input_channel); + TransToChannelFirst(ctx, &input, &transformed_input_channel); + + ResizeToChannelFirst(ctx, output, &transformed_output); + + } else { + transformed_input_channel.ShareDataWith(input); + transformed_output.ShareDataWith(*output); + } + if (compute_format == paddle::platform::DataLayout::kNHWC) { + VLOG(3) << "Transform filter tensor from NCHW to NHWC."; + ResizeToChannelLast(ctx, &filter, &transformed_filter_channel); + TransToChannelLast(ctx, &filter, &transformed_filter_channel); + } else { + transformed_filter_channel.ShareDataWith(filter); + } + output_data = transformed_output.data(); + + // update padding and dilation + auto in_dims = transformed_input_channel.dims(); + auto filter_dims = transformed_filter_channel.dims(); + DDim in_data_dims; + DDim filter_data_dims; + + if (compute_format == paddle::platform::DataLayout::kNCHW) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1); + } + + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); + + DenseTensor transformed_input; + std::vector padding_common(data_dim, 0); + if (!is_sys_pad) { + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + new_input_shape_vec[0] = transformed_input_channel.dims()[0]; + + if (compute_format == paddle::platform::DataLayout::kNCHW) { + new_input_shape_vec[1] = transformed_input_channel.dims()[1]; + } else { + new_input_shape_vec[data_dim + 1] = + transformed_input_channel.dims()[data_dim + 1]; + } + + std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); + padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); + if (compute_format == paddle::platform::DataLayout::kNCHW) { + new_input_shape_vec[i + 2] = + transformed_input_channel.dims()[i + 2] + padding_diff[i]; + } else { + new_input_shape_vec[i + 1] = + transformed_input_channel.dims()[i + 1] + padding_diff[i]; + } + if (compute_format == paddle::platform::DataLayout::kNCHW) { + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } else { + input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; + } + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_input.Resize(new_input_shape); + transformed_input.mutable_data(ctx.GetPlace()); + + const int rank = transformed_input_channel.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction(ctx, + input_pad, + transformed_input_channel, + pad_value, + 
&transformed_input); + } break; + case 5: { + funcs::PadFunction(ctx, + input_pad, + transformed_input_channel, + pad_value, + &transformed_input); + } break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + + } else { + transformed_input.ShareDataWith(transformed_input_channel); + if (paddings.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[2 * i]; + } + } + } + + const T* input_data = transformed_input.data(); + + const T* filter_data = transformed_filter_channel.data(); + + // ------------------- cudnn descriptors --------------------- + paddle::operators::ConvArgs args{&transformed_input, + &transformed_filter_channel, + &transformed_output, + strides, + padding_common, + dilations, + dtype}; + + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + paddle::platform::DataLayout layout = + compute_format == paddle::platform::DataLayout::kNHWC + ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; + if (transformed_input.dims().size() == 5) { + layout = compute_format == paddle::platform::DataLayout::kNHWC + ? paddle::platform::DataLayout::kNDHWC + : paddle::platform::DataLayout::kNCDHW; + } + auto layout_format = paddle::platform::GetCudnnTensorFormat(layout); + + args.handle = handle; + +#ifdef PADDLE_WITH_HIP + // MIOPEN need to set groups in cdesc in miopen_desc.h + args.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + groups); +#else + args.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn()); +#endif + +#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it manually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetConvolutionGroupCount( + args.cdesc.desc(), groups)); + groups = 1; +#endif +#ifdef PADDLE_WITH_HIP + // MIOPEN do not set groups in wdesc after set groups in cdesc + groups = 1; +#endif + args.idesc.set(transformed_input, layout_format); + args.wdesc.set(transformed_filter_channel, layout_format, groups); + args.odesc.set(transformed_output, layout_format); + int i_n, i_c, i_d, i_h, i_w; + int o_n, o_c, o_d, o_h, o_w; + + if (compute_format == paddle::platform::DataLayout::kNHWC) { + paddle::operators::GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNHWC, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + paddle::operators::GetNCDHW(transformed_output.dims(), + paddle::platform::DataLayout::kNHWC, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } else { + paddle::operators::GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + paddle::operators::GetNCDHW(transformed_output.dims(), + paddle::platform::DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = transformed_filter_channel.numel() / groups; + // ------------------- cudnn conv workspace --------------------- + size_t workspace_size = 0; // final workspace to allocate. 
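The per-group offsets computed near the end of the hunk above are plain element strides into contiguous NCHW buffers: each group owns `C/groups` consecutive channels of the input and output and `1/groups` of the filter elements, and the grouped loop later advances the raw data pointers by exactly these amounts. A small worked example with hypothetical shapes:

```cpp
#include <cstdio>

int main() {
  // Hypothetical shapes: input NCHW 1 x 64 x 56 x 56, output 1 x 128 x 56 x 56,
  // filter 128 x 16 x 3 x 3 (Cin / groups = 16), groups = 4.
  const int groups = 4;
  const int i_c = 64, i_d = 1, i_h = 56, i_w = 56;
  const int o_c = 128, o_d = 1, o_h = 56, o_w = 56;
  const int filter_numel = 128 * 16 * 3 * 3;

  const int group_offset_in = i_c / groups * i_h * i_w * i_d;   // 16 * 3136 = 50176
  const int group_offset_out = o_c / groups * o_h * o_w * o_d;  // 32 * 3136 = 100352
  const int group_offset_filter = filter_numel / groups;        // 18432 / 4 = 4608

  // Group g then reads  input  + g * group_offset_in,
  //                     filter + g * group_offset_filter
  // and writes          output + g * group_offset_out.
  std::printf("%d %d %d\n", group_offset_in, group_offset_out, group_offset_filter);
  return 0;
}
```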
+// ------------------- cudnn conv algorithm --------------------- +#ifdef PADDLE_WITH_HIP + miopenConvFwdAlgorithm_t algo{}; + using search = paddle::operators::SearchAlgorithm; + workspace_size = search::GetWorkspaceSize(args); + algo = search::Find( + args, exhaustive_search, deterministic, workspace_size, ctx); +#else + cudnnConvolutionFwdAlgo_t algo{}; + using search = + paddle::operators::SearchAlgorithm; + algo = search::Find(args, exhaustive_search, deterministic, ctx); + workspace_size = search::GetWorkspaceSize(args, algo); +#endif + +#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) + // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ + // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable + // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ + // FWD_ALGO_IMPLICIT_GEMM manually. + if (groups > 1) { + algo = static_cast(0); + } +#endif + + // ------------------- cudnn conv forward --------------------- + paddle::operators::ScalingParamType alpha = 1.0f; + paddle::operators::ScalingParamType beta = 0.0f; + +// NOTE(zhiqiu): inplace addto is not supportted in double grad yet. +// ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; +// VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); + +#ifdef PADDLE_WITH_HIP + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args.idesc.desc(), + input_data, + args.wdesc.desc(), + filter_data, + args.cdesc.desc(), + algo, + &beta, + args.odesc.desc(), + output_data, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args.idesc.desc(), + input_data + i * group_offset_in, + args.wdesc.desc(), + filter_data + i * group_offset_filter, + args.cdesc.desc(), + algo, + workspace_ptr, + workspace_size, + &beta, + args.odesc.desc(), + output_data + i * group_offset_out)); + }, + workspace_size); + } +#endif + + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { + TransToChannelLast(ctx, &transformed_output, output); + } +} + +template +void Conv3DCudnnKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out) { + ConvCudnnKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(conv2d, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnKernel, + float, + phi::dtype::float16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(conv2d, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(conv3d, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(conv2d, + 
GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnKernel, + float, + double, + phi::dtype::float16) {} +#endif + +#endif + +// todo register bfloat16 diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h index eb01b83377cb62c7dc6147cd57edcd3c9c047f78..d7167704a4824b74572bc0e0dd53d7a5c3dbd8c7 100644 --- a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h +++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h @@ -23,10 +23,10 @@ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/math_function.h" -#define SWITCH_OUT_RANK_CASE(n) \ - case n: { \ - ApplyBroadcast(ctx, &in_tensors[i], out_tensors[i]); \ - break; \ +#define SWITCH_OUT_RANK_CASE(n) \ + case n: { \ + ApplyBroadcast(ctx, in_tensors[i], out_tensors[i]); \ + break; \ } namespace phi { @@ -75,7 +75,7 @@ void ApplyBroadcast(const Context& ctx, template void BroadcastTensorsKernel(const Context& ctx, - const std::vector& x, + const std::vector& x, std::vector out) { const auto& in_tensors = x; auto out_tensors = out; diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..93bc5b64adc170901aeffeadfa64d6b5d7ea8c60 --- /dev/null +++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
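One hunk back, `BroadcastTensorsKernel` appears to take its inputs by pointer after this change (the element type is elided in this rendering, but the dropped `&` in `SWITCH_OUT_RANK_CASE` only makes sense if `in_tensors[i]` is already a `const DenseTensor*`). The sketch below is a simplified, non-templated model of that call-site change; the stand-in types and the trivial `ApplyBroadcast` body are illustrative only.

```cpp
#include <cstdio>
#include <vector>

struct DenseTensor { int rank; };  // stand-in, not phi::DenseTensor

// Stand-in for the templated ApplyBroadcast in the real header.
void ApplyBroadcast(const DenseTensor* in, DenseTensor* out) { out->rank = in->rank; }

void BroadcastTensorsKernel(const std::vector<const DenseTensor*>& x,
                            std::vector<DenseTensor*> out) {
  for (size_t i = 0; i < x.size(); ++i) {
    ApplyBroadcast(x[i], out[i]);  // previously `&in_tensors[i]` when x held values
  }
}

int main() {
  DenseTensor a{2}, b{3}, oa{0}, ob{0};
  std::vector<const DenseTensor*> in = {&a, &b};
  std::vector<DenseTensor*> out = {&oa, &ob};
  BroadcastTensorsKernel(in, out);
  std::printf("%d %d\n", oa.rank, ob.rank);  // prints: 2 3
  return 0;
}
```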
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/eigen.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#endif + +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/padding.h" + +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +DECLARE_bool(cudnn_deterministic); +DECLARE_uint64(conv_workspace_size_limit); +DECLARE_bool(cudnn_exhaustive_search); + +namespace phi { + +static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) { + return dev_ctx.GetComputeCapability() >= 70; +} + +// inline cudnnTensorFormat_t GetCudnnTensorFormat( +// const phi::DataLayout& order) { // Not use +// switch (order) { +// case phi::DataLayout::kNHWC: +// return CUDNN_TENSOR_NHWC; +// case phi::DataLayout::kNCHW: +// return CUDNN_TENSOR_NCHW; +// case phi::DataLayout::NCDHW: +// return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same +// case phi::DataLayout::NDHWC: +// return CUDNN_TENSOR_NHWC; // add, liyamei +// default: +// PADDLE_THROW(phi::errors::Unimplemented( +// "CUDNN has no equivalent dataLayout for input order.")); +// } +// return CUDNN_TENSOR_NCHW; +// } + +static inline void GetNCDHW(const DDim& dims, + const phi::DataLayout& layout, + int* N, + int* C, + int* D, + int* H, + int* W) { + *N = dims[0]; + *C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + int i = layout == phi::DataLayout::kNCHW ? 0 : 1; + if (dims.size() == 5) { + *D = dims[2 - i]; + *H = dims[3 - i]; + *W = dims[4 - i]; + } else { + *D = 1; + *H = dims[2 - i]; + *W = dims[3 - i]; + } +} + +} // namespace phi + +// PD_REGISTER_KERNEL(convdnn, GPU, ALL_LAYOUT, phi::ConvKernel, float, double +// ) {} diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..fbcebf371a61bd3d652888b5eaad56185499726b --- /dev/null +++ b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h @@ -0,0 +1,330 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
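For reference, the `GetNCDHW` helper added in `conv_cudnn_impl.h` above splits a 4-D or 5-D shape into N/C/D/H/W according to the layout, setting D to 1 for 4-D tensors. The snippet below repeats that arithmetic with `std::vector<int64_t>` standing in for `phi::DDim` so it compiles on its own; the stand-in types are the only liberty taken.

```cpp
#include <cstdio>
#include <vector>

enum class DataLayout { kNCHW, kNHWC };

// Same index arithmetic as the GetNCDHW helper above.
void GetNCDHW(const std::vector<int64_t>& dims, DataLayout layout,
              int* N, int* C, int* D, int* H, int* W) {
  *N = static_cast<int>(dims[0]);
  *C = static_cast<int>(layout == DataLayout::kNCHW ? dims[1] : dims.back());
  const int i = layout == DataLayout::kNCHW ? 0 : 1;
  if (dims.size() == 5) {
    *D = static_cast<int>(dims[2 - i]);
    *H = static_cast<int>(dims[3 - i]);
    *W = static_cast<int>(dims[4 - i]);
  } else {
    *D = 1;
    *H = static_cast<int>(dims[2 - i]);
    *W = static_cast<int>(dims[3 - i]);
  }
}

int main() {
  int n, c, d, h, w;
  GetNCDHW({8, 3, 224, 224}, DataLayout::kNCHW, &n, &c, &d, &h, &w);
  std::printf("NCHW : N=%d C=%d D=%d H=%d W=%d\n", n, c, d, h, w);   // 8 3 1 224 224
  GetNCDHW({8, 16, 112, 112, 64}, DataLayout::kNHWC, &n, &c, &d, &h, &w);
  std::printf("NDHWC: N=%d C=%d D=%d H=%d W=%d\n", n, c, d, h, w);   // 8 64 16 112 112
  return 0;
}
```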
+ +#pragma once + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvGradGradKernel(const Context& dev_ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + const DenseTensor* X = &input; + const DenseTensor* dY = &out_grad; + const DenseTensor* ddX = input_grad_grad.get_ptr(); + const DenseTensor* ddW_in = filter_grad_grad.get_ptr(); + + DenseTensor* ddY = out_grad_grad; + DenseTensor* dW = filter_grad; + DenseTensor* dX = input_grad; + DenseTensor W = filter; + + if (!ddY && !dW && !dX) return; + + const std::vector strides = strides_t; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform Tensor + DenseTensor transformed_X(X->type()); + DenseTensor transformed_dY(dY->type()); + DenseTensor transformed_ddX(X->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, X, &transformed_X); + TransToChannelFirst(dev_ctx, X, &transformed_X); + + ResizeToChannelFirst(dev_ctx, dY, &transformed_dY); + TransToChannelFirst(dev_ctx, dY, &transformed_dY); + + if (ddX) { + ResizeToChannelFirst(dev_ctx, ddX, &transformed_ddX); + TransToChannelFirst(dev_ctx, ddX, &transformed_ddX); + } + } else { + transformed_X = *X; + transformed_dY = *dY; + if (ddX) { + transformed_ddX = *ddX; + } + } + + // update padding and dilation + auto in_dims = transformed_X.dims(); + auto filter_dims = W.dims(); + + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_X.dims()[0]); + std::vector filter_shape_vec(vectorize(W.dims())); + std::vector output_shape_vec(vectorize(transformed_dY.dims())); + + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + // col_shape [in_channel/group, kh, kw, oh, ow] + col_shape_vec[0] = transformed_X.dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; + } + DDim col_shape(make_ddim(col_shape_vec)); + // col_matrix_shape [in_channel/group * kh * kw, oh * ow] + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); + // input_shape [Cin, H, W] + DDim input_shape = + slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); + // filter_matrix_shape [Cout, Cin * kh * kw] + DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]}; + + W.Resize(filter_matrix_shape); + DDim output_matrix_shape = { + transformed_dY.dims()[1], 
+ transformed_dY.numel() / + (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; + int in_step = static_cast(transformed_X.dims()[1]) / groups; + int out_step = static_cast(transformed_dY.dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + DenseTensor col; + DenseTensor col_matrix; + if (is_expand) { + col.Resize(col_shape); + col.mutable_data(dev_ctx.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + phi::funcs::SetConstant set_zero; + auto blas = phi::funcs::GetBlas(dev_ctx); + + // dx convolution double grad: gemm + col2im(col2vol) + // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, + // oH, oW) + if (dX && ddW_in) { + Tensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + dX->mutable_data(dev_ctx.GetPlace()); + + DenseTensor transformed_dX(dX->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, dX, &transformed_dX); + + } else { + transformed_dX = *dX; + } + // if is_expand is false, the operation of set_zero is unnecessary + // because math::matmul will reset dx + if (is_expand) { + set_zero(dev_ctx, &transformed_dX, static_cast(0)); + } + paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math:: + Col2ImFunctor + col2im; + + for (int i = 0; i < batch_size; i++) { + DenseTensor dy_batch = + transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col_matrix.ShareDataWith(dx_slice); + col_matrix.Resize(col_matrix_shape); + } + blas.MatMul( + ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, + col, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &dx_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); + } + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_dX, dX); + } + } + + // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, + // oH, oW) + // dw convolution double grad: im2col(vol2col) + gemm + if (dW && ddX) { + dW->mutable_data(dev_ctx.GetPlace()); + set_zero(dev_ctx, dW, static_cast(0)); + DenseTensor dW_arr = *dW; + dW_arr.Resize(filter_matrix_shape); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + DenseTensor dy_batch = + transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); + Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; ++g) { + // im2col + DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + ddx_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + 
} + + DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0)); + } + } + } + + // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), + // w/ddw(Cout, Cin, kh, kw) + // ddy convolution double grad: im2col(vol2col) + gemm + if (ddY) { + ddY->mutable_data(dev_ctx.GetPlace()); + + DenseTensor transformed_ddY(ddY->type()); + if (channel_last) { + ResizeToChannelFirst(dev_ctx, ddY, &transformed_ddY); + } else { + transformed_ddY = *ddY; + } + + set_zero(dev_ctx, &transformed_ddY, static_cast(0)); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + DenseTensor ddy_batch = + transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); + for (int g = 0; g < groups; ++g) { + // gemm + DenseTensor ddy_slice = + ddy_batch.Slice(g * out_step, (g + 1) * out_step); + + if (ddX) { + DenseTensor ddx_batch = + transformed_ddX.Slice(i, i + 1).Resize(input_shape); + DenseTensor ddx_slice = + ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + ddx_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0)); + } + + if (ddW_in) { + DenseTensor x_batch = + transformed_X.Slice(i, i + 1).Resize(input_shape); + DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); + + DenseTensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + if (!is_expand) { + col.ShareDataWith(x_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + x_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); + } + + // gemm + DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0)); + } + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_ddY, ddY); + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f1971aca800b59171a2e741dbebce6d8adaf7899 --- /dev/null +++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h @@ -0,0 +1,257 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
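The three branches of ConvGradGradKernel above reduce, per group and per batch element, to plain GEMMs between unfolded tensors: dX comes from ddW^T * dY (then col2im/col2vol), dW accumulates dY * im2col(ddX)^T, and ddY is W * im2col(ddX) + ddW * im2col(X). The sketch below replays those products for the degenerate case of a 1x1 convolution with stride 1, where im2col is the identity. MatMul here is a naive local helper, not the phi BLAS wrapper, and every shape and value is made up purely for illustration.

#include <cassert>
#include <cstdio>
#include <vector>

// Dense row-major product C = op(A) * op(B); a tiny stand-in for blas.MatMul.
static std::vector<double> MatMul(const std::vector<double>& A, int ar, int ac, bool transA,
                                  const std::vector<double>& B, int br, int bc, bool transB) {
  int m = transA ? ac : ar, k = transA ? ar : ac, n = transB ? br : bc;
  assert(k == (transB ? bc : br));
  std::vector<double> C(m * n, 0.0);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p) {
        double a = transA ? A[p * ac + i] : A[i * ac + p];
        double b = transB ? B[j * bc + p] : B[p * bc + j];
        C[i * n + j] += a * b;
      }
  return C;
}

int main() {
  // 1x1 convolution viewed as Y = W * X, with Cout=2, Cin=2, P=3 spatial positions.
  int Cout = 2, Cin = 2, P = 3;
  std::vector<double> W   = {1, 2, 3, 4};             // Cout x Cin
  std::vector<double> X   = {1, 0, 2, 0, 1, 1};       // Cin x P
  std::vector<double> dY  = {1, 1, 1, 2, 2, 2};       // Cout x P (upstream gradient)
  std::vector<double> ddX = {0.5, 0, 0, 0, 0.5, 0};   // Cin x P (grad-of-grad wrt input)
  std::vector<double> ddW = {0, 1, 1, 0};             // Cout x Cin (grad-of-grad wrt filter)

  // dX = ddW^T * dY   -> mirrors blas.MatMul(ddw_slice, true, dy_slice, false, ...)
  auto dX = MatMul(ddW, Cout, Cin, true, dY, Cout, P, false);
  // dW = dY * ddX^T   -> mirrors blas.MatMul(dy_slice, false, col_matrix, true, ...)
  auto dW = MatMul(dY, Cout, P, false, ddX, Cin, P, true);
  // ddY = W * ddX + ddW * X -> the two accumulated MatMul calls in the ddY branch
  auto ddY = MatMul(W, Cout, Cin, false, ddX, Cin, P, false);
  auto tmp = MatMul(ddW, Cout, Cin, false, X, Cin, P, false);
  for (int i = 0; i < Cout * P; ++i) ddY[i] += tmp[i];

  std::printf("dX[0]=%.1f dW[0]=%.1f ddY[0]=%.1f\n", dX[0], dW[0], ddY[0]);
  return 0;
}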
+ +#pragma once + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/kernels/conv_grad_kernel.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvGradKernel(const Context& dev_ctx, + const DenseTensor& output_grad, + const DenseTensor& input, + const DenseTensor& filter_t, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + // The filter and filter_grad will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. + + if (!input_grad && !filter_grad) return; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + DenseTensor filter = filter_t; + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + DenseTensor transformed_input(input.type()); + DenseTensor transformed_output_grad(output_grad.type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, &input, &transformed_input); + TransToChannelFirst(dev_ctx, &input, &transformed_input); + + ResizeToChannelFirst( + dev_ctx, &output_grad, &transformed_output_grad); + TransToChannelFirst( + dev_ctx, &output_grad, &transformed_output_grad); + } else { + transformed_input = input; + transformed_output_grad = output_grad; + } + + // update padding and dilation + auto in_dims = transformed_input.dims(); + auto filter_dims = filter.dims(); + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_input.dims()[0]); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec( + vectorize(transformed_output_grad.dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = transformed_input.dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + DDim col_shape(make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) + // or + // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); + + DDim input_shape = + slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size()); + + DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + DDim output_matrix_shape = { + transformed_output_grad.dims()[1], + transformed_output_grad.numel() / 
(transformed_output_grad.dims()[0] * + transformed_output_grad.dims()[1])}; + + // convolution backward input operator: gemm + col2im(or col2vol) + // convolution backward weight operator: im2col(or vol2col) + gemm + int in_step = static_cast(transformed_input.dims()[1]) / groups; + int out_step = static_cast(transformed_output_grad.dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + + DenseTensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + DenseTensor col_matrix; + if (is_expand) { + col.Resize(col_shape); + col.mutable_data(dev_ctx.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + phi::funcs::SetConstant set_zero; + auto blas = phi::funcs::GetBlas(dev_ctx); + + if (input_grad) { + input_grad->mutable_data(dev_ctx.GetPlace()); + DenseTensor transformed_input_grad(input_grad->type()); + if (channel_last) { + ResizeToChannelFirst( + dev_ctx, input_grad, &transformed_input_grad); + + } else { + transformed_input_grad = *input_grad; + } + // if is_expand is false, the operation of set_zero is unnecessary, + // because math::matmul will reset input_grad. + if (is_expand) { + set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); + } + paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math:: + Col2ImFunctor + col2im; + + for (int i = 0; i < batch_size; i++) { + DenseTensor out_grad_batch = + transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor in_grad_batch = + transformed_input_grad.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + DenseTensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor filter_slice = + filter.Slice(g * out_step, (g + 1) * out_step); + + DenseTensor in_grad_slice = + in_grad_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col_matrix.ShareDataWith(in_grad_slice); + col_matrix.Resize(col_matrix_shape); + } + blas.MatMul(filter_slice, + true, + out_grad_slice, + false, + T(1.0), + &col_matrix, + T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, + col, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &in_grad_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); + } + } + } + if (channel_last) { + TransToChannelLast( + dev_ctx, &transformed_input_grad, input_grad); + } + } + + if (filter_grad) { + filter_grad->mutable_data(dev_ctx.GetPlace()); + Tensor filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + set_zero(dev_ctx, filter_grad, static_cast(0)); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; i++) { + DenseTensor out_grad_batch = + transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor in_batch = + transformed_input.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // im2col + DenseTensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + 
in_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + DenseTensor filter_grad_slice = + filter_grad_.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul(out_grad_slice, + false, + col_matrix, + true, + T(1.0), + &filter_grad_slice, + T(1.0)); + } + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..1945468f02551b8e348687ae578c9f23a038b8ca --- /dev/null +++ b/paddle/phi/kernels/impl/conv_kernel_impl.h @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter_t, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* output) { + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + DenseTensor filter = filter_t; + // The filter will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. 
+ output->mutable_data(dev_ctx.GetPlace()); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + DenseTensor transformed_input(input.type()); + DenseTensor transformed_output(output->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, &input, &transformed_input); + TransToChannelFirst(dev_ctx, &input, &transformed_input); + + ResizeToChannelFirst(dev_ctx, output, &transformed_output); + + } else { + transformed_input = input; + transformed_output = *output; + } + + // update padding and dilation + auto trans_in_dims = transformed_input.dims(); + auto filter_dims = filter.dims(); + + DDim in_data_dims = slice_ddim(trans_in_dims, 2, trans_in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_input.dims()[0]); + + // filter_shape_vec: + // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(vectorize(filter.dims())); + + // output_shape_vec: + // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec(vectorize(transformed_output.dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: + // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, + // o_d,o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = trans_in_dims[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + + DDim col_shape(make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: + // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * + // o_w) + + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim); + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + + DenseTensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. 
+ DenseTensor col_matrix; + if (is_expand) { + // col = context.AllocateTmpTensor(col_shape, dev_ctx); + col.Resize(col_shape); + col.mutable_data(dev_ctx.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + DDim in_matrix_shape = + slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size()); + + DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + DDim output_matrix_shape = { + transformed_output.dims()[1], + transformed_output.numel() / + (transformed_output.dims()[0] * transformed_output.dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(transformed_input.dims()[1]) / groups; + int out_step = static_cast(transformed_output.dims()[1]) / groups; + + paddle::operators::math::Vol2ColFunctor vol2col; + paddle::operators::math:: + Im2ColFunctor + im2col; + + auto blas = phi::funcs::GetBlas(dev_ctx); + for (int i = 0; i < batch_size; i++) { + DenseTensor in_batch = + transformed_input.Slice(i, i + 1).Resize(in_matrix_shape); + DenseTensor out_batch = + transformed_output.Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + in_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + DenseTensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + filter_slice, false, col_matrix, false, T(1.0), &out_slice, T(0.0)); + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_output, output); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index f2549c171dda00ecab0baf8b6a7cdfb26ddea4d0..7c8d10e05653d0bacd0ff93d5363f0c6a617f0c3 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -60,7 +60,7 @@ struct ReduceSumForMatmulGrad { DenseTensor* output, const std::vector& reduce_dims) { auto stream = dev_ctx.stream(); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims, stream); } }; diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..a755fdb19ec4b86d4b5265c1d6bce5eecdb9b5b3 --- /dev/null +++ b/paddle/phi/ops/compat/conv2d_sig.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d", + {"Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {"Output"}); +} + +KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_grad", + {GradVarName("Output"), "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +KernelSignature Conv2dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_grad_grad", + {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {"DDOutput", "DInput", "DFilter"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_grad, phi::Conv2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_grad_grad, + phi::Conv2dDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..a036afac82a8d49455b1a226e62f5fe757d4b4b9 --- /dev/null +++ b/paddle/phi/ops/compat/conv3d_sig.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
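For orientation on the compat layer used by conv2d_sig.cc above: the three brace lists in a KernelSignature are positional, and they must line up with the phi kernel's parameter order (here, ConvKernel from conv_kernel_impl.h earlier in this patch). The restated mapping below is a hypothetical, unregistered duplicate added only to annotate that correspondence; the "Annotated" name does not exist in the codebase.

#include "paddle/phi/core/compat/op_utils.h"

namespace phi {

// Same content as Conv2dOpArgumentMapping above, with comments showing which
// phi-side kernel parameter each fluid-side name feeds. Not registered anywhere.
KernelSignature Conv2dOpArgumentMappingAnnotated(
    const ArgumentMappingContext& ctx) {
  return KernelSignature(
      "conv2d",
      {"Input", "Filter"},      // -> input, filter tensors of ConvKernel
      {"strides",               // -> strides
       "paddings",              // -> paddings
       "padding_algorithm",     // -> padding_algorithm
       "groups",                // -> groups
       "dilations",             // -> dilations
       "data_format",           // -> data_format
       "use_addto",             // -> use_addto
       "workspace_size_MB",     // -> workspace_size_MB
       "exhaustive_search"},    // -> exhaustive_search
      {"Output"});              // -> output tensor
}

}  // namespace phi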
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv3d", + {"Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {"Output"}); +} + +KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_grad", + {GradVarName("Output"), "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +KernelSignature Conv3dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("conv3d_grad_grad", + {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {"DDOutput", "DInput", "DFilter"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(conv3d, phi::Conv3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv3d_grad, phi::Conv3dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv3d_grad_grad, + phi::Conv3dDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/depthwise_conv2d_sig.cc b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..e2b6801f73bcdb0f20090e3ea7e75b7257bde4e3 --- /dev/null +++ b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DepthwiseConv2dOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d", + {"Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search", + "fuse_relu_before_depthwise_conv"}, + {"Output"}); +} + +KernelSignature DepthwiseConv2dGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d_grad", + {GradVarName("Output"), "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search", + "fuse_relu_before_depthwise_conv"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +KernelSignature DepthwiseConv2dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d_grad_grad", + {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search", + "fuse_relu_before_depthwise_conv"}, + {"DDOutput", "DInput", "DFilter"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d, + phi::DepthwiseConv2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad, + phi::DepthwiseConv2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad_grad, + phi::DepthwiseConv2dDoubleGradOpArgumentMapping); diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index 69922c055cbac5fe3c3947d0d8d63ee4a1262a4c..a4e89231e14f82c34e85a426d0189053362800aa 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -43,7 +43,7 @@ template void FakeDot(const Context& dev_ctx, const phi::DenseTensor& x, const phi::DenseTensor& y, - const std::vector& fake_input_vec, + const std::vector& fake_input_vec, bool fake_attr_bool, int fake_attr_int, float fake_attr_float, diff --git a/paddle/phi/tests/kernels/test_concat_dev_api.cc b/paddle/phi/tests/kernels/test_concat_dev_api.cc index 55dd6dce1aaaeddf7e1acfd1c458b15042d5a629..7f954085f601cdc0321fb133abd716112367b113 100644 --- a/paddle/phi/tests/kernels/test_concat_dev_api.cc +++ b/paddle/phi/tests/kernels/test_concat_dev_api.cc @@ -53,7 +53,7 @@ TEST(DEV_API, concat) { } } - std::vector inputs = {dense_x, dense_y}; + std::vector inputs = {&dense_x, &dense_y}; // 2. test API phi::CPUContext dev_ctx; diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 75b27e4165d177c2285a6b109922ce159eeb8b82..fb7be82d1c5a5cbe9656efe1aa41222072977534 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -92,7 +92,7 @@ function infrt_gen_and_build() { exit 7; fi - make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? + make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-ir-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? 
if [ "$build_error" != 0 ];then exit 7; fi diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 41e5e0469dcb4099a89a0517207b8f6d8d39632b..175b4be295ee3af30e16715503cdf5e3a537ad79 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -330,7 +330,7 @@ function check_style() { # pre-commit use python3.8.0 OLD_PATH=$PATH - export PATH=export PATH=/usr/local/python3.8.0/bin:/usr/local/python3.8.0/include:/usr/local/bin:${PATH} + export PATH=/usr/local/python3.8.0/bin:/usr/local/python3.8.0/include:/usr/local/bin:${PATH} pre-commit install clang-format --version @@ -2754,17 +2754,20 @@ function build_pr_and_develop() { fi git fetch upstream develop + git checkout develop dev_commit=`git log -1|head -1|awk '{print $2}'` dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl" url_return=`curl -s -m 5 -IL ${dev_url} |awk 'NR==1{print $2}'` if [ "$url_return" == '200' ];then - mkdir ${PADDLE_ROOT}/build/dev_whl && wget -P ${PADDLE_ROOT}/build/dev_whl ${dev_url} + mkdir ${PADDLE_ROOT}/build/dev_whl && wget -q -P ${PADDLE_ROOT}/build/dev_whl ${dev_url} + cp ${PADDLE_ROOT}/build/dev_whl/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl ${PADDLE_ROOT}/build/python/dist else git checkout -b develop_base_pr upstream/$BRANCH cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} generate_api_spec "$1" "DEV" mkdir ${PADDLE_ROOT}/build/dev_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/dev_whl fi + } function build_develop() { diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 807f7c151964e39c8d14136b91b5ecb15b88a4ce..49ae8f5fd56d375dd7d1b4b4d772892b7724b9b6 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -355,6 +355,8 @@ set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200) set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200) +set_tests_properties(test_imperative_qat_user_defined PROPERTIES TIMEOUT 200) + if(LINUX AND WITH_MKLDNN) set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py index dc460cb16f68c14df2cd7f7f087c602b945ffc7d..ca77177125fcdddf198e6783939bf84b4ccd9b0e 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py @@ -230,4 +230,5 @@ def load_tests(loader, standard_tests, pattern): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py index f933d5bf7a48f14b0f4cb4f7ce274744f28c4c24..892fa649a6c5b31b54db8204acc76a7cc8794136 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py @@ -18,6 +18,7 @@ import paddle.fluid.dygraph as dg import paddle.nn.functional as F import paddle.fluid.initializer as I import unittest +import paddle def _reverse_repeat_list(t, n): @@ -284,4 +285,5 @@ def 
load_tests(loader, standard_tests, pattern): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 826f886dab1725e9e26a8826a2277a2d99f93fb6..6a9f7a47f66cce3879e813836745b3e609affd50 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -604,7 +604,7 @@ class TestWithInput1x1Filter1x1(TestConv2DOp): self.groups = 3 -#----------------Conv2DCUDNN---------------- +# #----------------Conv2DCUDNN---------------- create_test_cudnn_class(TestConv2DOp) create_test_cudnn_class(TestWithPad) diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 5f23d04dde51cc21e66098ee6e37027bf82d7537..8cf779ccfdd4292f4cb6cbe74bf58b8ee7b37411 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -20,6 +20,7 @@ import numpy as np import paddle.fluid.core as core from op_test import OpTest import paddle.fluid as fluid +import paddle def conv3d_forward_naive(input, @@ -1001,4 +1002,5 @@ class TestConv3DAPI_Error(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py index 81c6aa1fd17d9ad16a1d24f32e5f55b3b71ca629..784d89b93f9859253a5722232954e7db1080afed 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers @@ -44,7 +45,6 @@ class TestConvDoubleGradCheck(unittest.TestCase): def test_grad(self): places = [fluid.CPUPlace()] - places = [] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) @@ -120,7 +120,8 @@ class TestConv3DDoubleGradCheck(unittest.TestCase): [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) def test_grad(self): - places = [fluid.CPUPlace()] + #places = [fluid.CPUPlace()] + places = [] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: @@ -503,4 +504,5 @@ class TestDepthWiseConvDoubleGradCheck(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py index cec48724da2fe08d0eefcdf0d2df3c54c9aa363d..8e0a744ecdbdabac1a248ed1f0a0d08934749e55 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py @@ -534,4 +534,5 @@ class TestFunctionalConv2DErrorCase13(TestFunctionalConv2DErrorCase12): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py index 8ccaf30cbdb34bcd215bd6b76431c7a6acfeaa3e..6c208160658820d57456e46f86b76659f4e4f80d 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py @@ -509,4 +509,5 @@ class TestFunctionalConv3DErrorCase12(TestFunctionalConv3DErrorCase11): if __name__ == "__main__": + 
paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py index 9b739ebdfb23c680a86a54a3fa00398805ee8968..d391b04aa4772efbf7fadb7a9556aafd445197db 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py @@ -117,4 +117,5 @@ class TestMNIST(TestParallelExecutorBase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index 04a0e5e4cd10f7ece370e879986056d508c894ff..3e222e3c658ecd105811f3694a25d20f1826bcda 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -24,6 +24,7 @@ import paddle.fluid.core as core from test_imperative_base import new_program_scope from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph import Linear +from paddle.fluid.framework import _test_eager_guard # Can use Amusic dataset as the DeepCF describes. DATA_PATH = os.environ.get('DATA_PATH', '') @@ -294,9 +295,42 @@ class TestDygraphDeepCF(unittest.TestCase): sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss2)) + with fluid.dygraph.guard(): + with _test_eager_guard(): + paddle.seed(seed) + paddle.framework.random._manual_program_seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + deepcf = DeepCF(num_users, num_items, matrix) + adam = fluid.optimizer.AdamOptimizer( + 0.01, parameter_list=deepcf.parameters()) + + for e in range(NUM_EPOCHES): + sys.stderr.write('epoch %d\n' % e) + for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): + if slice + BATCH_SIZE >= users_np.shape[0]: + break + prediction = deepcf( + to_variable(users_np[slice:slice + BATCH_SIZE]), + to_variable(items_np[slice:slice + BATCH_SIZE])) + loss = fluid.layers.reduce_sum( + fluid.layers.log_loss(prediction, + to_variable( + labels_np[slice:slice + + BATCH_SIZE]))) + loss.backward() + adam.minimize(loss) + deepcf.clear_gradients() + eager_loss = loss.numpy() + sys.stderr.write('eager loss: %s %s\n' % + (slice, eager_loss)) + self.assertEqual(static_loss, dy_loss) self.assertEqual(static_loss, dy_loss2) + self.assertEqual(static_loss, eager_loss) if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py index d0b3adc490945377a4dd05f9c414cd5c35c7fae5..f12ca0a93ffd9441761c2da866c2c811a30c6e68 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py @@ -16,9 +16,11 @@ from __future__ import print_function import unittest import paddle.fluid as fluid +import paddle import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding import paddle.fluid.framework as framework +from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope @@ -60,6 +62,25 @@ class TestRecurrentFeed(unittest.TestCase): original_in1.stop_gradient = True 
rt.clear_gradients() + with fluid.dygraph.guard(): + with _test_eager_guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + original_in1 = to_variable(original_np1) + original_in2 = to_variable(original_np2) + original_in1.stop_gradient = False + original_in2.stop_gradient = False + rt = RecurrentTest("RecurrentTest") + + for i in range(3): + sum_out, out = rt(original_in1, original_in2) + original_in1 = out + eager_sum_out_value = sum_out.numpy() + sum_out.backward() + eager_dyout = out.gradient() + original_in1.stop_gradient = True + rt.clear_gradients() + with new_program_scope(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed @@ -88,8 +109,11 @@ class TestRecurrentFeed(unittest.TestCase): original_np1 = static_out_value self.assertTrue(np.array_equal(static_sum_out, sum_out_value)) + self.assertTrue(np.array_equal(static_sum_out, eager_sum_out_value)) self.assertTrue(np.array_equal(static_dout, dyout)) + self.assertTrue(np.array_equal(static_dout, eager_dyout)) if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 33f304ef33d67004585a01142d24b17bfd4908bc..0a08aa4ba12693e2216ae1b131ea41a5abaabd2a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -617,4 +617,5 @@ class TestStarGANWithGradientPenalty(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 6c07cdec2ee19c9e689f354d4a5314049235402c..601248a41763989df5e0324f16180ce2df2314dd 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -458,7 +458,7 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. elif self.inputs['input_info'][ param] == "const std::vector&": meta_tensor_code = meta_tensor_code + f""" -{code_indent} auto {param}_meta_vec = MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param}); +{code_indent} auto {param}_meta_vec = MakeMetaTensor({PREFIX_TENSOR_NAME}{param}); {code_indent} std::vector {param}_metas({param}_meta_vec.size()); {code_indent} for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{ {code_indent} {param}_metas[i] = &{param}_meta_vec[i]; @@ -502,7 +502,7 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. input_trans_map = { 'const Tensor&': 'const phi::DenseTensor&', 'const std::vector&': - 'const std::vector&', + 'const std::vector&', 'const paddle::optional&': 'paddle::optional', 'const paddle::optional>&': @@ -539,9 +539,22 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. 
{code_indent} }}""" else: - input_tensor_code = input_tensor_code + f""" + if self.inputs['input_info'][input_name] == "const Tensor&": + input_tensor_code = input_tensor_code + f""" {code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag});""" + elif self.inputs['input_info'][ + input_name] == "const std::vector&": + input_tensor_code = input_tensor_code + f""" +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_vec = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag}); +{code_indent} std::vector {PREFIX_TENSOR_NAME}{input_name}({PREFIX_TENSOR_NAME}{input_name}_vec->size()); +{code_indent} for (size_t i = 0; i < {PREFIX_TENSOR_NAME}{input_name}.size(); ++i) {{ +{code_indent} {PREFIX_TENSOR_NAME}{input_name}[i] = &{PREFIX_TENSOR_NAME}{input_name}_vec->at(i); +{code_indent} }}""" + + else: + # do nothing + pass else: if input_name in self.optional_vars: input_tensor_code = input_tensor_code + f""" @@ -561,7 +574,14 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. if param in self.optional_vars: kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", " else: - kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " + if self.inputs['input_info'][param] == "const Tensor&": + kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " + elif self.inputs['input_info'][ + input_name] == "const std::vector&": + kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", " + else: + # do nothing + pass kernel_in_type = input_trans_map[input_infos[param]] kernel_args_type_list.append(kernel_in_type) elif param in attr_names: diff --git a/tools/infrt/fake_models/multi_fc.py b/tools/infrt/fake_models/multi_fc.py index 03cf6828cc7e19eedd7b2cd1375b93859e9f3cfa..0d633cfc60a9b6cddc669da0dbc87667f8211714 100644 --- a/tools/infrt/fake_models/multi_fc.py +++ b/tools/infrt/fake_models/multi_fc.py @@ -19,7 +19,6 @@ import sys, os import numpy as np import paddle import paddle.fluid as fluid -from paddle.fluid.backward import append_backward size = 2 num_layers = 4 diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index 8efa03306fb1de4a4a39256c881d87479a8ac25a..f3a78a8d4e8597157e197bf0c0e82784293933f7 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -16,7 +16,7 @@ import json import sys attr_type_converter = {"i": 'SI32Attr', "b": 'BoolAttr', "l": 'SI64Attr'} -supported_kernels = ['sign', 'dot', 'digamma', 'conj'] +supported_kernels = ['sign', 'dot', 'digamma', 'conj', 'abs', 'add_raw'] target_type_converter = {"CPU": "CPU", "GPU": "GPU"} layout_type_converter = { @@ -66,7 +66,8 @@ def generate_attrs_info(op_name, attrs_info): 'digamma': [], 'lerp': [], 'cast': ['out_dtype', 'in_dtype'], - 'abs': [] + 'abs': [], + 'add_raw': ['axis'], } attrs_args_ = "" if len(kernel_attrs_names[op_name]) == len(attrs_info):