Unverified commit d50fb43e, authored by hong, committed by GitHub

Move conv to pten (#39354)

* move conv to pten

* move conv to pten; test=develop

* fix bug;

* add conv cudnn impl; test=develop

* update

* update operator; test=develop

* fix bug; test=develop

* move operator and prepared_operator to develop; test=develop

* resolve conflict; test=develop

* remove useless code;test=develop

* add dependency; test=develop

* fix bug;

* add sig.cc ; test=develop

* fix use_op error; test=develop

* fix bug; test=develop

* fix bug; test=develop

* add conv3d register; test=develop

* fix star gan and conv_nn_grad test failed; test=develop

* add header; test=develop

* manually recover to develop;

* resolve conflict; test=develop

* remove useless code

* fix bug;

* remove conv2d_cudnn; test=develop

* fix bugs; test=develop

* fix cpu rocm compile bugs; test=develop

* fix blas error; test=develop

* fix compile bug; test=develop

* fix windows compile error; test=develop

* fix windows error; test=develop

* resolve conflict; test=develop
Parent commit eaacf8bf
@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-USE_OP(conv2d);
+USE_OP_ITSELF(conv2d);
USE_OP(conv2d_transpose);
namespace paddle {
...
@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
namespace paddle {
namespace operators {
@@ -53,12 +54,11 @@ static inline void GetNCDHW(const framework::DDim& dims,
}
template <typename DeviceContext, typename T, size_t D>
-static void RemovePaddingSlice(const framework::ExecutionContext& context,
+static void RemovePaddingSlice(const phi::GPUContext& context,
const Tensor* input, Tensor* out,
const std::vector<int>& starts,
const std::vector<int>& axes) {
-auto& place =
-    *context.template device_context<DeviceContext>().eigen_device();
+auto& place = *context.eigen_device();
auto in_dims = input->dims();
auto new_out_dims = out->dims();
auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
@@ -171,11 +171,10 @@ void ChooseAlgo(const std::vector<PerfType>& perf_results,
using framework::ConvSearchCache;
-static void SetConvMathType(const framework::ExecutionContext& ctx,
-                            cudnnDataType_t dtype,
+static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+auto& dev_ctx = ctx;
if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
@@ -231,8 +230,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                   bool deterministic,
-                   const framework::ExecutionContext& ctx) {
+                   bool deterministic, const phi::GPUContext& ctx) {
auto dtype = platform::CudnnDataType<T>::type;
bool has_got_workspace_size = true;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
@@ -284,8 +282,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
} else if (deterministic) {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(1);
} else {
-auto& dev_ctx =
-    ctx.template device_context<platform::CUDADeviceContext>();
+auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
@@ -346,8 +343,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                   bool deterministic,
-                   const framework::ExecutionContext& ctx) {
+                   bool deterministic, const phi::GPUContext& ctx) {
auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0;
@@ -413,8 +409,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
} else if (deterministic) {
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else {
-auto& dev_ctx =
-    ctx.template device_context<platform::CUDADeviceContext>();
+auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
@@ -478,8 +473,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                   bool deterministic,
-                   const framework::ExecutionContext& ctx) {
+                   bool deterministic, const phi::GPUContext& ctx) {
platform::CUDAGraphCaptureModeGuard guard;
auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
@@ -534,8 +528,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
} else if (deterministic) {
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else {
-auto& dev_ctx =
-    ctx.template device_context<platform::CUDADeviceContext>();
+auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetBackwardFilter());
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/kernels/funcs/padding.h"
DECLARE_bool(cudnn_deterministic);
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout;
static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) {
return dev_ctx.GetComputeCapability() >= 70;
}
template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
const Tensor* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") &&
ctx.Attr<bool>("exhaustive_search"));
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
platform::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
// HIP MIOPEN ONLY SUPPORT NCHW format
auto compute_format = DataLayout::kNCHW;
#else
// Tensor Cores, introduced with Volta GPUs, support faster conv ops
// with FP16 in NHWC data format.
const bool compute_in_nhwc =
dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
// We will only do data format conversion from NHWC to NCHW.
// cudnn will convert NCHW to NHWC automatically on Tensor Core.
auto compute_format =
compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
#endif
VLOG(3) << "Compute ConvOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
// ------------ transformed tensor -----------
Tensor transformed_input_channel(input->type());
Tensor transformed_output(output->type());
Tensor transformed_filter_channel(filter->type());
T* output_data = nullptr;
if (channel_last && compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input, &transformed_input_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input, &transformed_input_channel);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, output,
&transformed_output);
} else {
transformed_input_channel.ShareDataWith(*input);
transformed_output.ShareDataWith(*output);
}
if (compute_format == DataLayout::kNHWC) {
VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
ResizeToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
TransToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
} else {
transformed_filter_channel.ShareDataWith(*filter);
}
output_data = transformed_output.data<T>();
// update padding and dilation
auto in_dims = transformed_input_channel.dims();
auto filter_dims = transformed_filter_channel.dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (compute_format == DataLayout::kNCHW) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims =
phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0);
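// cuDNN only supports padding the same amount on both sides of a dimension.
// When the padding is asymmetric, the input is padded explicitly below so that
// only the symmetric part (padding_common) is passed to cuDNN.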
if (!is_sys_pad) {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2,
0);
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
transformed_input.Resize(new_input_shape);
auto& dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
transformed_input =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
const int rank = transformed_input_channel.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_input.ShareDataWith(transformed_input_channel);
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* input_data = transformed_input.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
// ------------------- cudnn descriptors ---------------------
ConvArgs args{&transformed_input,
&transformed_filter_channel,
&transformed_output,
strides,
padding_common,
dilations,
dtype};
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
: DataLayout::kNCDHW;
}
auto layout_format = GetCudnnTensorFormat(layout);
args.handle = handle;
#ifdef PADDLE_WITH_HIP
// MIOPEN needs to set groups in cdesc in miopen_desc.h
args.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), groups);
#else
args.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn());
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it manually
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount(
args.cdesc.desc(), groups));
groups = 1;
#endif
#ifdef PADDLE_WITH_HIP
// MIOPEN do not set groups in wdesc after set groups in cdesc
groups = 1;
#endif
args.idesc.set(transformed_input, layout_format);
args.wdesc.set(transformed_filter_channel, layout_format, groups);
args.odesc.set(transformed_output, layout_format);
int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == DataLayout::kNHWC) {
GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d,
&o_h, &o_w);
} else {
GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
&o_h, &o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = transformed_filter_channel.numel() / groups;
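// Per-group offsets: when the convolution is run group by group below, each
// cuDNN call is offset into the input, output and filter data by one group.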
// ------------------- cudnn conv workspace ---------------------
size_t workspace_size = 0; // final workspace to allocate.
// ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t algo{};
using search = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search::GetWorkspaceSize(args);
algo = search::Find<T>(args, exhaustive_search, deterministic,
workspace_size, ctx);
#else
cudnnConvolutionFwdAlgo_t algo{};
using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
algo = search::Find<T>(args, exhaustive_search, deterministic, ctx);
workspace_size = search::GetWorkspaceSize(args, algo);
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
// When groups > 1, the algorithm found by SearchAlgorithm is
// CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is
// unstable in forward computation, so change the algorithm to
// CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM manually.
if (ctx.Attr<int>("groups") > 1) {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(0);
}
#endif
// ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
// VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
#ifdef PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
handle, &alpha, args.idesc.desc(), input_data,
args.wdesc.desc(), filter_data, args.cdesc.desc(), algo,
&beta, args.odesc.desc(), output_data, workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionForward(
handle, &alpha, args.idesc.desc(),
input_data + i * group_offset_in, args.wdesc.desc(),
filter_data + i * group_offset_filter, args.cdesc.desc(),
algo, workspace_ptr, workspace_size, &beta,
args.odesc.desc(), output_data + i * group_offset_out));
},
workspace_size);
}
#endif
if (channel_last && compute_format == DataLayout::kNCHW) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_output, output);
}
}
};
template <typename T>
class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
int groups = ctx.Attr<int>("groups");
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") &&
ctx.Attr<bool>("exhaustive_search"));
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
platform::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
// HIP MIOPEN ONLY SUPPORT NCHW format
auto compute_format = DataLayout::kNCHW;
#else
const bool compute_in_nhwc =
dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
auto compute_format =
compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
#endif
VLOG(3) << "Compute ConvGradOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
// transform Tensor
Tensor transformed_input_channel(input->type());
Tensor transformed_output_grad_channel(output_grad->type());
Tensor transformed_input_grad_channel(input->type());
Tensor transformed_filter_channel(filter->type());
Tensor transformed_filter_grad_channel(filter->type());
if (channel_last && compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform input, output_grad, input_grad and tensor from "
"NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input, &transformed_input_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input, &transformed_input_channel);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, output_grad, &transformed_output_grad_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, output_grad, &transformed_output_grad_channel);
if (input_grad) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input_grad, &transformed_input_grad_channel);
// NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy
// the data of input_grad to transformed_input_grad_channel.
if (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto")) {
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input_grad, &transformed_input_grad_channel);
}
}
} else {
transformed_input_channel.ShareDataWith(*input);
transformed_output_grad_channel.ShareDataWith(*output_grad);
if (input_grad) {
transformed_input_grad_channel.ShareDataWith(*input_grad);
}
}
if (compute_format == DataLayout::kNHWC) {
VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC.";
ResizeToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
TransToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
if (filter_grad) {
ResizeToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter_grad, &transformed_filter_grad_channel);
}
} else {
transformed_filter_channel.ShareDataWith(*filter);
if (filter_grad) {
transformed_filter_grad_channel.ShareDataWith(*filter_grad);
}
}
// update paddings
auto in_dims = transformed_input_channel.dims();
auto filter_dims = transformed_filter_channel.dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (compute_format == DataLayout::kNCHW) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims =
phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
// cuDNN only supports padding the same amount on every dimension.
// So we create a new padded input tensor.
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input(input->type());
Tensor transformed_input_grad(input->type());
std::vector<int> padding_common(data_dim, 0);
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
if (!is_sys_pad) {
// get pad
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
transformed_input.Resize(new_input_shape);
transformed_input_grad.Resize(new_input_shape);
auto& dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
transformed_input =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
if (input_grad) {
transformed_input_grad =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
}
// pad for input
const int rank = transformed_input_channel.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_input.ShareDataWith(transformed_input_channel);
if (input_grad) {
transformed_input_grad.ShareDataWith(transformed_input_grad_channel);
}
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* input_data = transformed_input.data<T>();
const T* output_grad_data = transformed_output_grad_channel.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
T* filter_grad_data = nullptr;
T* input_grad_data = nullptr;
T* transformed_input_grad_data = nullptr;
ConvArgs args1{&transformed_input_grad,
&transformed_filter_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype};
ConvArgs args2{&transformed_input,
&transformed_filter_grad_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype};
auto handle = dev_ctx.cudnn_handle();
DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
: DataLayout::kNCDHW;
}
auto layout_tensor = GetCudnnTensorFormat(layout);
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == DataLayout::kNHWC) {
GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n,
&o_c, &o_d, &o_h, &o_w);
} else {
GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n,
&o_c, &o_d, &o_h, &o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = transformed_filter_channel.numel() / groups;
// ------------------- cudnn backward algorithm ---------------------
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t data_algo =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else
cudnnConvolutionBwdDataAlgo_t data_algo =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif
// input data workspace_size
size_t workspace_size_d = 0;
// weight workspace_size
size_t workspace_size_w = 0;
int iwo_groups = groups;
int c_groups = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_groups = 1;
c_groups = groups;
groups = 1;
#endif
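// With cuDNN 7+ (and with MIOPEN) the group count is carried by the
// convolution descriptor (c_groups), so the tensor/filter descriptors use
// iwo_groups = 1 and the per-group loops below run only once.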
if (input_grad) {
// ------------------- cudnn descriptors ---------------------
input_grad_data = input_grad->data<T>();
transformed_input_grad_data = transformed_input_grad.data<T>();
args1.handle = handle;
args1.idesc.set(transformed_input_grad, layout_tensor);
args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
args1.odesc.set(transformed_output_grad_channel, layout_tensor);
args1.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_groups);
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size_d =
std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
workspace_size_d, ctx);
#else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_algo =
search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
workspace_size_d = std::max(workspace_size_d,
search1::GetWorkspaceSize(args1, data_algo));
#endif
}
if (filter_grad) {
// ------------------- cudnn descriptors ---------------------
filter_grad_data = transformed_filter_grad_channel.data<T>();
args2.handle = handle;
args2.idesc.set(transformed_input, layout_tensor);
args2.wdesc.set(transformed_filter_grad_channel, layout_tensor,
iwo_groups);
args2.odesc.set(transformed_output_grad_channel, layout_tensor);
args2.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_groups);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size_w =
std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
workspace_size_w, ctx);
#else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo =
search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
workspace_size_w = std::max(
workspace_size_w, search2::GetWorkspaceSize(args2, filter_algo));
#endif
}
// ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f;
#ifdef PADDLE_WITH_HIP
// MIOPEN ONLY support beta to be 0.0f
ScalingParamType<T> beta = 0.0f;
#else
ScalingParamType<T> beta =
(ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto")) ? 1.0f : 0.0f;
#endif
VLOG(4) << "Conv_grad: use_addto = "
<< (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto"));
if (input_grad) {
// When beta is 0, it is unnecessary to reset input_grad.
// When beta is 1, the output cannot be reset since the addto strategy is used.
#ifdef PADDLE_WITH_HIP
if (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto")) {
Tensor temp_tensor(transformed_input_grad.type());
temp_tensor.Resize(transformed_input_grad.dims());
T* temp_tensor_data = temp_tensor.mutable_data<T>(ctx.GetPlace());
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
handle, &alpha, args1.odesc.desc(), output_grad_data,
args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
data_algo, &beta, args1.idesc.desc(), temp_tensor_data,
cudnn_workspace_ptr, workspace_size_d));
},
workspace_size_d);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor(
handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(),
transformed_input_grad_data, &alpha, args1.idesc.desc(),
temp_tensor_data, &beta, args1.idesc.desc(),
transformed_input_grad_data));
} else {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
handle, &alpha, args1.odesc.desc(), output_grad_data,
args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
data_algo, &beta, args1.idesc.desc(),
transformed_input_grad_data, cudnn_workspace_ptr,
workspace_size_d));
},
workspace_size_d);
}
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, args1.wdesc.desc(),
filter_data + i * group_offset_filter, args1.odesc.desc(),
output_grad_data + i * group_offset_out,
args1.cdesc.desc(), data_algo, cudnn_workspace_ptr,
workspace_size_d, &beta, args1.idesc.desc(),
transformed_input_grad_data + i * group_offset_in));
},
workspace_size_d);
}
#endif
if (!is_sys_pad) {
std::vector<int> starts(transformed_input_channel.dims().size(), 0);
std::vector<int> axes(transformed_input_channel.dims().size(), 0);
for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) {
starts[i] = input_pad[2 * i];
axes[i] = i;
}
transformed_input_grad_channel.mutable_data(ctx.GetPlace());
if (transformed_input_channel.dims().size() == 4) {
RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 4>(
ctx, &transformed_input_grad, &transformed_input_grad_channel,
starts, axes);
} else {
RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 5>(
ctx, &transformed_input_grad, &transformed_input_grad_channel,
starts, axes);
}
}
if (channel_last && compute_format == DataLayout::kNCHW) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_input_grad_channel, input_grad);
}
}
// filter_grad does not use inplace addto.
ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad.
#ifdef PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardWeights(
handle, &alpha, args2.odesc.desc(), output_grad_data,
args2.idesc.desc(), input_data, args2.cdesc.desc(),
filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
cudnn_workspace_ptr, workspace_size_w));
},
workspace_size_w);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, args2.idesc.desc(),
input_data + i * group_offset_in, args2.odesc.desc(),
output_grad_data + i * group_offset_out,
args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
workspace_size_w, &beta_filter, args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter));
},
workspace_size_w);
}
#endif
if (compute_format == DataLayout::kNHWC) {
TransToChannelFirst<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_filter_grad_channel, filter_grad);
}
}
}
};
/*
* Inputs: I, W, dO, ddI, ddW
* Outputs: ddO, dW, dI
* ddo = conv(ddI, W) + conv(I, ddW)
* dW = conv_bp_filter(ddI, dO)
* dI = conv_bp_data(ddW, dO)
*/
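// In the CUDA path below, conv(ddI, W) is written into ddO first (output blend
// factor beta = 0) and conv(I, ddW) is then accumulated on top of it: the
// second cudnnConvolutionForward call passes alpha in the beta position.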
template <typename T>
class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
auto X = ctx.Input<Tensor>("Input");
auto W = ctx.Input<Tensor>("Filter");
auto dO = ctx.Input<Tensor>("DOutput");
auto ddX = ctx.Input<Tensor>("DDInput");
auto ddW = ctx.Input<Tensor>("DDFilter");
auto ddO = ctx.Output<Tensor>("DDOutput");
auto dW = ctx.Output<Tensor>("DFilter");
auto dX = ctx.Output<Tensor>("DInput");
if (ddO) {
ddO->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
set_zero(dev_ctx, ddO, static_cast<T>(0));
}
if (dW) {
dW->mutable_data<T>(ctx.GetPlace());
}
if (dX) {
dX->mutable_data<T>(ctx.GetPlace());
}
// const T* x = X->data<T>();
const T* dy = dO->data<T>();
const T* w = W->data<T>();
const T* ddx = nullptr;
const T* ddw = nullptr;
T *dw, *dx, *ddy;
dw = dx = ddy = nullptr;
T* transformed_dx = nullptr;
const std::vector<int>& strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") &&
ctx.Attr<bool>("exhaustive_search"));
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
platform::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensors to channel first-----------
Tensor transformed_X_channel(X->type());
Tensor transformed_dO_channel(dO->type());
Tensor transformed_ddX_channel(X->type());
Tensor transformed_ddO_channel(dO->type());
Tensor transformed_dX_channel(X->type());
if (channel_last) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, X, &transformed_X_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, X, &transformed_X_channel);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, dO, &transformed_dO_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, dO, &transformed_dO_channel);
if (ddX) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, ddX, &transformed_ddX_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, ddX, &transformed_ddX_channel);
}
if (ddO) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, ddO, &transformed_ddO_channel);
}
if (dX) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, dX, &transformed_dX_channel);
transformed_dX_channel.mutable_data<T>(ctx.GetPlace());
}
} else {
transformed_X_channel = *X;
transformed_dO_channel = *dO;
if (ddX) {
transformed_ddX_channel = *ddX;
}
if (ddO) {
transformed_ddO_channel.ShareDataWith(*ddO);
}
if (dX) {
transformed_dX_channel.ShareDataWith(*dX);
}
}
auto in_dims = transformed_X_channel.dims();
auto filter_dims = W->dims();
framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type());
Tensor transformed_dX(X->type());
std::vector<int> padding_common(data_dim, 0);
std::vector<int> input_pad(X->dims().size() * 2, 0);
if (!is_sys_pad) {
// get pad
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_X_channel.dims()[0];
new_input_shape_vec[1] = transformed_X_channel.dims()[1];
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
new_input_shape_vec[i + 2] =
transformed_X_channel.dims()[i + 2] + padding_diff[i];
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
}
framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
transformed_X.Resize(new_input_shape);
transformed_ddX.Resize(new_input_shape);
transformed_dX.Resize(new_input_shape);
transformed_X =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
if (ddX) {
transformed_ddX =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
}
if (dX) {
transformed_dX =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
}
// pad for input
const int rank = X->dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
case 5: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_X.ShareDataWith(transformed_X_channel);
if (ddX) {
transformed_ddX.ShareDataWith(transformed_ddX_channel);
}
if (dX) {
transformed_dX.ShareDataWith(transformed_dX_channel);
}
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* x = transformed_X.data<T>();
int iwo_group = groups;
int c_group = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_group = 1;
c_group = groups;
groups = 1;
#endif
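// As in the grad kernel, groups are handled through the convolution descriptor
// (c_group), so the descriptors use iwo_group = 1 and the group loops run once.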
auto dtype = platform::CudnnDataType<T>::type;
auto handle = dev_ctx.cudnn_handle();
ConvArgs args1{&transformed_ddX,
W,
&transformed_ddO_channel,
strides,
padding_common,
dilations,
dtype};
ConvArgs args2{
&transformed_X, ddW, &transformed_ddO_channel, strides, padding_common,
dilations, dtype};
ConvArgs args3{&transformed_ddX,
dW,
&transformed_dO_channel,
strides,
padding_common,
dilations,
dtype};
ConvArgs args4{
&transformed_dX, ddW, &transformed_dO_channel, strides, padding_common,
dilations, dtype};
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t fwd_algo1 =
static_cast<miopenConvFwdAlgorithm_t>(0);
miopenConvFwdAlgorithm_t fwd_algo2 =
static_cast<miopenConvFwdAlgorithm_t>(0);
miopenConvBwdDataAlgorithm_t data_algo =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else
cudnnConvolutionFwdAlgo_t fwd_algo1 =
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionFwdAlgo_t fwd_algo2 =
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionBwdDataAlgo_t data_algo =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif
auto layout = GetCudnnTensorFormat(DataLayout::kNCHW);
// ddo = conv(ddI, W) + conv(I, ddW)
size_t workspace_size = 0;
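// A single workspace, sized to the maximum requirement of all the algorithms
// searched above, is reused for every cuDNN call in this kernel.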
T* transformed_ddy_channel = nullptr;
if (ddO) {
ddy = ddO->data<T>();
transformed_ddy_channel = transformed_ddO_channel.data<T>();
if (ddX) {
args1.handle = handle;
args1.idesc.set(transformed_ddX, iwo_group);
args1.wdesc.set(*W, layout, iwo_group);
args1.odesc.set(transformed_ddO_channel, iwo_group);
args1.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1);
fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false,
workspace_size, ctx);
#else
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
#endif
}
if (ddW) {
ddw = ddW->data<T>();
args2.handle = handle;
args2.idesc.set(transformed_X, iwo_group);
args2.wdesc.set(*ddW, layout, iwo_group);
args2.odesc.set(transformed_ddO_channel, iwo_group);
args2.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2));
fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false,
workspace_size, ctx);
#else
using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
workspace_size = std::max(workspace_size,
search2::GetWorkspaceSize(args2, fwd_algo2));
#endif
}
}
if (dW && ddX) {
dw = dW->data<T>();
args3.handle = handle;
args3.idesc.set(transformed_ddX, iwo_group);
args3.wdesc.set(*dW, layout, iwo_group);
args3.odesc.set(transformed_dO_channel, iwo_group);
args3.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size =
std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo = search3::Find<T>(args3, exhaustive_search, deterministic,
workspace_size, ctx);
#else
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo =
search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
workspace_size = std::max(workspace_size,
search3::GetWorkspaceSize(args3, filter_algo));
#endif
}
if (ddW && dX) {
transformed_dx = transformed_dX.data<T>();
args4.handle = handle;
args4.idesc.set(transformed_dX, iwo_group);
args4.wdesc.set(*ddW, layout, iwo_group);
args4.odesc.set(transformed_dO_channel, iwo_group);
args4.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search4 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo = search4::Find<T>(args4, exhaustive_search, deterministic,
workspace_size, ctx);
#else
using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_algo =
search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
#endif
}
int i_n, i_c, i_d, i_h, i_w;
GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h,
&i_w);
int o_n, o_c, o_d, o_h, o_w;
GetNCDHW(transformed_dO_channel.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
&o_h, &o_w);
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = W->numel() / groups;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// 0.0f;
// VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
if (ddO) {
if (ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
handle, &alpha, args1.idesc.desc(), ddx,
args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1,
&beta, args1.odesc.desc(), transformed_ddy_channel,
workspace_ptr, workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionForward(
handle, &alpha, args1.idesc.desc(),
ddx + i * group_offset_in, args1.wdesc.desc(),
w + i * group_offset_filter, args1.cdesc.desc(),
fwd_algo1, workspace_ptr, workspace_size, &beta,
args1.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
#endif
}
if (ddW) {
#ifdef PADDLE_WITH_HIP
// MIOPEN ONLY support beta to be 0.0f
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(),
ddw, args2.cdesc.desc(), fwd_algo2, &beta,
args2.odesc.desc(), transformed_ddy_channel,
workspace_ptr, workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionForward(
handle, &alpha, args2.idesc.desc(),
x + i * group_offset_in, args2.wdesc.desc(),
ddw + i * group_offset_filter, args2.cdesc.desc(),
fwd_algo2, workspace_ptr, workspace_size, &alpha,
args2.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
#endif
}
if (channel_last) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_ddO_channel, ddO);
}
}
T* transformed_dy_channel = transformed_dO_channel.data<T>();
if (dW && ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardWeights(
handle, &alpha, args3.odesc.desc(), transformed_dy_channel,
args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo,
&beta, args3.wdesc.desc(), dw, workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, args3.idesc.desc(),
ddx + i * group_offset_in, args3.odesc.desc(),
transformed_dy_channel + i * group_offset_out,
args3.cdesc.desc(), filter_algo, workspace_ptr,
workspace_size, &beta, args3.wdesc.desc(),
dw + i * group_offset_filter));
},
workspace_size);
}
#endif
}
if (dX && ddW) {
ddw = ddW->data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
handle, &alpha, args4.odesc.desc(), transformed_dy_channel,
args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo,
&beta, args4.idesc.desc(), transformed_dx, workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, args4.wdesc.desc(),
ddw + i * group_offset_filter, args4.odesc.desc(),
transformed_dy_channel + i * group_offset_out,
args4.cdesc.desc(), data_algo, workspace_ptr,
workspace_size, &beta, args4.idesc.desc(),
transformed_dx + i * group_offset_in));
},
workspace_size);
}
#endif
if (!is_sys_pad) {
// reverse padded input
std::vector<int> starts(X->dims().size(), 0);
std::vector<int> axes(X->dims().size(), 0);
for (size_t i = 0; i < X->dims().size(); ++i) {
starts[i] = input_pad[2 * i];
axes[i] = i;
}
if (X->dims().size() == 4) {
RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 4>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
} else {
RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 5>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
}
}
if (channel_last) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_dX_channel, dX);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN does not support double
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(
conv2d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
// ROCm limits the number of threads in depthwise_conv.cu, which will result in accuracy issues.
// Use depthwise_conv2d in MIOPEN to resolve this issue
REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad_grad,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>);
REGISTER_OP_KERNEL(
conv3d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<double>,
paddle::operators::CUDNNConvOpKernel<plat::float16>,
paddle::operators::CUDNNConvOpKernel<plat::bfloat16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>,
paddle::operators::CUDNNConvGradOpKernel<plat::bfloat16>);
REGISTER_OP_KERNEL(
conv2d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad_grad,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);
#else
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<double>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(
conv2d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad_grad,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
#endif
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<double>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>);
REGISTER_OP_KERNEL(
conv3d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
#endif
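// --- Standalone sketch (not part of this diff) of the version gating used in
// the registration block above: bfloat16 kernels are only listed when the
// cuDNN headers report 8.1+. CUDNN_VERSION normally comes from cudnn.h and the
// CUDNN_VERSION_MIN macro from Paddle's cudnn helpers; both are stubbed here
// so the example compiles on its own.
#include <cstdio>
#ifndef CUDNN_VERSION
#define CUDNN_VERSION 8201  // stand-in value for illustration
#endif
#ifndef CUDNN_VERSION_MIN
#define CUDNN_VERSION_MIN(major, minor, patch) \
  (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
#endif
int main() {
#if CUDNN_VERSION_MIN(8, 1, 0)
  std::printf("register conv2d for float, double, float16, bfloat16\n");
#else
  std::printf("register conv2d for float, double, float16\n");
#endif
  return 0;
}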
...@@ -24,6 +24,7 @@ limitations under the License. */ ...@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims, ...@@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims,
} }
template <typename DeviceContext, typename T, size_t D> template <typename DeviceContext, typename T, size_t D>
static void RemovePaddingSlice(const framework::ExecutionContext& context, static void RemovePaddingSlice(const phi::GPUContext& context,
const Tensor* input, Tensor* out, const Tensor* input, Tensor* out,
const std::vector<int>& starts, const std::vector<int>& starts,
const std::vector<int>& axes) { const std::vector<int>& axes) {
auto& place = auto& place = *context.eigen_device();
*context.template device_context<DeviceContext>().eigen_device();
auto in_dims = input->dims(); auto in_dims = input->dims();
auto new_out_dims = out->dims(); auto new_out_dims = out->dims();
auto offsets = Eigen::array<int, D>(); auto offsets = Eigen::array<int, D>();
...@@ -128,11 +128,10 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> { ...@@ -128,11 +128,10 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
template <typename T> template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search, static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, size_t workspace_size, bool deterministic, size_t workspace_size,
const framework::ExecutionContext& ctx) { const phi::GPUContext& ctx) {
algo_t algo; algo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto workspace_handle = ctx.cudnn_workspace_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
int find_count; int find_count;
miopenConvAlgoPerf_t find_result; miopenConvAlgoPerf_t find_result;
...@@ -170,11 +169,10 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> { ...@@ -170,11 +169,10 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
template <typename T> template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search, static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, size_t workspace_size, bool deterministic, size_t workspace_size,
const framework::ExecutionContext& ctx) { const phi::GPUContext& ctx) {
algo_t algo; algo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto workspace_handle = ctx.cudnn_workspace_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
int find_count; int find_count;
miopenConvAlgoPerf_t find_result; miopenConvAlgoPerf_t find_result;
...@@ -212,11 +210,10 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> { ...@@ -212,11 +210,10 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
template <typename T> template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search, static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, size_t workspace_size, bool deterministic, size_t workspace_size,
const framework::ExecutionContext& ctx) { const phi::GPUContext& ctx) {
algo_t algo; algo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto workspace_handle = ctx.cudnn_workspace_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
int find_count; int find_count;
miopenConvAlgoPerf_t find_result; miopenConvAlgoPerf_t find_result;
......
...@@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
paddle::framework::DataTypeToString(input_data_type), paddle::framework::DataTypeToString(input_data_type),
paddle::framework::DataTypeToString(filter_data_type))); paddle::framework::DataTypeToString(filter_data_type)));
} }
#ifndef PADDLE_WITH_ASCEND_CL // #ifndef PADDLE_WITH_ASCEND_CL
if (input_data_type == framework::proto::VarType::FP16) { // if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ( // PADDLE_ENFORCE_EQ(
library, framework::LibraryType::kCUDNN, // library, framework::LibraryType::kCUDNN,
platform::errors::InvalidArgument( // platform::errors::InvalidArgument(
"float16 can only be used when CUDNN or NPU is used")); // "float16 can only be used when CUDNN or NPU is used"));
} // }
#endif // #endif
#if PADDLE_WITH_CUDA #if PADDLE_WITH_CUDA
if (input_data_type == framework::proto::VarType::BF16 && if (input_data_type == framework::proto::VarType::BF16 &&
library == framework::LibraryType::kCUDNN) { library == framework::LibraryType::kCUDNN) {
...@@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ...@@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad,
ops::Conv3DDoubleGradMaker<paddle::imperative::OpBase>); ops::Conv3DDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad);
// depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_grad_grad,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv3d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv3d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv3d_grad_grad,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_VERSION(conv2d) REGISTER_OP_VERSION(conv2d)
.AddCheckpoint( .AddCheckpoint(
R"ROC( R"ROC(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv3d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv3d_grad,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -21,7 +21,6 @@ limitations under the License. */ ...@@ -21,7 +21,6 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/layout_utils.h" #include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
...@@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel { ...@@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override; const framework::ExecutionContext& ctx) const override;
}; };
template <typename DeviceContext, typename T>
class GemmConvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
// The filter will be reshaped in the calculations,
// so a copy is made here
// to avoid modifying the variable in the Scope.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
const int groups = context.Attr<int>("groups");
const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const std::string data_format = context.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
Tensor transformed_input(input->dtype());
Tensor transformed_output(output->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(context, input,
&transformed_input);
TransToChannelFirst<DeviceContext, T>(context, input, &transformed_input);
ResizeToChannelFirst<DeviceContext, T>(context, output,
&transformed_output);
} else {
transformed_input = *input;
transformed_output = *output;
}
// update padding and dilation
auto trans_in_dims = transformed_input.dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims =
phi::slice_ddim(trans_in_dims, 2, trans_in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
auto& dev_ctx = context.template device_context<DeviceContext>();
const int batch_size = static_cast<int>(transformed_input.dims()[0]);
// filter_shape_vec:
// {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
// output_shape_vec:
// {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(
phi::vectorize(transformed_output.dims()));
// use col_shape in the im2col calculation
// col_shape_vec:
// {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w,
// o_d,o_h, o_w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = trans_in_dims[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size:
// (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h *
// o_w)
framework::DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
if (is_expand) {
col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim in_matrix_shape = phi::slice_ddim(
transformed_input.dims(), 1, transformed_input.dims().size());
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
transformed_output.dims()[1],
transformed_output.numel() /
(transformed_output.dims()[0] * transformed_output.dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_output.dims()[1]) / groups;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
for (int i = 0; i < batch_size; i++) {
Tensor in_batch =
transformed_input.Slice(i, i + 1).Resize(in_matrix_shape);
Tensor out_batch =
transformed_output.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx, in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice,
T(0.0));
}
}
if (channel_last) {
TransToChannelLast<DeviceContext, T>(context, &transformed_output,
output);
}
}
};
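// --- Standalone shape walk-through (illustrative numbers only, no Paddle
// dependencies) of the im2col + GEMM scheme in GemmConvKernel above. When the
// filter is 1x1 with stride 1 and no padding, IsExpand is false and im2col is
// skipped: col simply aliases the input slice.
#include <cstdio>
int main() {
  // Input N=1, C_in=4, H=W=5; filter C_out=8, C_in/groups=2, k=3; groups=2,
  // stride=1, pad=0, dilation=1  =>  output 1 x 8 x 3 x 3.
  const int groups = 2, c_in = 4, c_out = 8, k = 3, o = 3;
  // col buffer per group: {C_in/g, k_h, k_w, o_h, o_w} = {2, 3, 3, 3, 3},
  // flattened for GEMM to (C_in/g * k_h * k_w, o_h * o_w) = (18, 9).
  const int col_rows = (c_in / groups) * k * k;  // 18
  const int col_cols = o * o;                    // 9
  // filter slice per group: (C_out/g, C_in/g * k_h * k_w) = (4, 18).
  const int m = c_out / groups, kk = col_rows, n = col_cols;
  std::printf("GEMM per group: (%d x %d) * (%d x %d) -> (%d x %d)\n",
              m, kk, kk, n, m, n);  // (4 x 18) * (18 x 9) -> (4 x 9)
  return 0;
}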
template <typename DeviceContext, typename T>
class GemmConvGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
// The filter and filter_grad will be reshaped in the calculations,
// so copies are made here
// to avoid modifying the variables in the Scope.
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
int groups = context.Attr<int>("groups");
const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const std::string data_format = context.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
Tensor transformed_input(input->dtype());
Tensor transformed_output_grad(output_grad->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(context, input,
&transformed_input);
TransToChannelFirst<DeviceContext, T>(context, input, &transformed_input);
ResizeToChannelFirst<DeviceContext, T>(context, output_grad,
&transformed_output_grad);
TransToChannelFirst<DeviceContext, T>(context, output_grad,
&transformed_output_grad);
} else {
transformed_input = *input;
transformed_output_grad = *output_grad;
}
// update padding and dilation
auto in_dims = transformed_input.dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_input.dims()[0]);
auto& dev_ctx = context.template device_context<DeviceContext>();
// filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
// output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(
phi::vectorize(transformed_output_grad.dims()));
// use col_shape in the im2col calculation
// col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
// o_h, o_w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = transformed_input.dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size: (i_c/g * k_h * k_w, o_h * o_w)
// or
// (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
framework::DDim col_matrix_shape =
phi::flatten_to_2d(col_shape, data_dim + 1);
framework::DDim input_shape = phi::slice_ddim(
transformed_input.dims(), 1, transformed_input.dims().size());
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
transformed_output_grad.dims()[1],
transformed_output_grad.numel() / (transformed_output_grad.dims()[0] *
transformed_output_grad.dims()[1])};
// convolution backward input operator: gemm + col2im(or col2vol)
// convolution backward weight operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_output_grad.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
if (is_expand) {
col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
phi::funcs::SetConstant<DeviceContext, T> set_zero;
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
Tensor transformed_input_grad(input_grad->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(context, input_grad,
&transformed_input_grad);
} else {
transformed_input_grad = *input_grad;
}
// if is_expand is false, the operation of set_zero is unnecessary,
// because math::matmul will reset input_grad.
if (is_expand) {
set_zero(dev_ctx, &transformed_input_grad, static_cast<T>(0));
}
math::Col2VolFunctor<DeviceContext, T> col2vol;
math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_grad_batch =
transformed_input_grad.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
Tensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
Tensor in_grad_slice =
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix.ShareDataWith(in_grad_slice);
col_matrix.Resize(col_matrix_shape);
}
blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0),
&col_matrix, T(0.0));
if (is_expand && data_dim == 2U) {
col2im(dev_ctx, col, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&in_grad_slice);
} else if (is_expand && data_dim == 3U) {
col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice);
}
}
}
if (channel_last) {
TransToChannelLast<DeviceContext, T>(context, &transformed_input_grad,
input_grad);
}
}
if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace());
Tensor filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = transformed_input.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// im2col
Tensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx, in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor filter_grad_slice =
filter_grad_.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0),
&filter_grad_slice, T(1.0));
}
}
}
}
};
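// --- Standalone sketch (illustrative only) of the two GEMMs performed per
// group in GemmConvGradKernel above, reusing the sizes from the forward
// example: m = C_out/g = 4, k = C_in/g * k_h * k_w = 18, n = o_h * o_w = 9.
#include <cstdio>
int main() {
  const int m = 4, k = 18, n = 9;
  // dX path:  filter_slice^T (k x m) * dY_slice (m x n) -> col (k x n),
  //           then col2im scatters col back onto the input window.
  std::printf("dX GEMM: (%d x %d) * (%d x %d) -> (%d x %d), then col2im\n",
              k, m, m, n, k, n);
  // dW path:  dY_slice (m x n) * col^T (n x k) -> dW_slice (m x k),
  //           accumulated over the batch (hence beta = 1.0 in that MatMul).
  std::printf("dW GEMM: (%d x %d) * (%d x %d) -> (%d x %d), accumulated\n",
              m, n, n, k, m, k);
  return 0;
}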
template <typename DeviceContext, typename T>
class GemmConvDoubleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_cpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CPUPlace."));
const Tensor* X = ctx.Input<Tensor>("Input");
const Tensor* dY = ctx.Input<Tensor>("DOutput");
const Tensor* ddX = ctx.Input<Tensor>("DDInput");
const Tensor* ddW_in = ctx.Input<Tensor>("DDFilter");
Tensor* ddY = ctx.Output<Tensor>("DDOutput");
Tensor* dW = ctx.Output<Tensor>("DFilter");
Tensor* dX = ctx.Output<Tensor>("DInput");
Tensor W = GET_DATA_SAFELY(ctx.Input<Tensor>("Filter"), "Input", "Filter",
"GemmConvDoubleGrad");
if (!ddY && !dW && !dX) return;
const int groups = ctx.Attr<int>("groups");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensor
Tensor transformed_X(X->dtype());
Tensor transformed_dY(dY->dtype());
Tensor transformed_ddX(X->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(ctx, X, &transformed_X);
TransToChannelFirst<DeviceContext, T>(ctx, X, &transformed_X);
ResizeToChannelFirst<DeviceContext, T>(ctx, dY, &transformed_dY);
TransToChannelFirst<DeviceContext, T>(ctx, dY, &transformed_dY);
if (ddX) {
ResizeToChannelFirst<DeviceContext, T>(ctx, ddX, &transformed_ddX);
TransToChannelFirst<DeviceContext, T>(ctx, ddX, &transformed_ddX);
}
} else {
transformed_X = *X;
transformed_dY = *dY;
if (ddX) {
transformed_ddX = *ddX;
}
}
// update padding and dilation
auto in_dims = transformed_X.dims();
auto filter_dims = W.dims();
framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_X.dims()[0]);
std::vector<int64_t> filter_shape_vec(phi::vectorize(W.dims()));
std::vector<int64_t> output_shape_vec(
phi::vectorize(transformed_dY.dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
// col_shape [in_channel/group, kh, kw, oh, ow]
col_shape_vec[0] = transformed_X.dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_shape_vec));
// col_matrix_shape [in_channel/group * kh * kw, oh * ow]
framework::DDim col_matrix_shape =
phi::flatten_to_2d(col_shape, data_dim + 1);
// input_shape [Cin, H, W]
framework::DDim input_shape =
phi::slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size());
// filter_matrix_shape [Cout, Cin * kh * kw]
framework::DDim filter_matrix_shape = {W.dims()[0],
W.numel() / W.dims()[0]};
W.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
transformed_dY.dims()[1],
transformed_dY.numel() /
(transformed_dY.dims()[0] * transformed_dY.dims()[1])};
int in_step = static_cast<int>(transformed_X.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_dY.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col = ctx.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
phi::funcs::SetConstant<DeviceContext, T> set_zero;
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
// dx convolution double grad: gemm + col2im(col2vol)
// dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
// oH, oW)
if (dX && ddW_in) {
Tensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
dX->mutable_data<T>(ctx.GetPlace());
Tensor transformed_dX(dX->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(ctx, dX, &transformed_dX);
} else {
transformed_dX = *dX;
}
// if is_expand is false, the operation of set_zero is unnecessary
// because math::matmul will reset dx
if (is_expand) {
set_zero(dev_ctx, &transformed_dX, static_cast<T>(0));
}
math::Col2VolFunctor<DeviceContext, T> col2vol;
math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
for (int i = 0; i < batch_size; i++) {
Tensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix.ShareDataWith(dx_slice);
col_matrix.Resize(col_matrix_shape);
}
blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix,
T(0.0));
if (is_expand && data_dim == 2U) {
col2im(dev_ctx, col, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&dx_slice);
} else if (is_expand && data_dim == 3U) {
col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice);
}
}
}
if (channel_last) {
TransToChannelLast<DeviceContext, T>(ctx, &transformed_dX, dX);
}
}
// dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout,
// oH, oW)
// dw convolution double grad: im2col(vol2col) + gemm
if (dW && ddX) {
dW->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, dW, static_cast<T>(0));
Tensor dW_arr = *dW;
dW_arr.Resize(filter_matrix_shape);
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
Tensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; ++g) {
// im2col
Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx, ddx_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(dy_slice, false, col_matrix, true, T(1.0), &dw_slice,
T(1.0));
}
}
}
// ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W),
// w/ddw(Cout, Cin, kh, kw)
// ddy convolution double grad: im2col(vol2col) + gemm
if (ddY) {
ddY->mutable_data<T>(ctx.GetPlace());
Tensor transformed_ddY(ddY->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(ctx, ddY, &transformed_ddY);
} else {
transformed_ddY = *ddY;
}
set_zero(dev_ctx, &transformed_ddY, static_cast<T>(0));
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
Tensor ddy_batch =
transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; ++g) {
// gemm
Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step);
if (ddX) {
Tensor ddx_batch =
transformed_ddX.Slice(i, i + 1).Resize(input_shape);
Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx, ddx_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice,
T(0.0));
}
if (ddW_in) {
Tensor x_batch = transformed_X.Slice(i, i + 1).Resize(input_shape);
Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
Tensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
if (!is_expand) {
col.ShareDataWith(x_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx, x_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice,
T(1.0));
}
}
}
if (channel_last) {
TransToChannelLast<DeviceContext, T>(ctx, &transformed_ddY, ddY);
}
}
}
};
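// --- Standalone 1-D check (illustrative values, no framework code) of the
// double-grad identity in the comments above: conv(X, W) is bilinear, so its
// forward second-order derivative is ddY = conv(ddX, W) + conv(X, ddW), which
// a finite difference along the (ddX, ddW) direction reproduces.
#include <cstdio>
#include <vector>
// valid 1-D convolution (cross-correlation), stride 1, no padding
static std::vector<double> conv1d(const std::vector<double>& x,
                                  const std::vector<double>& w) {
  std::vector<double> y(x.size() - w.size() + 1, 0.0);
  for (size_t i = 0; i < y.size(); ++i)
    for (size_t j = 0; j < w.size(); ++j) y[i] += x[i + j] * w[j];
  return y;
}
int main() {
  std::vector<double> x = {1, 2, 3, 4}, w = {0.5, -1.0};
  std::vector<double> ddx = {0.1, 0.2, 0.3, 0.4}, ddw = {2.0, 1.0};
  const double eps = 1e-3;
  std::vector<double> xp(x), wp(w);
  for (size_t i = 0; i < x.size(); ++i) xp[i] += eps * ddx[i];
  for (size_t j = 0; j < w.size(); ++j) wp[j] += eps * ddw[j];
  std::vector<double> y0 = conv1d(x, w), y1 = conv1d(xp, wp);
  std::vector<double> lhs = conv1d(ddx, w), rhs = conv1d(x, ddw);
  for (size_t i = 0; i < y0.size(); ++i) {
    double fd = (y1[i] - y0[i]) / eps;   // numeric directional derivative
    double analytic = lhs[i] + rhs[i];   // conv(ddX, W) + conv(X, ddW)
    std::printf("ddy[%zu]: fd=%.4f analytic=%.4f\n", i, fd, analytic);
  }
  return 0;
}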
template <typename DeviceContext, typename T>
class DepthwiseConvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
bool fuse_relu = context.Attr<bool>("fuse_relu_before_depthwise_conv");
const std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const std::string data_format = context.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
if (channel_last) {
PADDLE_ENFORCE_EQ(
output->dims()[output->dims().size() - 1] %
input->dims()[input->dims().size() - 1],
0, platform::errors::InvalidArgument(
"ShapeError: The output channels must be a multiple of the "
"input channels. But receivced output channel number is %d "
"and input channel number is %d",
output->dims()[output->dims().size() - 1],
input->dims()[input->dims().size() - 1]));
} else {
PADDLE_ENFORCE_EQ(
output->dims()[1] % input->dims()[1], 0,
platform::errors::InvalidArgument(
"ShapeError: The output channels must be a multiple of the "
"input channels. But receivced output channel number is %d "
"and input channel number is %d",
output->dims()[1], input->dims()[1]));
}
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_format);
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
if (!is_sys_pad) {
for (size_t i = 0; i < strides.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
auto& dev_ctx = context.template device_context<DeviceContext>();
if (fuse_relu) {
math::DepthwiseConvFunctor<DeviceContext, T, true> depthwiseConv;
depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
output, data_layout);
} else {
math::DepthwiseConvFunctor<DeviceContext, T, false> depthwiseConv;
depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
output, data_layout);
}
}
};
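// --- Standalone illustration (not part of this diff) of the padding
// normalization loop in DepthwiseConvKernel above: when the canonical
// {front, back} pair per spatial dimension is present (4 entries for 2-D),
// every second entry is erased so that one value per dimension remains.
#include <cstdio>
#include <vector>
int main() {
  std::vector<int> strides = {1, 1};
  std::vector<int> paddings = {1, 1, 2, 2};  // {top, bottom, left, right}
  bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
  if (!is_sys_pad) {
    for (size_t i = 0; i < strides.size(); ++i) {
      paddings.erase(paddings.begin() + i + 1);
    }
  }
  // paddings is now {1, 2}: one padding value per spatial dimension.
  std::printf("paddings after erase: {%d, %d}\n", paddings[0], paddings[1]);
  return 0;
}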
template <typename DeviceContext, typename T>
class DepthwiseConvGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
bool fuse_relu = context.Attr<bool>("fuse_relu_before_depthwise_conv");
const std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const std::string data_format = context.Attr<std::string>("data_format");
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_format);
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
if (!is_sys_pad) {
for (size_t i = 0; i < strides.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
phi::funcs::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>();
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
if (fuse_relu) {
math::DepthwiseConvInputGradFunctor<DeviceContext, T, true>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
paddings, dilations, input_grad, data_layout);
} else {
math::DepthwiseConvInputGradFunctor<DeviceContext, T, false>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
paddings, dilations, input_grad, data_layout);
}
}
if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
if (fuse_relu) {
math::DepthwiseConvFilterGradFunctor<DeviceContext, T, true>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
paddings, dilations, filter_grad, data_layout);
} else {
math::DepthwiseConvFilterGradFunctor<DeviceContext, T, false>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
paddings, dilations, filter_grad, data_layout);
}
}
}
};
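// --- Standalone 1-D sketch (illustrative sizes, no framework code) of what
// "depthwise" means in the kernels above: with groups == C_in, channel c of
// the output depends only on channel c of the input and its own filter row.
#include <cstdio>
int main() {
  const int C = 2, W = 5, K = 3, OW = W - K + 1;  // stride 1, no padding
  double x[C][W] = {{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}};
  double f[C][K] = {{1, 0, -1}, {0.5, 0.5, 0.5}};
  double y[C][OW] = {};
  for (int c = 0; c < C; ++c)  // one independent convolution per channel
    for (int ow = 0; ow < OW; ++ow)
      for (int k = 0; k < K; ++k) y[c][ow] += x[c][ow + k] * f[c][k];
  for (int c = 0; c < C; ++c)
    std::printf("y[%d] = {%g, %g, %g}\n", c, y[c][0], y[c][1], y[c][2]);
  return 0;
}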
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -244,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -244,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
algo = search::Find<T>(args, false, deterministic, workspace_size, ctx); algo = search::Find<T>(
args, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
algo = search::Find<T>(args, false, deterministic, ctx); algo = search::Find<T>(
args, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, algo)); std::max(workspace_size, search::GetWorkspaceSize(args, algo));
#endif #endif
...@@ -501,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -501,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>; using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search1::GetWorkspaceSize(args1)); std::max(workspace_size, search1::GetWorkspaceSize(args1));
data_algo = data_algo = search1::Find<T>(
search1::Find<T>(args1, false, deterministic, workspace_size, ctx); args1, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search1::Find<T>(args1, false, deterministic, ctx); data_algo = search1::Find<T>(
args1, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = workspace_size =
std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
#endif #endif
...@@ -523,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -523,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2)); std::max(workspace_size, search2::GetWorkspaceSize(args2));
filter_algo = filter_algo = search2::Find<T>(
search2::Find<T>(args2, false, deterministic, workspace_size, ctx); args2, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search2::Find<T>(args2, false, deterministic, ctx); filter_algo = search2::Find<T>(
args2, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size, workspace_size = std::max(workspace_size,
search2::GetWorkspaceSize(args2, filter_algo)); search2::GetWorkspaceSize(args2, filter_algo));
#endif #endif
...@@ -944,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -944,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1); workspace_size = search1::GetWorkspaceSize(args1);
bwd_algo1 = bwd_algo1 = search1::Find<T>(
search1::Find<T>(args1, false, deterministic, workspace_size, ctx); args1, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx); bwd_algo1 = search1::Find<T>(
args1, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
#endif #endif
} }
...@@ -965,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -965,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2)); std::max(workspace_size, search2::GetWorkspaceSize(args2));
bwd_algo2 = bwd_algo2 = search2::Find<T>(
search2::Find<T>(args2, false, deterministic, workspace_size, ctx); args2, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx); bwd_algo2 = search2::Find<T>(
args2, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size, workspace_size = std::max(workspace_size,
search2::GetWorkspaceSize(args2, bwd_algo2)); search2::GetWorkspaceSize(args2, bwd_algo2));
#endif #endif
...@@ -990,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -990,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search3::GetWorkspaceSize(args3)); std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo = filter_algo = search3::Find<T>(
search3::Find<T>(args3, false, deterministic, workspace_size, ctx); args3, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search3::Find<T>(args3, false, deterministic, ctx); filter_algo = search3::Find<T>(
args3, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size, workspace_size = std::max(workspace_size,
search3::GetWorkspaceSize(args3, filter_algo)); search3::GetWorkspaceSize(args3, filter_algo));
#endif #endif
...@@ -1013,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -1013,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>; using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4)); std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo = data_algo = search4::Find<T>(
search4::Find<T>(args4, false, deterministic, workspace_size, ctx); args4, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search4::Find<T>(args4, false, deterministic, ctx); data_algo = search4::Find<T>(
args4, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
#endif #endif
......
...@@ -13,10 +13,150 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,150 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext; using CUDA = paddle::platform::CUDADeviceContext;
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
PADDLE_ENFORCE_EQ(
groups, filter.dims()[0],
platform::errors::InvalidArgument(
"groups should be error to the 1st dimension of filter. But "
"received groups is %d and filter dimension[0] is %d",
groups, filter.dims()[0]));
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
for (auto v : dilations) {
PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
"dilations should be 1 in depthwise conv. "
"But received dilations is %d",
v));
}
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, output, static_cast<T>(0));
math::DepthwiseConvInputGradFunctor<phi::GPUContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output, filter, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, output, data_layout);
}
};
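// --- Standalone sketch of what the "SAME" branch of UpdatePaddingAndDilation
// (called above) is expected to produce. The real helper lives elsewhere in
// Paddle; this follows the usual SAME-padding formula and is an assumption
// for illustration only.
#include <algorithm>
#include <cstdio>
int main() {
  const int in = 5, k = 3, stride = 2, dilation = 1;
  const int out = (in + stride - 1) / stride;  // ceil(in / stride) = 3
  const int pad_total =
      std::max((out - 1) * stride + (k - 1) * dilation + 1 - in, 0);  // 2
  const int pad_front = pad_total / 2, pad_back = pad_total - pad_front;
  std::printf("SAME padding: front=%d back=%d (output length %d)\n",
              pad_front, pad_back, out);
  return 0;
}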
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
if (input_grad) {
math::DepthwiseConvFunctor<phi::GPUContext, T> depthwiseConv;
depthwiseConv(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output_grad, filter, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, input_grad, data_layout);
}
if (filter_grad) {
phi::funcs::SetConstant<DeviceContext, T> set_zero;
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::DepthwiseConvFilterGradFunctor<phi::GPUContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output_grad, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, filter_grad, data_layout);
}
}
};
} // namespace operators
} // namespace paddle
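// --- Standalone 1-D sketch (illustrative only) of why the transpose kernels
// above call the *input grad* functor of the forward convolution: transposed
// convolution scatters every input element through the filter, which is
// exactly the adjoint of the forward conv.
#include <cstdio>
int main() {
  const int IW = 3, K = 2, OW = IW + K - 1;  // stride 1, no padding
  double in[IW] = {1, 2, 3};
  double w[K] = {10, 1};
  double out[OW] = {};
  for (int i = 0; i < IW; ++i)  // scatter-add: adjoint of the forward conv
    for (int k = 0; k < K; ++k) out[i + k] += in[i] * w[k];
  std::printf("conv_transpose out = {%g, %g, %g, %g}\n",
              out[0], out[1], out[2], out[3]);  // {10, 21, 32, 3}
  return 0;
}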
// conv2d // conv2d
REGISTER_OP_CUDA_KERNEL(conv2d_transpose, REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
ops::GemmConvTransposeKernel<CUDA, float>, ops::GemmConvTransposeKernel<CUDA, float>,
......
...@@ -21,7 +21,6 @@ limitations under the License. */ ...@@ -21,7 +21,6 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
...@@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
} }
}; };
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
PADDLE_ENFORCE_EQ(
groups, filter.dims()[0],
platform::errors::InvalidArgument(
"groups should be error to the 1st dimension of filter. But "
"received groups is %d and filter dimension[0] is %d",
groups, filter.dims()[0]));
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
for (auto v : dilations) {
PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
"dilations should be 1 in depthwise conv. "
"But received dilations is %d",
v));
}
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, output, static_cast<T>(0));
math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(
dev_ctx, *output, filter, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, output, data_layout);
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
if (input_grad) {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
depthwiseConv(
dev_ctx, *output_grad, filter, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, input_grad, data_layout);
}
if (filter_grad) {
phi::funcs::SetConstant<DeviceContext, T> set_zero;
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(
dev_ctx, *output_grad, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, filter_grad, data_layout);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace framework = paddle::framework;
...@@ -29,10 +30,10 @@ namespace platform = paddle::platform;
namespace op = paddle::operators;
using Tensor = paddle::framework::Tensor;
USE_OP(conv2d);
USE_OP_ITSELF(conv2d);
USE_OP(conv2d_grad);
USE_OP_ITSELF(conv2d_grad);
USE_OP_DEVICE_KERNEL(conv2d, CUDNN);
PD_DECLARE_KERNEL(conv2d, GPUDNN, ALL_LAYOUT);
USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN);
PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT);
template <typename T>
void InitRandomTensor(const std::vector<int64_t> &dims,
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/hostdevice.h"
namespace paddle {
namespace operators {
namespace math {
using DataLayout = framework::DataLayout;
/*
* \brief Compute the depthwise convolution which include
* forward process and backpropagation process
*/
template <typename DeviceContext, typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations, framework::Tensor* output,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <typename DeviceContext, typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvInputGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <typename DeviceContext, typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvFilterGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad,
const DataLayout data_layout = DataLayout::kNCHW);
};
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
...@@ -141,6 +143,116 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
  }
};
template <class T>
class Vol2ColFunctor<phi::CPUContext, T> {
public:
void operator()(const phi::CPUContext& context, const framework::Tensor& vol,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
platform::errors::InvalidArgument(
"The dimension of vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(col->dims().size(), 7,
platform::errors::InvalidArgument(
"The dimension of col should be 7, but received %d.",
col->dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]);
int filter_depth = col->dims()[1];
int filter_height = col->dims()[2];
int filter_width = col->dims()[3];
int output_depth = col->dims()[4];
int output_height = col->dims()[5];
int output_width = col->dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
// changed
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(
input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
const T* vol_data = vol.data<T>();
T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int c_in = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
int col_idx =
((c * output_depth + d) * output_height + h) * output_width + w;
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
c_in;
}
col_data[col_idx] =
(h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
? static_cast<T>(0)
: vol_data[vol_idx];
}
}
}
}
}
};
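The three PADDLE_ENFORCE_EQ checks above all rely on the usual convolution output-extent formula. A small standalone sketch (not part of the diff) of that arithmetic for one spatial dimension:

// Standalone sketch: the output-extent formula behind the input/output
// consistency checks in Vol2ColFunctor and Col2VolFunctor.
#include <cassert>

int ConvOutputSize(int input, int pad_before, int pad_after, int dilation,
                   int filter, int stride) {
  // effective filter extent once dilation is applied
  int dkernel = dilation * (filter - 1) + 1;
  return (input + pad_before + pad_after - dkernel) / stride + 1;
}

int main() {
  // depth 5, symmetric padding 1, no dilation, filter 3, stride 1 -> 5
  assert(ConvOutputSize(5, 1, 1, /*dilation=*/1, /*filter=*/3, /*stride=*/1) == 5);
  // stride 2 roughly halves the extent: (5 + 2 - 3) / 2 + 1 = 3
  assert(ConvOutputSize(5, 1, 1, 1, 3, 2) == 3);
  return 0;
}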
/*
 * vol = [input_channels, input_depth, input_height, input_width]
 * col =
...@@ -258,10 +370,125 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
  }
};
template <class T>
class Col2VolFunctor<phi::CPUContext, T> {
public:
void operator()(const phi::CPUContext& context, const framework::Tensor& col,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
platform::errors::InvalidArgument(
"The dimension of vol should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(col.dims().size(), 7,
platform::errors::InvalidArgument(
"The dimension of col should be 7, but received %d.",
col.dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol->dims()[3] : vol->dims()[2]);
int filter_depth = col.dims()[1];
int filter_height = col.dims()[2];
int filter_width = col.dims()[3];
int output_depth = col.dims()[4];
int output_height = col.dims()[5];
int output_width = col.dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(
input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
T* vol_data = vol->data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int cIm = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx =
((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
cIm;
}
int col_idx =
((c * output_depth + d) * output_height + h) * output_width +
w;
vol_data[vol_idx] += col_data[col_idx];
}
}
}
}
}
}
};
template class Vol2ColFunctor<platform::CPUDeviceContext, float>;
template class Vol2ColFunctor<platform::CPUDeviceContext, double>;
template class Vol2ColFunctor<phi::CPUContext, float>;
template class Vol2ColFunctor<phi::CPUContext, double>;
template class Col2VolFunctor<platform::CPUDeviceContext, float>;
template class Col2VolFunctor<platform::CPUDeviceContext, double>;
template class Col2VolFunctor<phi::CPUContext, float>;
template class Col2VolFunctor<phi::CPUContext, double>;
} // namespace math
} // namespace operators
...
...@@ -33,7 +33,7 @@ USE_OP(relu);
USE_OP_DEVICE_KERNEL(relu, MKLDNN);
USE_OP_ITSELF(softmax);
USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
USE_OP(conv2d);
USE_OP_ITSELF(conv2d);
USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);
namespace paddle {
...@@ -55,7 +55,7 @@ class CacheTester {
    onednn_dev_ctx_->ResetBlobMap(nullptr);
  }
  bool Analyze(unsigned short int num_entries) {
  bool Analyze(uint16_t num_entries) {
    // Number of created objects in cache should be as expected (num_entries)
    return onednn_dev_ctx_->GetCachedObjectsNumber() == num_entries;
  }
...
...@@ -18,6 +18,7 @@ limitations under the License. */
#include <string>
#include <tuple>
#include "paddle/phi/common/place.h"
#include "paddle/utils/any.h"
#include "paddle/utils/flat_hash_map.h"
#include "paddle/utils/small_vector.h"
...
...@@ -10,7 +10,7 @@ add_subdirectory(funcs)
set_property(GLOBAL PROPERTY PHI_KERNELS "")
set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor softmax)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax)
# remove this dep after removing fluid deps on tensor creation
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ConvGradGradKernel(const Context& dev_ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad);
template <typename T, typename Context>
void Conv3DGradGradKernel(const Context& dev_ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad);
template <typename T, typename Context>
void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad);
template <typename T, typename Context>
void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* input_grad,
DenseTensor* filter_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out);
template <typename T, typename Context>
void Conv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out);
template <typename T, typename Context>
void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* out);
} // namespace phi
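The declarations above all follow the same phi kernel signature convention: the device context comes first, then the dense-tensor inputs, then the attributes, and finally the output pointers. A hedged sketch of a declaration written in that convention (the kernel name and namespace below are illustrative, not part of this PR):

// Hypothetical declaration following the same ordering as the conv kernels
// above; "MyConvLikeKernel" and "phi_sketch" are placeholders.
#include <string>
#include <vector>

namespace phi_sketch {

class DenseTensor;  // stand-in for phi::DenseTensor

template <typename T, typename Context>
void MyConvLikeKernel(const Context& dev_ctx,           // context first
                      const DenseTensor& input,         // tensor inputs
                      const DenseTensor& filter,
                      const std::vector<int>& strides,  // attributes
                      const std::string& data_format,
                      DenseTensor* out);                // outputs last

}  // namespace phi_sketch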
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void Conv3DGradGradKernel(const Context& ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradGradKernel<T>(ctx,
input_grad_grad,
filter_grad_grad,
out_grad,
input,
filter,
strides,
paddings_t,
padding_algorithm,
groups,
dilations_t,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search_t,
out_grad_grad,
input_grad,
filter_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
}
PD_REGISTER_KERNEL(conv3d_grad_grad,
CPU,
ALL_LAYOUT,
phi::Conv3DGradGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
paddding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
template <typename T, typename Context>
void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
paddding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_grad, CPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}
PD_REGISTER_KERNEL(depthwise_conv2d_grad,
CPU,
ALL_LAYOUT,
phi::DepthwiseConvGradKernel,
float,
double) {}
PD_REGISTER_KERNEL(
conv3d_grad, CPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
template <typename T, typename Context>
void Conv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
} // namespace phi
PD_REGISTER_KERNEL(conv2d, CPU, ALL_LAYOUT, phi::ConvKernel, float, double) {}
PD_REGISTER_KERNEL(depthwise_conv2d,
CPU,
ALL_LAYOUT,
phi::DepthwiseConvKernel,
float,
double) {}
PD_REGISTER_KERNEL(conv3d, CPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {}
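With these registrations, code that previously pulled in the fluid conv kernels via USE_OP now combines USE_OP_ITSELF with PD_DECLARE_KERNEL, as the cudnn_norm_conv test earlier in this diff does. A minimal sketch of that pattern (the backend token depends on which registration a test actually needs, so treat the CPU variant below as an assumption):

// Sketch of the declaration pattern used elsewhere in this PR; the exact
// headers and backend token a given test requires may differ.
#include "paddle/fluid/framework/op_registry.h"  // USE_OP_ITSELF
#include "paddle/phi/core/kernel_registry.h"     // PD_DECLARE_KERNEL

USE_OP_ITSELF(conv2d);                       // operator definition only
PD_DECLARE_KERNEL(conv2d, CPU, ALL_LAYOUT);  // phi CPU kernel registered above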
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/ddim.h"
namespace phi {
template <typename T = int>
inline void UpdatePaddingAndDilation(std::vector<T>* paddings,
std::vector<T>* dilation,
const std::string padding_algorithm,
const DDim data_dims,
const std::vector<T>& strides,
const std::vector<T>& ksize) {
// set padding size == data_dims.size() * 2
auto data_shape = vectorize<T>(data_dims);
if (static_cast<int>(paddings->size()) == data_dims.size()) {
for (int i = 0; i < data_dims.size(); ++i) {
T copy_pad = *(paddings->begin() + 2 * i);
paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
}
} else {
PADDLE_ENFORCE_EQ(
data_dims.size() * 2,
paddings->size(),
phi::errors::InvalidArgument(
"Attribute padding's size should be the same or twice as the "
"input's dimension. "
"But recieved: padding's size is %d, padding is [%s]; input's "
"dimension is %d, input's shape is [%s].",
paddings->size(),
make_ddim(*paddings),
data_dims.size(),
data_dims));
}
// when padding_algorithm is "VALID" or "SAME"
if (padding_algorithm == "SAME") {
for (int i = 0; i < data_dims.size(); ++i) {
T out_size = (data_dims[i] + strides[i] - 1) / strides[i];
T pad_sum =
std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i],
static_cast<T>(0));
T pad_0 = pad_sum / 2;
T pad_1 = pad_sum - pad_0;
*(paddings->begin() + i * 2) = pad_0;
*(paddings->begin() + i * 2 + 1) = pad_1;
// dilation
*(dilation->begin() + i) = 1;
}
} else if (padding_algorithm == "VALID") {
for (auto it = paddings->begin(); it != paddings->end(); it++) {
*it = 0;
}
}
}
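A small standalone illustration of the "SAME" branch above: the padding is chosen so that output_size == ceil(input_size / stride), and any odd remainder of the total padding goes to the trailing side.

// Standalone sketch of the SAME-padding arithmetic implemented above,
// for a single spatial dimension.
#include <algorithm>
#include <cassert>

void SamePadding(int input, int stride, int ksize, int* pad_0, int* pad_1) {
  int out_size = (input + stride - 1) / stride;  // ceil(input / stride)
  int pad_sum = std::max((out_size - 1) * stride + ksize - input, 0);
  *pad_0 = pad_sum / 2;       // leading side
  *pad_1 = pad_sum - *pad_0;  // trailing side takes the odd remainder
}

int main() {
  int p0 = 0, p1 = 0;
  SamePadding(/*input=*/7, /*stride=*/2, /*ksize=*/3, &p0, &p1);
  // out_size = 4, pad_sum = (4 - 1) * 2 + 3 - 7 = 2 -> one on each side
  assert(p0 == 1 && p1 == 1);
  return 0;
}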
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
if (paddings.size() != strides.size()) {
for (size_t j = 0; j < paddings.size(); ++j) {
padding_0 = padding_0 && (paddings[j] == 0);
}
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
} // namespace phi
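IsExpand above reports whether the im2col/vol2col expansion is needed; it is false only for a 1x1 (or 1x1x1) filter with unit strides, zero paddings, and unit dilations, in which case the convolution is already a plain GEMM. A self-contained re-statement of that check, for illustration only (not the phi function itself):

// Minimal sketch mirroring the decision IsExpand makes.
#include <cassert>
#include <cstdint>
#include <vector>

bool IsExpandSketch(const std::vector<int64_t>& filter_dim,
                    const std::vector<int>& strides,
                    const std::vector<int>& paddings,
                    const std::vector<int>& dilations) {
  bool trivial = true;
  for (size_t j = 0; j < strides.size(); ++j) {
    trivial = trivial && filter_dim[j + 2] == 1 && strides[j] == 1 &&
              paddings[j] == 0 && dilations[j] == 1;
  }
  return !trivial;  // expansion needed unless everything is trivial
}

int main() {
  assert(!IsExpandSketch({64, 3, 1, 1}, {1, 1}, {0, 0}, {1, 1}));  // 1x1 conv
  assert(IsExpandSketch({64, 3, 3, 3}, {1, 1}, {1, 1}, {1, 1}));   // 3x3 conv
  return 0;
}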
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
using Tensor = DenseTensor;
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
VLOG(5) << "Why am I called?";
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 4, 1, 2, 3};
phi::funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 3, 1, 2};
phi::funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
phi::funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 2, 3, 4, 1};
phi::funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 2, 3, 1};
phi::funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
phi::funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
} // namespace phi
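The helpers above reshape and transpose NHWC/NDHWC data into channel-first layout before the math functors run. A standalone sketch of the dimension permutation ResizeToChannelFirst applies in the 2-D case (the actual data movement is done later by TransToChannelFirst via phi::funcs::Transpose):

// Standalone sketch: NHWC dims -> NCHW dims, matching the dim == 2 branch.
#include <cassert>
#include <vector>

std::vector<int> ToChannelFirst4D(const std::vector<int>& nhwc) {
  // {N, H, W, C} -> {N, C, H, W}
  return {nhwc[0], nhwc[3], nhwc[1], nhwc[2]};
}

int main() {
  std::vector<int> nchw = ToChannelFirst4D({8, 32, 32, 3});
  assert(nchw == (std::vector<int>{8, 3, 32, 32}));
  return 0;
}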
...@@ -15,10 +15,10 @@ limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace phi {
namespace funcs {
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(
conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
paddding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_grad, GPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}
PD_REGISTER_KERNEL(
conv3d_grad, GPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void Conv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
} // namespace phi
PD_REGISTER_KERNEL(conv2d, GPU, ALL_LAYOUT, phi::ConvKernel, float, double) {}
PD_REGISTER_KERNEL(conv3d, GPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved.
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...@@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/hostdevice.h"
#ifdef __NVCC__
#include <cub/cub.cuh>
#endif
...@@ -21,7 +25,7 @@ limitations under the License. */
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"
...@@ -30,6 +34,58 @@ namespace paddle {
namespace operators {
namespace math {
using DataLayout = framework::DataLayout;
/*
* \brief Compute the depthwise convolution which include
* forward process and backpropagation process
*/
template <typename DeviceContext,
typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvFunctor {
public:
void operator()(const DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* output,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <typename DeviceContext,
typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvInputGradFunctor {
public:
void operator()(const DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& filter,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <typename DeviceContext,
typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvFilterGradFunctor {
public:
void operator()(const DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad,
const DataLayout data_layout = DataLayout::kNCHW);
};
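For reference, a scalar CPU sketch of the computation these functors implement on the GPU: in a depthwise convolution with filter_multiplier == 1 and unit dilation, every output channel convolves exactly one input channel. This is an illustrative reference only, not the CUDA implementation below.

// Reference sketch (single image, NCHW layout, filter_multiplier == 1,
// dilation == 1). input: [C, H, W], filter: [C, 1, kH, kW].
#include <cassert>
#include <vector>

std::vector<float> DepthwiseConv2DRef(const std::vector<float>& input,
                                      const std::vector<float>& filter,
                                      int C, int H, int W, int kH, int kW,
                                      int stride, int pad) {
  int outH = (H + 2 * pad - kH) / stride + 1;
  int outW = (W + 2 * pad - kW) / stride + 1;
  std::vector<float> out(C * outH * outW, 0.f);
  for (int c = 0; c < C; ++c)
    for (int oh = 0; oh < outH; ++oh)
      for (int ow = 0; ow < outW; ++ow) {
        float s = 0.f;
        for (int fh = 0; fh < kH; ++fh)
          for (int fw = 0; fw < kW; ++fw) {
            int ih = oh * stride - pad + fh;
            int iw = ow * stride - pad + fw;
            if (ih < 0 || ih >= H || iw < 0 || iw >= W) continue;  // padding
            s += input[(c * H + ih) * W + iw] * filter[(c * kH + fh) * kW + fw];
          }
        out[(c * outH + oh) * outW + ow] = s;
      }
  return out;
}

int main() {
  // one 2x2 channel, 2x2 filter of ones, stride 1, no padding -> sum of inputs
  std::vector<float> in = {1, 2, 3, 4};
  std::vector<float> flt = {1, 1, 1, 1};
  auto out = DepthwiseConv2DRef(in, flt, 1, 2, 2, 2, 2, 1, 0);
  assert(out.size() == 1 && out[0] == 10.f);
  return 0;
}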
template <typename T>
static __forceinline__ __device__ T WarpReduceSum(T val, int warp_size) {
  typedef cub::WarpReduce<T> WarpReduce;
...@@ -293,8 +349,12 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC(
  }
}
template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
          DataLayout data_layout, bool fuse_relu_before_conv>
__global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
  int final_filter_multiplier = filter_multiplier;
  int h_stride = stride_height;
...@@ -306,34 +366,88 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
  }
  if (c_filter == -1) {
    if (data_layout != DataLayout::kNHWC) {
      KernelDepthwiseConvNCHW<T, fuse_relu_before_conv>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          final_filter_multiplier, filter_height, filter_width, h_stride,
          w_stride, padding_height, padding_width, dilate_height, dilate_width,
          output_data);
    } else {
      KernelDepthwiseConvNHWC<T, fuse_relu_before_conv>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          final_filter_multiplier, filter_height, filter_width, h_stride,
          w_stride, padding_height, padding_width, dilate_height, dilate_width,
          output_data);
    }
  } else {
    if (data_layout != DataLayout::kNHWC) {
      KernelDepthwiseConvCFilterNCHW<T, c_filter, fuse_relu_before_conv>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          final_filter_multiplier, filter_height, filter_width, h_stride,
          w_stride, padding_height, padding_width, dilate_height, dilate_width,
          output_data);
    } else {
      KernelDepthwiseConvCFilterNHWC<T, c_filter, fuse_relu_before_conv>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          final_filter_multiplier, filter_height, filter_width, h_stride,
          w_stride, padding_height, padding_width, dilate_height, dilate_width,
          output_data);
    }
  }
...@@ -464,7 +578,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC(
  }
}
template <typename T, int c_filter, int c_filter_multiplier,
          bool fuse_relu_before_conv>
__device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW(
    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
...@@ -525,7 +641,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW(
  }
}
template <typename T, int c_filter, int c_filter_multiplier,
          bool fuse_relu_before_conv>
__device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC(
    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
...@@ -595,8 +713,12 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC(
  }
}
template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
          DataLayout data_layout, bool fuse_relu_before_conv>
__global__ void KernelDepthwiseConvInputGradSp(
    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
  int final_filter_multiplier = filter_multiplier;
...@@ -611,36 +733,100 @@ __global__ void KernelDepthwiseConvInputGradSp(
  if (c_filter_multiplier == 0 || c_filter == -1) {
    if (data_layout != DataLayout::kNHWC) {
      KernelDepthwiseConvInputGradNCHW<T, fuse_relu_before_conv>(
          input_data, output_grad_data, filter_data, batch_size,
          output_channels, output_height, output_width, input_channels,
          input_height, input_width, final_filter_multiplier, filter_height,
          filter_width, h_stride, w_stride, padding_height, padding_width,
          dilate_height, dilate_width, input_grad_data);
    } else {
      KernelDepthwiseConvInputGradNHWC<T, fuse_relu_before_conv>(
          input_data, output_grad_data, filter_data, batch_size,
          output_channels, output_height, output_width, input_channels,
          input_height, input_width, final_filter_multiplier, filter_height,
          filter_width, h_stride, w_stride, padding_height, padding_width,
          dilate_height, dilate_width, input_grad_data);
    }
  } else {
    if (data_layout != DataLayout::kNHWC) {
      KernelDepthwiseConvInputGradCFilterNCHW<T, c_filter, c_filter_multiplier,
                                              fuse_relu_before_conv>(
          input_data, output_grad_data, filter_data, batch_size,
          output_channels, output_height, output_width, input_channels,
          input_height, input_width, c_filter_multiplier, filter_height,
          filter_width, c_stride, c_stride, padding_height, padding_width,
          dilate_height, dilate_width, input_grad_data);
    } else {
      KernelDepthwiseConvInputGradCFilterNHWC<T, c_filter, c_filter_multiplier,
                                              fuse_relu_before_conv>(
          input_data, output_grad_data, filter_data, batch_size,
          output_channels, output_height, output_width, input_channels,
          input_height, input_width, c_filter_multiplier, filter_height,
          filter_width, c_stride, c_stride, padding_height, padding_width,
          dilate_height, dilate_width, input_grad_data);
    }
  }
}
@@ -648,13 +834,25 @@ __global__ void KernelDepthwiseConvInputGradSp(
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T, bool fuse_relu_before_conv> template <typename T, bool fuse_relu_before_conv>
__device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW(
const T* output_grad_data, const T* input_data, const int num, const T* output_grad_data,
const int output_channels, const int output_height, const int output_width, const T* input_data,
const int input_channels, const int input_height, const int input_width, const int num,
const int filter_multiplier, const int filter_height, const int output_channels,
const int filter_width, const int stride_height, const int stride_width, const int output_height,
const int padding_height, const int padding_width, const int dilate_height, const int output_width,
const int dilate_width, T* filter_grad_data) { const int input_channels,
const int input_height,
const int input_width,
const int filter_multiplier,
const int filter_height,
const int filter_width,
const int stride_height,
const int stride_width,
const int padding_height,
const int padding_width,
const int dilate_height,
const int dilate_width,
T* filter_grad_data) {
T s = 0; T s = 0;
int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
@@ -697,13 +895,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW(
template <typename T, bool fuse_relu_before_conv> template <typename T, bool fuse_relu_before_conv>
__device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC(
const T* output_grad_data, const T* input_data, const int num, const T* output_grad_data,
const int output_channels, const int output_height, const int output_width, const T* input_data,
const int input_channels, const int input_height, const int input_width, const int num,
const int filter_multiplier, const int filter_height, const int output_channels,
const int filter_width, const int stride_height, const int stride_width, const int output_height,
const int padding_height, const int padding_width, const int dilate_height, const int output_width,
const int dilate_width, T* filter_grad_data) { const int input_channels,
const int input_height,
const int input_width,
const int filter_multiplier,
const int filter_height,
const int filter_width,
const int stride_height,
const int stride_width,
const int padding_height,
const int padding_width,
const int dilate_height,
const int dilate_width,
T* filter_grad_data) {
int bid = blockIdx.z; int bid = blockIdx.z;
int image_h = blockIdx.y; int image_h = blockIdx.y;
int kernel_iw = blockIdx.x % filter_width; int kernel_iw = blockIdx.x % filter_width;
@@ -743,13 +953,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC(
template <typename T, int c_filter, bool fuse_relu_before_conv> template <typename T, int c_filter, bool fuse_relu_before_conv>
__device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC(
const T* output_grad_data, const T* input_data, const int num, const T* output_grad_data,
const int output_channels, const int output_height, const int output_width, const T* input_data,
const int input_channels, const int input_height, const int input_width, const int num,
const int filter_multiplier, const int filter_height, const int output_channels,
const int filter_width, const int stride_height, const int stride_width, const int output_height,
const int padding_height, const int padding_width, const int dilate_height, const int output_width,
const int dilate_width, T* filter_grad_data) { const int input_channels,
const int input_height,
const int input_width,
const int filter_multiplier,
const int filter_height,
const int filter_width,
const int stride_height,
const int stride_width,
const int padding_height,
const int padding_width,
const int dilate_height,
const int dilate_width,
T* filter_grad_data) {
const int bid = blockIdx.z; const int bid = blockIdx.z;
int image_h = blockIdx.x * dilate_height + blockIdx.y; int image_h = blockIdx.x * dilate_height + blockIdx.y;
if (image_h >= output_height) { if (image_h >= output_height) {
@@ -804,16 +1026,31 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC(
} }
} }
template <typename T, int c_filter_multiplier, int c_stride, int c_filter, template <typename T,
DataLayout data_layout, bool fuse_relu_before_conv> int c_filter_multiplier,
__global__ void KernelDepthwiseConvFilterGradSp( int c_stride,
const T* output_grad_data, const T* input_data, const int num, int c_filter,
const int output_channels, const int output_height, const int output_width, DataLayout data_layout,
const int input_channels, const int input_height, const int input_width, bool fuse_relu_before_conv>
const int filter_multiplier, const int filter_height, __global__ void KernelDepthwiseConvFilterGradSp(const T* output_grad_data,
const int filter_width, const int stride_height, const int stride_width, const T* input_data,
const int padding_height, const int padding_width, const int dilate_height, const int num,
const int dilate_width, T* filter_grad_data) { const int output_channels,
const int output_height,
const int output_width,
const int input_channels,
const int input_height,
const int input_width,
const int filter_multiplier,
const int filter_height,
const int filter_width,
const int stride_height,
const int stride_width,
const int padding_height,
const int padding_width,
const int dilate_height,
const int dilate_width,
T* filter_grad_data) {
int final_filter_multiplier = filter_multiplier; int final_filter_multiplier = filter_multiplier;
int h_stride = stride_height; int h_stride = stride_height;
int w_stride = stride_width; int w_stride = stride_width;
@@ -825,34 +1062,91 @@ __global__ void KernelDepthwiseConvFilterGradSp(
if (c_filter_multiplier == 0 || c_filter == -1) { if (c_filter_multiplier == 0 || c_filter == -1) {
if (data_layout != DataLayout::kNHWC) { if (data_layout != DataLayout::kNHWC) {
KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>( KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>(
output_grad_data, input_data, num, output_channels, output_height, output_grad_data,
output_width, input_channels, input_height, input_width, input_data,
final_filter_multiplier, filter_height, filter_width, h_stride, num,
w_stride, padding_height, padding_width, dilate_height, dilate_width, output_channels,
output_height,
output_width,
input_channels,
input_height,
input_width,
final_filter_multiplier,
filter_height,
filter_width,
h_stride,
w_stride,
padding_height,
padding_width,
dilate_height,
dilate_width,
filter_grad_data); filter_grad_data);
} else { } else {
KernelDepthwiseConvFilterGradNHWC<T, fuse_relu_before_conv>( KernelDepthwiseConvFilterGradNHWC<T, fuse_relu_before_conv>(
output_grad_data, input_data, num, output_channels, output_height, output_grad_data,
output_width, input_channels, input_height, input_width, input_data,
final_filter_multiplier, filter_height, filter_width, h_stride, num,
w_stride, padding_height, padding_width, dilate_height, dilate_width, output_channels,
output_height,
output_width,
input_channels,
input_height,
input_width,
final_filter_multiplier,
filter_height,
filter_width,
h_stride,
w_stride,
padding_height,
padding_width,
dilate_height,
dilate_width,
filter_grad_data); filter_grad_data);
} }
} else { } else {
if (data_layout != DataLayout::kNHWC) { if (data_layout != DataLayout::kNHWC) {
KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>( KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>(
output_grad_data, input_data, num, output_channels, output_height, output_grad_data,
output_width, input_channels, input_height, input_width, input_data,
final_filter_multiplier, filter_height, filter_width, h_stride, num,
w_stride, padding_height, padding_width, dilate_height, dilate_width, output_channels,
output_height,
output_width,
input_channels,
input_height,
input_width,
final_filter_multiplier,
filter_height,
filter_width,
h_stride,
w_stride,
padding_height,
padding_width,
dilate_height,
dilate_width,
filter_grad_data); filter_grad_data);
} else { } else {
KernelDepthwiseConvFilterGradCFilterNHWC<T, c_filter, KernelDepthwiseConvFilterGradCFilterNHWC<T,
c_filter,
fuse_relu_before_conv>( fuse_relu_before_conv>(
output_grad_data, input_data, num, output_channels, output_height, output_grad_data,
output_width, input_channels, input_height, input_width, input_data,
final_filter_multiplier, filter_height, filter_width, h_stride, num,
w_stride, padding_height, padding_width, dilate_height, dilate_width, output_channels,
output_height,
output_width,
input_channels,
input_height,
input_width,
final_filter_multiplier,
filter_height,
filter_width,
h_stride,
w_stride,
padding_height,
padding_width,
dilate_height,
dilate_width,
filter_grad_data); filter_grad_data);
} }
} }
@@ -864,15 +1158,15 @@ __global__ void KernelDepthwiseConvFilterGradSp(
* height and width, respectively. * height and width, respectively.
*/ */
template <class T, bool fuse_relu_before_conv> template <class T, bool fuse_relu_before_conv>
class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
 public:
  void operator()(const phi::GPUContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  const std::vector<int>& dilations,
                  framework::Tensor* output,
                  const DataLayout data_layout = DataLayout::kNCHW) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = const int input_channels =
@@ -905,12 +1199,14 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
framework::Tensor filter_hwc; framework::Tensor filter_hwc;
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3], framework::DDim filter_hwc_dims({filter.dims()[2],
filter.dims()[0], filter.dims()[1]}); filter.dims()[3],
filter.dims()[0],
filter.dims()[1]});
filter_hwc.Resize(filter_hwc_dims); filter_hwc.Resize(filter_hwc_dims);
filter_hwc.mutable_data<T>(context.GetPlace()); filter_hwc.mutable_data<T>(context.GetPlace());
std::vector<int> perm_axis({2, 3, 0, 1}); std::vector<int> perm_axis({2, 3, 0, 1});
phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans; phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
trans(context, filter, &filter_hwc, perm_axis); trans(context, filter, &filter_hwc, perm_axis);
filter_data = filter_hwc.data<T>(); filter_data = filter_hwc.data<T>();
} }
@@ -940,7 +1236,8 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
((output_width + dilate_width - 1) / dilate_width) * dilate_width); ((output_width + dilate_width - 1) / dilate_width) * dilate_width);
threads = dim3(std::min(output_channels, thread), blocks, 1); threads = dim3(std::min(output_channels, thread), blocks, 1);
grid = dim3((output_height + dilate_height - 1) / dilate_height, grid = dim3((output_height + dilate_height - 1) / dilate_height,
dilate_height, batch_size); dilate_height,
batch_size);
} }
int filter_multiplier = output_channels / input_channels; int filter_multiplier = output_channels / input_channels;
int nums_output = int nums_output =
@@ -952,37 +1249,73 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
#endif #endif
int grid_size = (nums_output + block_size - 1) / block_size; int grid_size = (nums_output + block_size - 1) / block_size;
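    // The check_case macro below dispatches to a kernel instantiation that is
    // specialized at compile time for the given (filter_multiplier, stride,
    // filter size) combination; a zero multiplier or a filter size of -1
    // selects the generic path inside the kernel.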
#define check_case(c_filter_multiplier, c_stride, c_filter) \ #define check_case(c_filter_multiplier, c_stride, c_filter) \
if (c_filter_multiplier == 0 || \ if (c_filter_multiplier == 0 || \
filter_multiplier == c_filter_multiplier && \ filter_multiplier == c_filter_multiplier && \
stride_height == stride_width && stride_height == c_stride && \ stride_height == stride_width && stride_height == c_stride && \
(ksize_height == ksize_width && ksize_height == c_filter || \ (ksize_height == ksize_width && ksize_height == c_filter || \
c_filter == -1)) { \ c_filter == -1)) { \
if (c_filter == -1) { \ if (c_filter == -1) { \
threads.x = block_size; \ threads.x = block_size; \
grid.x = grid_size; \ grid.x = grid_size; \
threads.y = threads.z = grid.y = grid.z = 1; \ threads.y = threads.z = grid.y = grid.z = 1; \
} \ } \
if (data_layout != DataLayout::kNHWC) { \ if (data_layout != DataLayout::kNHWC) { \
KernelDepthwiseConvSp< \ KernelDepthwiseConvSp< \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ T, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ c_filter_multiplier, \
input_data, filter_data, batch_size, output_channels, output_height, \ c_stride, \
output_width, input_channels, input_height, input_width, \ c_filter, \
filter_multiplier, ksize_height, ksize_width, stride_height, \ DataLayout::kNCHW, \
stride_width, padding_height, padding_width, dilate_height, \ fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
dilate_width, output_data); \ input_data, \
} else { \ filter_data, \
KernelDepthwiseConvSp< \ batch_size, \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ output_channels, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ output_height, \
input_data, filter_data, batch_size, output_channels, output_height, \ output_width, \
output_width, input_channels, input_height, input_width, \ input_channels, \
filter_multiplier, ksize_height, ksize_width, stride_height, \ input_height, \
stride_width, padding_height, padding_width, dilate_height, \ input_width, \
dilate_width, output_data); \ filter_multiplier, \
} \ ksize_height, \
return; \ ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
output_data); \
} else { \
KernelDepthwiseConvSp< \
T, \
c_filter_multiplier, \
c_stride, \
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
input_data, \
filter_data, \
batch_size, \
output_channels, \
output_height, \
output_width, \
input_channels, \
input_height, \
input_width, \
filter_multiplier, \
ksize_height, \
ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
output_data); \
} \
return; \
} }
check_case(1, 1, 3); check_case(1, 1, 3);
check_case(1, 1, 5); check_case(1, 1, 5);
@@ -1004,10 +1337,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
}; };
template <typename T, bool fuse_relu_before_conv> template <typename T, bool fuse_relu_before_conv>
class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
 public:
  void operator()(const phi::GPUContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const framework::Tensor& output_grad,
@@ -1048,12 +1380,14 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
framework::Tensor filter_hwc; framework::Tensor filter_hwc;
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3], framework::DDim filter_hwc_dims({filter.dims()[2],
filter.dims()[0], filter.dims()[1]}); filter.dims()[3],
filter.dims()[0],
filter.dims()[1]});
filter_hwc.Resize(filter_hwc_dims); filter_hwc.Resize(filter_hwc_dims);
filter_hwc.mutable_data<T>(context.GetPlace()); filter_hwc.mutable_data<T>(context.GetPlace());
std::vector<int> perm_axis({2, 3, 0, 1}); std::vector<int> perm_axis({2, 3, 0, 1});
phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans; phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
trans(context, filter, &filter_hwc, perm_axis); trans(context, filter, &filter_hwc, perm_axis);
filter_data = filter_hwc.data<T>(); filter_data = filter_hwc.data<T>();
} }
@@ -1078,7 +1412,8 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
((input_width + dilate_width - 1) / dilate_width) * dilate_width); ((input_width + dilate_width - 1) / dilate_width) * dilate_width);
threads = dim3(std::min(input_channels, thread), blocks, 1); threads = dim3(std::min(input_channels, thread), blocks, 1);
grid = dim3((input_height + dilate_height - 1) / dilate_height, grid = dim3((input_height + dilate_height - 1) / dilate_height,
dilate_height, batch_size); dilate_height,
batch_size);
} }
int filter_multiplier = output_channels / input_channels; int filter_multiplier = output_channels / input_channels;
@@ -1090,22 +1425,60 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
c_filter == -1)) { \ c_filter == -1)) { \
if (data_layout != DataLayout::kNHWC) { \ if (data_layout != DataLayout::kNHWC) { \
KernelDepthwiseConvInputGradSp< \ KernelDepthwiseConvInputGradSp< \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ T, \
c_filter_multiplier, \
c_stride, \
c_filter, \
DataLayout::kNCHW, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
input_data, output_grad_data, filter_data, batch_size, \ input_data, \
output_channels, output_height, output_width, input_channels, \ output_grad_data, \
input_height, input_width, filter_multiplier, ksize_height, \ filter_data, \
ksize_width, stride_height, stride_width, padding_height, \ batch_size, \
padding_width, dilate_height, dilate_width, input_grad_data); \ output_channels, \
output_height, \
output_width, \
input_channels, \
input_height, \
input_width, \
filter_multiplier, \
ksize_height, \
ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
input_grad_data); \
} else { \ } else { \
KernelDepthwiseConvInputGradSp< \ KernelDepthwiseConvInputGradSp< \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ T, \
c_filter_multiplier, \
c_stride, \
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
input_data, output_grad_data, filter_data, batch_size, \ input_data, \
output_channels, output_height, output_width, input_channels, \ output_grad_data, \
input_height, input_width, filter_multiplier, ksize_height, \ filter_data, \
ksize_width, stride_height, stride_width, padding_height, \ batch_size, \
padding_width, dilate_height, dilate_width, input_grad_data); \ output_channels, \
output_height, \
output_width, \
input_channels, \
input_height, \
input_width, \
filter_multiplier, \
ksize_height, \
ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
input_grad_data); \
} \ } \
return; \ return; \
} }
@@ -1129,10 +1502,11 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
}; };
template <typename T, bool fuse_relu_before_conv> template <typename T, bool fuse_relu_before_conv>
class DepthwiseConvFilterGradFunctor<phi::GPUContext,
                                     T,
                                     fuse_relu_before_conv> {
 public:
  void operator()(const phi::GPUContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
@@ -1187,7 +1561,8 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
std::max(block_size / output_channels, 1), std::max(block_size / output_channels, 1),
((output_width + dilate_width - 1) / dilate_width) * dilate_width); ((output_width + dilate_width - 1) / dilate_width) * dilate_width);
grid = dim3((output_height + dilate_height - 1) / dilate_height, grid = dim3((output_height + dilate_height - 1) / dilate_height,
dilate_height, batch_size); dilate_height,
batch_size);
threads = dim3(std::min(output_channels, block_size), blocks, 1); threads = dim3(std::min(output_channels, block_size), blocks, 1);
} }
int filter_multiplier = output_channels / input_channels; int filter_multiplier = output_channels / input_channels;
@@ -1200,22 +1575,41 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
c_filter == -1)) { \ c_filter == -1)) { \
if (data_layout != DataLayout::kNHWC) { \ if (data_layout != DataLayout::kNHWC) { \
KernelDepthwiseConvFilterGradSp< \ KernelDepthwiseConvFilterGradSp< \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ T, \
c_filter_multiplier, \
c_stride, \
c_filter, \
DataLayout::kNCHW, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, input_data, batch_size, output_channels, \ output_grad_data, \
output_height, output_width, input_channels, input_height, \ input_data, \
input_width, filter_multiplier, ksize_height, ksize_width, \ batch_size, \
stride_height, stride_width, padding_height, padding_width, \ output_channels, \
dilate_height, dilate_width, filter_grad_data); \ output_height, \
output_width, \
input_channels, \
input_height, \
input_width, \
filter_multiplier, \
ksize_height, \
ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
filter_grad_data); \
} else { \ } else { \
framework::Tensor filter_grad_hwc; \ framework::Tensor filter_grad_hwc; \
if (c_filter != -1) { \ if (c_filter != -1) { \
framework::DDim filter_grad_hwc_dims( \ framework::DDim filter_grad_hwc_dims({filter_grad->dims()[2], \
{filter_grad->dims()[2], filter_grad->dims()[3], \ filter_grad->dims()[3], \
filter_grad->dims()[0], filter_grad->dims()[1]}); \ filter_grad->dims()[0], \
filter_grad->dims()[1]}); \
filter_grad_hwc.Resize(filter_grad_hwc_dims); \ filter_grad_hwc.Resize(filter_grad_hwc_dims); \
filter_grad_hwc.mutable_data<T>(context.GetPlace()); \ filter_grad_hwc.mutable_data<T>(context.GetPlace()); \
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero; \ phi::funcs::SetConstant<phi::GPUContext, T> set_zero; \
set_zero(context, &filter_grad_hwc, static_cast<T>(0)); \ set_zero(context, &filter_grad_hwc, static_cast<T>(0)); \
filter_grad_data = filter_grad_hwc.data<T>(); \ filter_grad_data = filter_grad_hwc.data<T>(); \
} else { \ } else { \
@@ -1231,16 +1625,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
threads = dim3(std::min(output_channels, block_size), blocks, 1); \ threads = dim3(std::min(output_channels, block_size), blocks, 1); \
} \ } \
KernelDepthwiseConvFilterGradSp< \ KernelDepthwiseConvFilterGradSp< \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ T, \
c_filter_multiplier, \
c_stride, \
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, input_data, batch_size, output_channels, \ output_grad_data, \
output_height, output_width, input_channels, input_height, \ input_data, \
input_width, filter_multiplier, ksize_height, ksize_width, \ batch_size, \
stride_height, stride_width, padding_height, padding_width, \ output_channels, \
dilate_height, dilate_width, filter_grad_data); \ output_height, \
output_width, \
input_channels, \
input_height, \
input_width, \
filter_multiplier, \
ksize_height, \
ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
filter_grad_data); \
if (c_filter != -1) { \ if (c_filter != -1) { \
std::vector<int> perm_axis({2, 3, 0, 1}); \ std::vector<int> perm_axis({2, 3, 0, 1}); \
phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans; \ phi::funcs::TransposeNormal<phi::GPUContext, T> trans; \
trans(context, filter_grad_hwc, filter_grad, perm_axis); \ trans(context, filter_grad_hwc, filter_grad, perm_axis); \
} \ } \
} \ } \
@@ -1263,31 +1675,23 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
} }
}; };
template class DepthwiseConvFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, true>;
}  // namespace math
}  // namespace operators
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
namespace phi {
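// Gradient of depthwise conv2d on GPU, computed with the hand-written
// depthwise CUDA functors from paddle::operators::math (no cuDNN involved).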
template <typename T, typename Context>
void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
const DenseTensor* output_grad = &out_grad;
if (!input_grad && !filter_grad) return;
std::vector<int> strides = strides_t;
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
// update padding and dilation
auto in_dims = input.dims();
auto filter_dims = filter.dims();
DDim in_data_dims;
const paddle::framework::DataLayout data_layout =
paddle::framework::StringToDataLayout(data_format);
if (data_layout != paddle::framework::DataLayout::kNHWC) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
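  // The depthwise functors expect one padding value per spatial dimension, so
  // when the paddings were expanded to (before, after) pairs only the first
  // value of each pair is kept below.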
bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
if (!is_sys_pad) {
for (size_t i = 0; i < strides.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
phi::funcs::SetConstant<Context, T> set_zero;
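  // fuse_relu selects the functor specializations that account for the ReLU
  // fused onto the input of the forward depthwise conv.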
if (input_grad) {
input_grad->mutable_data<T>(dev_ctx.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
if (fuse_relu) {
paddle::operators::math::DepthwiseConvInputGradFunctor<Context, T, true>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx,
input,
filter,
*output_grad,
strides,
paddings,
dilations,
input_grad,
data_layout);
} else {
paddle::operators::math::DepthwiseConvInputGradFunctor<Context, T, false>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx,
input,
filter,
*output_grad,
strides,
paddings,
dilations,
input_grad,
data_layout);
}
}
if (filter_grad) {
filter_grad->mutable_data<T>(dev_ctx.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
if (fuse_relu) {
paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T, true>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx,
input,
*output_grad,
strides,
paddings,
dilations,
filter_grad,
data_layout);
} else {
paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T, false>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx,
input,
*output_grad,
strides,
paddings,
dilations,
filter_grad,
data_layout);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(depthwise_conv2d_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
namespace phi {
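// Forward depthwise conv2d on GPU, computed with the hand-written depthwise
// CUDA functors from paddle::operators::math; fuse_relu picks the variant
// with a ReLU fused onto the input.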
template <typename T, typename Context>
void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* out) {
DenseTensor* output = out;
output->mutable_data<T>(dev_ctx.GetPlace());
const std::vector<int> strides = strides_t;
std::vector<int> dilations = dilations_t;
std::vector<int> paddings = paddings_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
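  // Depthwise convolution requires the output channel count to be a multiple
  // of the input channel count (the channel multiplier).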
if (channel_last) {
PADDLE_ENFORCE_EQ(
output->dims()[output->dims().size() - 1] %
input.dims()[input.dims().size() - 1],
0,
phi::errors::InvalidArgument(
"ShapeError: The output channels must be a multiple of the "
"input channels. But receivced output channel number is %d "
"and input channel number is %d",
output->dims()[output->dims().size() - 1],
input.dims()[input.dims().size() - 1]));
} else {
PADDLE_ENFORCE_EQ(
output->dims()[1] % input.dims()[1],
0,
phi::errors::InvalidArgument(
"ShapeError: The output channels must be a multiple of the "
"input channels. But receivced output channel number is %d "
"and input channel number is %d",
output->dims()[1],
input.dims()[1]));
}
// update padding and dilation
auto in_dims = input.dims();
auto filter_dims = filter.dims();
DDim in_data_dims;
const paddle::framework::DataLayout data_layout =
paddle::framework::StringToDataLayout(data_format);
if (data_layout != paddle::framework::DataLayout::kNHWC) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
if (!is_sys_pad) {
for (size_t i = 0; i < strides.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
if (fuse_relu) {
paddle::operators::math::DepthwiseConvFunctor<Context, T, true>
depthwiseConv;
depthwiseConv(dev_ctx,
input,
filter,
strides,
paddings,
dilations,
output,
data_layout);
} else {
paddle::operators::math::DepthwiseConvFunctor<Context, T, false>
depthwiseConv;
depthwiseConv(dev_ctx,
input,
filter,
strides,
paddings,
dilations,
output,
data_layout);
}
}
} // namespace phi
PD_REGISTER_KERNEL(depthwise_conv2d,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
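// Second-order (double) gradient of convolution, implemented with cuDNN (or
// MIOpen on ROCm); it is also reused by the conv3d and depthwise_conv2d
// double-grad registrations below.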
template <typename T, typename Context>
void ConvCudnnGradGradKernel(
const Context& ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
auto X = &input;
auto W = &filter;
auto dO = &out_grad;
auto ddX = input_grad_grad.get_ptr();
auto ddW = filter_grad_grad.get_ptr();
auto ddO = out_grad_grad;
auto dW = filter_grad;
auto dX = input_grad;
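  // Double-grad relations computed below:
  //   ddO = conv(ddX, W) + conv(X, ddW)   (args1, args2)
  //   dW  = conv_bwd_filter(ddX, dO)      (args3)
  //   dX  = conv_bwd_data(dO, ddW)        (args4)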
if (ddO) {
ddO->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(ctx, ddO, static_cast<T>(0));
}
if (dW) {
dW->mutable_data<T>(ctx.GetPlace());
}
if (dX) {
dX->mutable_data<T>(ctx.GetPlace());
}
// const T* x = X->data<T>();
const T* dy = dO->data<T>();
const T* w = W->data<T>();
const T* ddx = nullptr;
const T* ddw = nullptr;
T *dw, *dx, *ddy;
dw = dx = ddy = nullptr;
T* transformed_dx = nullptr;
std::vector<int> dilations = dilations_t;
bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic,
false,
phi::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
std::vector<int> paddings = paddings_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensors to channel first-----------
DenseTensor transformed_X_channel(X->type());
DenseTensor transformed_dO_channel(dO->type());
DenseTensor transformed_ddX_channel(X->type());
DenseTensor transformed_ddO_channel(dO->type());
DenseTensor transformed_dX_channel(X->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(ctx, X, &transformed_X_channel);
TransToChannelFirst<Context, T>(ctx, X, &transformed_X_channel);
ResizeToChannelFirst<Context, T>(ctx, dO, &transformed_dO_channel);
TransToChannelFirst<Context, T>(ctx, dO, &transformed_dO_channel);
if (ddX) {
ResizeToChannelFirst<Context, T>(ctx, ddX, &transformed_ddX_channel);
TransToChannelFirst<Context, T>(ctx, ddX, &transformed_ddX_channel);
}
if (ddO) {
ResizeToChannelFirst<Context, T>(ctx, ddO, &transformed_ddO_channel);
}
if (dX) {
ResizeToChannelFirst<Context, T>(ctx, dX, &transformed_dX_channel);
transformed_dX_channel.mutable_data<T>(ctx.GetPlace());
}
} else {
transformed_X_channel = *X;
transformed_dO_channel = *dO;
if (ddX) {
transformed_ddX_channel = *ddX;
}
if (ddO) {
transformed_ddO_channel.ShareDataWith(*ddO);
}
if (dX) {
transformed_dX_channel.ShareDataWith(*dX);
}
}
auto in_dims = transformed_X_channel.dims();
auto filter_dims = W->dims();
DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
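  // cuDNN/MIOpen only accept symmetric padding, so asymmetric paddings are
  // handled by explicitly padding the input and passing the common (smaller)
  // value of each pair to the convolution descriptor.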
DenseTensor transformed_X(X->type());
DenseTensor transformed_ddX(X->type());
DenseTensor transformed_dX(X->type());
std::vector<int> padding_common(data_dim, 0);
std::vector<int> input_pad(X->dims().size() * 2, 0);
if (!is_sys_pad) {
// get pad
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_X_channel.dims()[0];
new_input_shape_vec[1] = transformed_X_channel.dims()[1];
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
new_input_shape_vec[i + 2] =
transformed_X_channel.dims()[i + 2] + padding_diff[i];
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
}
DDim new_input_shape(make_ddim(new_input_shape_vec));
transformed_X.Resize(new_input_shape);
transformed_ddX.Resize(new_input_shape);
transformed_dX.Resize(new_input_shape);
transformed_X.mutable_data<T>(ctx.GetPlace());
if (ddX) {
transformed_ddX.mutable_data<T>(ctx.GetPlace());
}
if (dX) {
transformed_dX.mutable_data<T>(ctx.GetPlace());
}
// pad for input
const int rank = X->dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
funcs::PadFunction<Context, T, 4>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
if (ddX) {
funcs::PadFunction<Context, T, 4>(ctx,
input_pad,
transformed_ddX_channel,
pad_value,
&transformed_ddX);
}
} break;
case 5: {
funcs::PadFunction<Context, T, 5>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
if (ddX) {
funcs::PadFunction<Context, T, 5>(ctx,
input_pad,
transformed_ddX_channel,
pad_value,
&transformed_ddX);
}
} break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_X.ShareDataWith(transformed_X_channel);
if (ddX) {
transformed_ddX.ShareDataWith(transformed_ddX_channel);
}
if (dX) {
transformed_dX.ShareDataWith(transformed_dX_channel);
}
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* x = transformed_X.data<T>();
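  // With cuDNN >= 7 (and MIOpen) grouped convolution is expressed through the
  // descriptor group count (c_group); otherwise groups are looped over
  // manually (iwo_group).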
int iwo_group = groups;
int c_group = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_group = 1;
c_group = groups;
groups = 1;
#endif
auto dtype = paddle::platform::CudnnDataType<T>::type;
auto handle = ctx.cudnn_handle();
paddle::operators::ConvArgs args1{&transformed_ddX,
W,
&transformed_ddO_channel,
strides,
padding_common,
dilations,
dtype};
paddle::operators::ConvArgs args2{&transformed_X,
ddW,
&transformed_ddO_channel,
strides,
padding_common,
dilations,
dtype};
paddle::operators::ConvArgs args3{&transformed_ddX,
dW,
&transformed_dO_channel,
strides,
padding_common,
dilations,
dtype};
paddle::operators::ConvArgs args4{&transformed_dX,
ddW,
&transformed_dO_channel,
strides,
padding_common,
dilations,
dtype};
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t fwd_algo1 = static_cast<miopenConvFwdAlgorithm_t>(0);
miopenConvFwdAlgorithm_t fwd_algo2 = static_cast<miopenConvFwdAlgorithm_t>(0);
miopenConvBwdDataAlgorithm_t data_algo =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else
cudnnConvolutionFwdAlgo_t fwd_algo1 =
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionFwdAlgo_t fwd_algo2 =
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionBwdDataAlgo_t data_algo =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif
auto layout = paddle::platform::GetCudnnTensorFormat(
paddle::platform::DataLayout::kNCHW);
// ddo = conv(ddI, W) + conv(I, ddW)
size_t workspace_size = 0;
T* transformed_ddy_channel = nullptr;
if (ddO) {
ddy = ddO->data<T>();
transformed_ddy_channel = transformed_ddO_channel.data<T>();
if (ddX) {
args1.handle = handle;
args1.idesc.set(transformed_ddX, iwo_group);
args1.wdesc.set(*W, layout, iwo_group);
args1.odesc.set(transformed_ddO_channel, iwo_group);
args1.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search1 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1);
fwd_algo1 = search1::Find<T>(
args1, exhaustive_search, false, workspace_size, ctx);
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
#endif
}
if (ddW) {
ddw = ddW->data<T>();
args2.handle = handle;
args2.idesc.set(transformed_X, iwo_group);
args2.wdesc.set(*ddW, layout, iwo_group);
args2.odesc.set(transformed_ddO_channel, iwo_group);
args2.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search2 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2));
fwd_algo2 = search2::Find<T>(
args2, exhaustive_search, false, workspace_size, ctx);
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2));
#endif
}
}
if (dW && ddX) {
dw = dW->data<T>();
args3.handle = handle;
args3.idesc.set(transformed_ddX, iwo_group);
args3.wdesc.set(*dW, layout, iwo_group);
args3.odesc.set(transformed_dO_channel, iwo_group);
args3.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search3 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo = search3::Find<T>(
args3, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo =
search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
workspace_size =
std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo));
#endif
}
if (ddW && dX) {
transformed_dx = transformed_dX.data<T>();
args4.handle = handle;
args4.idesc.set(transformed_dX, iwo_group);
args4.wdesc.set(*ddW, layout, iwo_group);
args4.odesc.set(transformed_dO_channel, iwo_group);
args4.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search4 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo = search4::Find<T>(
args4, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_algo = search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
#endif
}
int i_n, i_c, i_d, i_h, i_w;
GetNCDHW(
transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w);
int o_n, o_c, o_d, o_h, o_w;
GetNCDHW(transformed_dO_channel.dims(),
DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = W->numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
  // NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// 0.0f;
// VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
auto wkspace_handle = ctx.cudnn_workspace_handle();
if (ddO) {
if (ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args1.idesc.desc(),
ddx,
args1.wdesc.desc(),
w,
args1.cdesc.desc(),
fwd_algo1,
&beta,
args1.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
handle,
&alpha,
args1.idesc.desc(),
ddx + i * group_offset_in,
args1.wdesc.desc(),
w + i * group_offset_filter,
args1.cdesc.desc(),
fwd_algo1,
workspace_ptr,
workspace_size,
&beta,
args1.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
#endif
}
if (ddW) {
#ifdef PADDLE_WITH_HIP
      // MIOpen only supports beta == 0.0f
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args2.idesc.desc(),
x,
args2.wdesc.desc(),
ddw,
args2.cdesc.desc(),
fwd_algo2,
&beta,
args2.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
handle,
&alpha,
args2.idesc.desc(),
x + i * group_offset_in,
args2.wdesc.desc(),
ddw + i * group_offset_filter,
args2.cdesc.desc(),
fwd_algo2,
workspace_ptr,
workspace_size,
&alpha,
args2.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
#endif
}
if (channel_last) {
TransToChannelLast<Context, T>(ctx, &transformed_ddO_channel, ddO);
}
}
T* transformed_dy_channel = transformed_dO_channel.data<T>();
if (dW && ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights(
handle,
&alpha,
args3.odesc.desc(),
transformed_dy_channel,
args3.idesc.desc(),
ddx,
args3.cdesc.desc(),
filter_algo,
&beta,
args3.wdesc.desc(),
dw,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args3.idesc.desc(),
ddx + i * group_offset_in,
args3.odesc.desc(),
transformed_dy_channel + i * group_offset_out,
args3.cdesc.desc(),
filter_algo,
workspace_ptr,
workspace_size,
&beta,
args3.wdesc.desc(),
dw + i * group_offset_filter));
},
workspace_size);
}
#endif
}
if (dX && ddW) {
ddw = ddW->data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args4.odesc.desc(),
transformed_dy_channel,
args4.wdesc.desc(),
ddw,
args4.cdesc.desc(),
data_algo,
&beta,
args4.idesc.desc(),
transformed_dx,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args4.wdesc.desc(),
ddw + i * group_offset_filter,
args4.odesc.desc(),
transformed_dy_channel + i * group_offset_out,
args4.cdesc.desc(),
data_algo,
workspace_ptr,
workspace_size,
&beta,
args4.idesc.desc(),
transformed_dx + i * group_offset_in));
},
workspace_size);
}
#endif
if (!is_sys_pad) {
// reverse padded input
std::vector<int> starts(X->dims().size(), 0);
std::vector<int> axes(X->dims().size(), 0);
for (size_t i = 0; i < X->dims().size(); ++i) {
starts[i] = input_pad[2 * i];
axes[i] = i;
}
if (X->dims().size() == 4) {
paddle::operators::RemovePaddingSlice<Context, T, 4>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
} else {
paddle::operators::RemovePaddingSlice<Context, T, 5>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
}
}
if (channel_last) {
TransToChannelLast<Context, T>(ctx, &transformed_dX_channel, dX);
}
}
}
template <typename T, typename Context>
void DepthwiseConvCudnnGradGradKernel(
const Context& ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
bool fuse_relu,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvCudnnGradGradKernel<T>(ctx,
input_grad_grad,
filter_grad_grad,
out_grad,
input,
filter,
strides,
paddings_t,
padding_algorithm,
groups,
dilations_t,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search_t,
out_grad_grad,
input_grad,
filter_grad);
}
template <typename T, typename Context>
void Conv3DCudnnGradGradKernel(
const Context& ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvCudnnGradGradKernel<T>(ctx,
input_grad_grad,
filter_grad_grad,
out_grad,
input,
filter,
strides,
paddings_t,
padding_algorithm,
groups,
dilations_t,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search_t,
out_grad_grad,
input_grad,
filter_grad);
}
} // namespace phi
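// Registration note: bfloat16 kernels require cuDNN >= 8.1, and the
// ROCm/MIOpen build registers only float and float16.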
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(conv2d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvCudnnGradGradKernel,
float,
phi::dtype::float16) {}
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(conv2d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(conv3d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvCudnnGradGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else
PD_REGISTER_KERNEL(conv2d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvCudnnGradGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
namespace phi {
template <typename T, typename Context>
void ConvCudnnGradKernel(const Context& ctx,
const DenseTensor& output_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> dilations = dilations_t;
std::vector<int> strides = strides_t;
std::vector<int> paddings = paddings_t;
bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic,
false,
phi::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
  // HIP MIOPEN only supports the NCHW format
auto compute_format = paddle::platform::DataLayout::kNCHW;
#else
const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx);
auto compute_format = compute_in_nhwc && channel_last
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
#endif
VLOG(3) << "Compute ConvGradOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == paddle::platform::DataLayout::kNHWC ? "NHWC"
: "NCHW");
// transform Tensor
DenseTensor transformed_input_channel(input.type());
DenseTensor transformed_output_grad_channel(output_grad.type());
DenseTensor transformed_input_grad_channel(input.type());
DenseTensor transformed_filter_channel(filter.type());
DenseTensor transformed_filter_grad_channel(filter.type());
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
VLOG(3) << "Transform input, output_grad, input_grad and tensor from "
"NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
TransToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
ResizeToChannelFirst<Context, T>(
ctx, &output_grad, &transformed_output_grad_channel);
TransToChannelFirst<Context, T>(
ctx, &output_grad, &transformed_output_grad_channel);
if (input_grad) {
ResizeToChannelFirst<Context, T>(
ctx, input_grad, &transformed_input_grad_channel);
// NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy
// the data of input_grad to transformed_input_grad_channel.
if (use_addto) {
TransToChannelFirst<Context, T>(
ctx, input_grad, &transformed_input_grad_channel);
}
}
} else {
transformed_input_channel.ShareDataWith(input);
transformed_output_grad_channel.ShareDataWith(output_grad);
if (input_grad) {
transformed_input_grad_channel.ShareDataWith(*input_grad);
}
}
if (compute_format == paddle::platform::DataLayout::kNHWC) {
VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC.";
ResizeToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
TransToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
if (filter_grad) {
ResizeToChannelLast<Context, T>(
ctx, filter_grad, &transformed_filter_grad_channel);
}
} else {
transformed_filter_channel.ShareDataWith(filter);
if (filter_grad) {
transformed_filter_grad_channel.ShareDataWith(*filter_grad);
}
}
// update paddings
auto in_dims = transformed_input_channel.dims();
auto filter_dims = transformed_filter_channel.dims();
DDim in_data_dims;
DDim filter_data_dims;
if (compute_format == paddle::platform::DataLayout::kNCHW) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
// cuDNN only supports padding the same amount on every dimension.
// So we create a new padded input tensor.
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input(input.type());
Tensor transformed_input_grad(input.type());
std::vector<int> padding_common(data_dim, 0);
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
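  // When the two sides of a spatial dimension are padded differently, only the
  // common (symmetric) part is handed to cuDNN via padding_common; the rest is
  // applied explicitly through input_pad. Illustrative values: for
  // paddings = {1, 2} on one dimension, padding_common = 1 and input_pad holds
  // {0, 1} for that dimension.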
if (!is_sys_pad) {
// get pad
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
if (compute_format == paddle::platform::DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
if (compute_format == paddle::platform::DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == paddle::platform::DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
DDim new_input_shape(make_ddim(new_input_shape_vec));
transformed_input.Resize(new_input_shape);
transformed_input.mutable_data<T>(ctx.GetPlace());
transformed_input_grad.Resize(new_input_shape);
if (input_grad) {
transformed_input_grad.mutable_data<T>(ctx.GetPlace());
}
// pad for input
const int rank = transformed_input_channel.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
funcs::PadFunction<Context, T, 4>(ctx,
input_pad,
transformed_input_channel,
pad_value,
&transformed_input);
} break;
case 5: {
funcs::PadFunction<Context, T, 5>(ctx,
input_pad,
transformed_input_channel,
pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_input.ShareDataWith(transformed_input_channel);
if (input_grad) {
transformed_input_grad.ShareDataWith(transformed_input_grad_channel);
}
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* input_data = transformed_input.data<T>();
const T* output_grad_data = transformed_output_grad_channel.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
T* filter_grad_data = nullptr;
T* input_grad_data = nullptr;
T* transformed_input_grad_data = nullptr;
paddle::operators::ConvArgs args1{&transformed_input_grad,
&transformed_filter_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype};
paddle::operators::ConvArgs args2{&transformed_input,
&transformed_filter_grad_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype};
auto handle = ctx.cudnn_handle();
  // TODO(phlrain): replace paddle::platform::DataLayout with phi::DataLayout
paddle::platform::DataLayout layout =
compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNDHWC
: paddle::platform::DataLayout::kNCDHW;
}
auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
auto workspace_handle = ctx.cudnn_workspace_handle();
int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == paddle::platform::DataLayout::kNHWC) {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(),
paddle::platform::DataLayout::kNHWC,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
} else {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(),
paddle::platform::DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = transformed_filter_channel.numel() / groups;
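  // The cuDNN calls below loop over groups explicitly; these offsets select
  // the i-th group's slice of the input, output and filter buffers.
  // Illustrative values: i_c = 8, groups = 2 and a 4x4 feature map give
  // group_offset_in = 8 / 2 * 4 * 4 * 1 = 64 elements per group.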
// ------------------- cudnn backward algorithm ---------------------
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t data_algo =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else
cudnnConvolutionBwdDataAlgo_t data_algo =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif
// input data workspace_size
size_t workspace_size_d = 0;
// weight workspace_size
size_t workspace_size_w = 0;
int iwo_groups = groups;
int c_groups = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_groups = 1;
c_groups = groups;
groups = 1;
#endif
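  // With cuDNN >= 7.0.1 (or HIP), grouped convolution is expressed through the
  // convolution descriptor (c_groups) while the filter descriptor uses
  // iwo_groups = 1, so the per-group loops below run only a single iteration
  // (groups has been reset to 1).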
if (input_grad) {
// ------------------- cudnn descriptors ---------------------
input_grad_data = input_grad->data<T>();
transformed_input_grad_data = transformed_input_grad.data<T>();
args1.handle = handle;
args1.idesc.set(transformed_input_grad, layout_tensor);
args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
args1.odesc.set(transformed_output_grad_channel, layout_tensor);
args1.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search1 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size_d =
std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
data_algo = search1::Find<T>(
args1, exhaustive_search, deterministic, workspace_size_d, ctx);
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_algo = search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
workspace_size_d =
std::max(workspace_size_d, search1::GetWorkspaceSize(args1, data_algo));
#endif
}
if (filter_grad) {
// ------------------- cudnn descriptors ---------------------
filter_grad_data = transformed_filter_grad_channel.data<T>();
args2.handle = handle;
args2.idesc.set(transformed_input, layout_tensor);
args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups);
args2.odesc.set(transformed_output_grad_channel, layout_tensor);
args2.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search2 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size_w =
std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
filter_algo = search2::Find<T>(
args2, exhaustive_search, deterministic, workspace_size_w, ctx);
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo =
search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
workspace_size_w = std::max(workspace_size_w,
search2::GetWorkspaceSize(args2, filter_algo));
#endif
}
// ------------------- cudnn conv backward data ---------------------
paddle::operators::ScalingParamType<T> alpha = 1.0f;
#ifdef PADDLE_WITH_HIP
  // MIOPEN only supports beta = 0.0f
paddle::operators::ScalingParamType<T> beta = 0.0f;
#else
paddle::operators::ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
#endif
VLOG(4) << "Conv_grad: use_addto = " << use_addto;
if (input_grad) {
// When beta is 0, it is unnecessary to reset input_grad.
// When beta is 1, input_grad cannot be reset since the addto strategy is used.
#ifdef PADDLE_WITH_HIP
if (use_addto) {
DenseTensor temp_tensor(transformed_input_grad.type());
temp_tensor.Resize(transformed_input_grad.dims());
T* temp_tensor_data = temp_tensor.mutable_data<T>(ctx.GetPlace());
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args1.odesc.desc(),
output_grad_data,
args1.wdesc.desc(),
filter_data,
args1.cdesc.desc(),
data_algo,
&beta,
args1.idesc.desc(),
temp_tensor_data,
cudnn_workspace_ptr,
workspace_size_d));
},
workspace_size_d);
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor(
handle,
miopenTensorOpAdd,
&alpha,
args1.idesc.desc(),
transformed_input_grad_data,
&alpha,
args1.idesc.desc(),
temp_tensor_data,
&beta,
args1.idesc.desc(),
transformed_input_grad_data));
} else {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args1.odesc.desc(),
output_grad_data,
args1.wdesc.desc(),
filter_data,
args1.cdesc.desc(),
data_algo,
&beta,
args1.idesc.desc(),
transformed_input_grad_data,
cudnn_workspace_ptr,
workspace_size_d));
},
workspace_size_d);
}
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args1.wdesc.desc(),
filter_data + i * group_offset_filter,
args1.odesc.desc(),
output_grad_data + i * group_offset_out,
args1.cdesc.desc(),
data_algo,
cudnn_workspace_ptr,
workspace_size_d,
&beta,
args1.idesc.desc(),
transformed_input_grad_data + i * group_offset_in));
},
workspace_size_d);
}
#endif
if (!is_sys_pad) {
std::vector<int> starts(transformed_input_channel.dims().size(), 0);
std::vector<int> axes(transformed_input_channel.dims().size(), 0);
for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) {
starts[i] = input_pad[2 * i];
axes[i] = i;
}
transformed_input_grad_channel.mutable_data(ctx.GetPlace());
if (transformed_input_channel.dims().size() == 4) {
paddle::operators::RemovePaddingSlice<Context, T, 4>(
ctx,
&transformed_input_grad,
&transformed_input_grad_channel,
starts,
axes);
} else {
paddle::operators::RemovePaddingSlice<Context, T, 5>(
ctx,
&transformed_input_grad,
&transformed_input_grad_channel,
starts,
axes);
}
}
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
TransToChannelLast<Context, T>(
ctx, &transformed_input_grad_channel, input_grad);
}
}
// filter_grad do not use inplace addto.
paddle::operators::ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad.
#ifdef PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights(
handle,
&alpha,
args2.odesc.desc(),
output_grad_data,
args2.idesc.desc(),
input_data,
args2.cdesc.desc(),
filter_algo,
&beta,
args2.wdesc.desc(),
filter_grad_data,
cudnn_workspace_ptr,
workspace_size_w));
},
workspace_size_w);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args2.idesc.desc(),
input_data + i * group_offset_in,
args2.odesc.desc(),
output_grad_data + i * group_offset_out,
args2.cdesc.desc(),
filter_algo,
cudnn_workspace_ptr,
workspace_size_w,
&beta_filter,
args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter));
},
workspace_size_w);
}
#endif
if (compute_format == paddle::platform::DataLayout::kNHWC) {
TransToChannelFirst<Context, T>(
ctx, &transformed_filter_grad_channel, filter_grad);
}
}
}
template <typename T, typename Context>
void Conv3DCudnnGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
                           const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvCudnnGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
                         padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(conv2d_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradKernel,
float,
phi::dtype::float16) {}
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(conv2d_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(conv3d_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else
PD_REGISTER_KERNEL(conv2d_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
namespace phi {
template <typename T, typename Context>
void ConvCudnnKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* output) {
output->mutable_data<T>(ctx.GetPlace());
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic,
false,
phi::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
  // HIP MIOPEN only supports the NCHW format
auto compute_format = paddle::platform::DataLayout::kNCHW;
#else
  // Tensor Cores, introduced with Volta GPUs, support faster conv ops
  // with FP16 in the NHWC data format.
const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx);
// We will only do data format conversion from NHWC to NCHW.
// cudnn will convert NCHW to NHWC automatically on Tensor Core.
auto compute_format = compute_in_nhwc && channel_last
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
#endif
VLOG(3) << "Compute ConvOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == paddle::platform::DataLayout::kNHWC ? "NHWC"
: "NCHW");
// ------------ transformed tensor -----------
DenseTensor transformed_input_channel(input.type());
DenseTensor transformed_output(output->type());
DenseTensor transformed_filter_channel(filter.type());
T* output_data = nullptr;
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
TransToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
ResizeToChannelFirst<Context, T>(ctx, output, &transformed_output);
} else {
transformed_input_channel.ShareDataWith(input);
transformed_output.ShareDataWith(*output);
}
if (compute_format == paddle::platform::DataLayout::kNHWC) {
VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
ResizeToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
TransToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
} else {
transformed_filter_channel.ShareDataWith(filter);
}
output_data = transformed_output.data<T>();
// update padding and dilation
auto in_dims = transformed_input_channel.dims();
auto filter_dims = transformed_filter_channel.dims();
DDim in_data_dims;
DDim filter_data_dims;
if (compute_format == paddle::platform::DataLayout::kNCHW) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
DenseTensor transformed_input;
std::vector<int> padding_common(data_dim, 0);
if (!is_sys_pad) {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
if (compute_format == paddle::platform::DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
if (compute_format == paddle::platform::DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == paddle::platform::DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
DDim new_input_shape(make_ddim(new_input_shape_vec));
transformed_input.Resize(new_input_shape);
transformed_input.mutable_data<T>(ctx.GetPlace());
const int rank = transformed_input_channel.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
funcs::PadFunction<Context, T, 4>(ctx,
input_pad,
transformed_input_channel,
pad_value,
&transformed_input);
} break;
case 5: {
funcs::PadFunction<Context, T, 5>(ctx,
input_pad,
transformed_input_channel,
pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_input.ShareDataWith(transformed_input_channel);
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* input_data = transformed_input.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
// ------------------- cudnn descriptors ---------------------
paddle::operators::ConvArgs args{&transformed_input,
&transformed_filter_channel,
&transformed_output,
strides,
padding_common,
dilations,
dtype};
auto handle = ctx.cudnn_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
paddle::platform::DataLayout layout =
compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNDHWC
: paddle::platform::DataLayout::kNCDHW;
}
auto layout_format = paddle::platform::GetCudnnTensorFormat(layout);
args.handle = handle;
#ifdef PADDLE_WITH_HIP
  // MIOPEN needs groups to be set in cdesc (see miopen_desc.h)
args.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
groups);
#else
args.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn());
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
  // cuDNN 7+ supports groups natively, so there is no need to handle them
  // manually.
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetConvolutionGroupCount(
args.cdesc.desc(), groups));
groups = 1;
#endif
#ifdef PADDLE_WITH_HIP
  // MIOPEN: do not set groups in wdesc after groups have been set in cdesc
groups = 1;
#endif
args.idesc.set(transformed_input, layout_format);
args.wdesc.set(transformed_filter_channel, layout_format, groups);
args.odesc.set(transformed_output, layout_format);
int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == paddle::platform::DataLayout::kNHWC) {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNHWC,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
} else {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = transformed_filter_channel.numel() / groups;
// ------------------- cudnn conv workspace ---------------------
size_t workspace_size = 0; // final workspace to allocate.
// ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t algo{};
using search = paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search::GetWorkspaceSize(args);
algo = search::Find<T>(
args, exhaustive_search, deterministic, workspace_size, ctx);
#else
cudnnConvolutionFwdAlgo_t algo{};
using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
algo = search::Find<T>(args, exhaustive_search, deterministic, ctx);
workspace_size = search::GetWorkspaceSize(args, algo);
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
  // When groups > 1, SearchAlgorithm may pick
  // CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, but that algorithm is
  // unstable in forward computation, so fall back to
  // CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM manually.
if (groups > 1) {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(0);
}
#endif
// ------------------- cudnn conv forward ---------------------
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
  // NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
// VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
#ifdef PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args.idesc.desc(),
input_data,
args.wdesc.desc(),
filter_data,
args.cdesc.desc(),
algo,
&beta,
args.odesc.desc(),
output_data,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
handle,
&alpha,
args.idesc.desc(),
input_data + i * group_offset_in,
args.wdesc.desc(),
filter_data + i * group_offset_filter,
args.cdesc.desc(),
algo,
workspace_ptr,
workspace_size,
&beta,
args.odesc.desc(),
output_data + i * group_offset_out));
},
workspace_size);
}
#endif
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
TransToChannelLast<Context, T>(ctx, &transformed_output, output);
}
}
template <typename T, typename Context>
void Conv3DCudnnKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvCudnnKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(conv2d,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnKernel,
float,
phi::dtype::float16) {}
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(conv2d,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(conv3d,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else
PD_REGISTER_KERNEL(conv2d,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnKernel,
float,
double,
phi::dtype::float16) {}
#endif
#endif
// TODO: register bfloat16
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
DECLARE_bool(cudnn_deterministic);
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
namespace phi {
static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) {
return dev_ctx.GetComputeCapability() >= 70;
}
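// Volta (compute capability 7.0) and newer GPUs provide Tensor Cores; the
// FP16 NHWC compute path in the cuDNN kernels is enabled only when this
// returns true.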
// inline cudnnTensorFormat_t GetCudnnTensorFormat(
// const phi::DataLayout& order) { // Not use
// switch (order) {
// case phi::DataLayout::kNHWC:
// return CUDNN_TENSOR_NHWC;
// case phi::DataLayout::kNCHW:
// return CUDNN_TENSOR_NCHW;
// case phi::DataLayout::NCDHW:
// return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same
// case phi::DataLayout::NDHWC:
// return CUDNN_TENSOR_NHWC; // add, liyamei
// default:
// PADDLE_THROW(phi::errors::Unimplemented(
// "CUDNN has no equivalent dataLayout for input order."));
// }
// return CUDNN_TENSOR_NCHW;
// }
static inline void GetNCDHW(const DDim& dims,
const phi::DataLayout& layout,
int* N,
int* C,
int* D,
int* H,
int* W) {
*N = dims[0];
*C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
int i = layout == phi::DataLayout::kNCHW ? 0 : 1;
if (dims.size() == 5) {
*D = dims[2 - i];
*H = dims[3 - i];
*W = dims[4 - i];
} else {
*D = 1;
*H = dims[2 - i];
*W = dims[3 - i];
}
}
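// Illustrative example: for an NHWC tensor with dims {8, 32, 32, 16} this
// yields N = 8, C = 16, D = 1, H = 32, W = 32; a 5-D NDHWC tensor is handled
// the same way, with D taken from dims[1].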
} // namespace phi
// PD_REGISTER_KERNEL(convdnn, GPU, ALL_LAYOUT, phi::ConvKernel, float, double
// ) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void ConvGradGradKernel(const Context& dev_ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
const DenseTensor* X = &input;
const DenseTensor* dY = &out_grad;
const DenseTensor* ddX = input_grad_grad.get_ptr();
const DenseTensor* ddW_in = filter_grad_grad.get_ptr();
DenseTensor* ddY = out_grad_grad;
DenseTensor* dW = filter_grad;
DenseTensor* dX = input_grad;
DenseTensor W = filter;
if (!ddY && !dW && !dX) return;
const std::vector<int> strides = strides_t;
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensor
DenseTensor transformed_X(X->type());
DenseTensor transformed_dY(dY->type());
DenseTensor transformed_ddX(X->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
TransToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
ResizeToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
TransToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
if (ddX) {
ResizeToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
TransToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
}
} else {
transformed_X = *X;
transformed_dY = *dY;
if (ddX) {
transformed_ddX = *ddX;
}
}
// update padding and dilation
auto in_dims = transformed_X.dims();
auto filter_dims = W.dims();
DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_X.dims()[0]);
std::vector<int64_t> filter_shape_vec(vectorize(W.dims()));
std::vector<int64_t> output_shape_vec(vectorize(transformed_dY.dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
// col_shape [in_channel/group, kh, kw, oh, ow]
col_shape_vec[0] = transformed_X.dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
}
DDim col_shape(make_ddim(col_shape_vec));
// col_matrix_shape [in_channel/group * kh * kw, oh * ow]
DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
// input_shape [Cin, H, W]
DDim input_shape =
slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size());
// filter_matrix_shape [Cout, Cin * kh * kw]
DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]};
W.Resize(filter_matrix_shape);
DDim output_matrix_shape = {
transformed_dY.dims()[1],
transformed_dY.numel() /
(transformed_dY.dims()[0] * transformed_dY.dims()[1])};
int in_step = static_cast<int>(transformed_X.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_dY.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
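  // When is_expand is false (which is expected only for a 1x1 filter with unit
  // strides/dilations and zero paddings), im2col reduces to the identity, so
  // col/col_matrix below simply share data with the corresponding slices
  // instead of being materialized.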
DenseTensor col;
DenseTensor col_matrix;
if (is_expand) {
col.Resize(col_shape);
col.mutable_data<T>(dev_ctx.GetPlace());
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
phi::funcs::SetConstant<Context, T> set_zero;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
// dx convolution double grad: gemm + col2im(col2vol)
// dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
// oH, oW)
if (dX && ddW_in) {
Tensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
dX->mutable_data<T>(dev_ctx.GetPlace());
DenseTensor transformed_dX(dX->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, dX, &transformed_dX);
} else {
transformed_dX = *dX;
}
// if is_expand is false, the operation of set_zero is unnecessary
// because math::matmul will reset dx
if (is_expand) {
set_zero(dev_ctx, &transformed_dX, static_cast<T>(0));
}
paddle::operators::math::Col2VolFunctor<Context, T> col2vol;
paddle::operators::math::
Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
col2im;
for (int i = 0; i < batch_size; i++) {
DenseTensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix.ShareDataWith(dx_slice);
col_matrix.Resize(col_matrix_shape);
}
blas.MatMul(
ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0));
if (is_expand && data_dim == 2U) {
col2im(dev_ctx,
col,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&dx_slice);
} else if (is_expand && data_dim == 3U) {
col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice);
}
}
}
if (channel_last) {
TransToChannelLast<Context, T>(dev_ctx, &transformed_dX, dX);
}
}
// dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout,
// oH, oW)
// dw convolution double grad: im2col(vol2col) + gemm
if (dW && ddX) {
dW->mutable_data<T>(dev_ctx.GetPlace());
set_zero(dev_ctx, dW, static_cast<T>(0));
DenseTensor dW_arr = *dW;
dW_arr.Resize(filter_matrix_shape);
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
DenseTensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; ++g) {
// im2col
DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
ddx_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step);
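        // MatMul is called with beta = 1.0 so dw_slice accumulates
        // contributions across the batch (dW was zero-initialized above).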
blas.MatMul(
dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0));
}
}
}
// ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W),
// w/ddw(Cout, Cin, kh, kw)
// ddy convolution double grad: im2col(vol2col) + gemm
if (ddY) {
ddY->mutable_data<T>(dev_ctx.GetPlace());
DenseTensor transformed_ddY(ddY->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, ddY, &transformed_ddY);
} else {
transformed_ddY = *ddY;
}
set_zero(dev_ctx, &transformed_ddY, static_cast<T>(0));
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
DenseTensor ddy_batch =
transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; ++g) {
// gemm
DenseTensor ddy_slice =
ddy_batch.Slice(g * out_step, (g + 1) * out_step);
if (ddX) {
DenseTensor ddx_batch =
transformed_ddX.Slice(i, i + 1).Resize(input_shape);
DenseTensor ddx_slice =
ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
ddx_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0));
}
if (ddW_in) {
DenseTensor x_batch =
transformed_X.Slice(i, i + 1).Resize(input_shape);
DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
DenseTensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
if (!is_expand) {
col.ShareDataWith(x_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
x_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col);
}
// gemm
DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0));
}
}
}
if (channel_last) {
TransToChannelLast<Context, T>(dev_ctx, &transformed_ddY, ddY);
}
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void ConvGradKernel(const Context& dev_ctx,
const DenseTensor& output_grad,
const DenseTensor& input,
const DenseTensor& filter_t,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
// The filter and filter_grad will be reshaped in the calculations,
  // so an assignment is used here,
  // which avoids modifying the variable in the Scope.
if (!input_grad && !filter_grad) return;
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
DenseTensor filter = filter_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
DenseTensor transformed_input(input.type());
DenseTensor transformed_output_grad(output_grad.type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
TransToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
ResizeToChannelFirst<Context, T>(
dev_ctx, &output_grad, &transformed_output_grad);
TransToChannelFirst<Context, T>(
dev_ctx, &output_grad, &transformed_output_grad);
} else {
transformed_input = input;
transformed_output_grad = output_grad;
}
// update padding and dilation
auto in_dims = transformed_input.dims();
auto filter_dims = filter.dims();
DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation<int>(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_input.dims()[0]);
// filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(vectorize(filter.dims()));
// output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(
vectorize(transformed_output_grad.dims()));
// use col_shape in the im2col calculation
// col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
// o_h, o_w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = transformed_input.dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
DDim col_shape(make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size: (i_c/g * k_h * k_w, o_h * o_w)
// or
// (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
DDim input_shape =
slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size());
DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
DDim output_matrix_shape = {
transformed_output_grad.dims()[1],
transformed_output_grad.numel() / (transformed_output_grad.dims()[0] *
transformed_output_grad.dims()[1])};
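  // Illustrative shapes: for out_grad of shape {N, o_c, o_h, o_w}, each
  // per-sample slice is viewed as an (o_c, o_h * o_w) matrix in the GEMM
  // calls below.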
// convolution backward input operator: gemm + col2im(or col2vol)
// convolution backward weight operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_output_grad.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
DenseTensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
DenseTensor col_matrix;
if (is_expand) {
col.Resize(col_shape);
col.mutable_data<T>(dev_ctx.GetPlace());
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
phi::funcs::SetConstant<Context, T> set_zero;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
if (input_grad) {
input_grad->mutable_data<T>(dev_ctx.GetPlace());
DenseTensor transformed_input_grad(input_grad->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(
dev_ctx, input_grad, &transformed_input_grad);
} else {
transformed_input_grad = *input_grad;
}
// if is_expand is false, the operation of set_zero is unnecessary,
// because math::matmul will reset input_grad.
if (is_expand) {
set_zero(dev_ctx, &transformed_input_grad, static_cast<T>(0));
}
paddle::operators::math::Col2VolFunctor<Context, T> col2vol;
paddle::operators::math::
Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
col2im;
for (int i = 0; i < batch_size; i++) {
DenseTensor out_grad_batch =
transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
DenseTensor in_grad_batch =
transformed_input_grad.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
DenseTensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor filter_slice =
filter.Slice(g * out_step, (g + 1) * out_step);
DenseTensor in_grad_slice =
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix.ShareDataWith(in_grad_slice);
col_matrix.Resize(col_matrix_shape);
}
blas.MatMul(filter_slice,
true,
out_grad_slice,
false,
T(1.0),
&col_matrix,
T(0.0));
if (is_expand && data_dim == 2U) {
col2im(dev_ctx,
col,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&in_grad_slice);
} else if (is_expand && data_dim == 3U) {
col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice);
}
}
}
if (channel_last) {
TransToChannelLast<Context, T>(
dev_ctx, &transformed_input_grad, input_grad);
}
}
if (filter_grad) {
filter_grad->mutable_data<T>(dev_ctx.GetPlace());
Tensor filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
for (int i = 0; i < batch_size; i++) {
DenseTensor out_grad_batch =
transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
DenseTensor in_batch =
transformed_input.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// im2col
DenseTensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
in_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
}
// gemm
DenseTensor filter_grad_slice =
filter_grad_.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(out_grad_slice,
false,
col_matrix,
true,
T(1.0),
&filter_grad_slice,
T(1.0));
}
}
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void ConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter_t,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* output) {
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
DenseTensor filter = filter_t;
// The filter will be reshaped in the calculations,
  // so an assignment is used here,
  // which avoids modifying the variable in the Scope.
output->mutable_data<T>(dev_ctx.GetPlace());
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
DenseTensor transformed_input(input.type());
DenseTensor transformed_output(output->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
TransToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
ResizeToChannelFirst<Context, T>(dev_ctx, output, &transformed_output);
} else {
transformed_input = input;
transformed_output = *output;
}
// update padding and dilation
auto trans_in_dims = transformed_input.dims();
auto filter_dims = filter.dims();
DDim in_data_dims = slice_ddim(trans_in_dims, 2, trans_in_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_input.dims()[0]);
// filter_shape_vec:
// {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(vectorize(filter.dims()));
// output_shape_vec:
// {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(vectorize(transformed_output.dims()));
// use col_shape in the im2col calculation
// col_shape_vec:
// {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w,
// o_d,o_h, o_w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = trans_in_dims[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
DDim col_shape(make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size:
// (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h *
// o_w)
  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
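  // Illustrative example (numbers not taken from this diff): a conv2d with
  // i_c = 8, groups = 2, a 3x3 filter and 32x32 output gives
  // col_shape = {4, 3, 3, 32, 32} and col_matrix_shape = {36, 1024}.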
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
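  // is_expand is false only for a 1x1(x1) filter with unit strides, zero
  // paddings and unit dilations; in that case im2col is an identity mapping,
  // so the input slice is reused directly (see the ShareDataWith branch below).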
DenseTensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
DenseTensor col_matrix;
if (is_expand) {
// col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
col.Resize(col_shape);
col.mutable_data<T>(dev_ctx.GetPlace());
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
DDim in_matrix_shape =
slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size());
DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
DDim output_matrix_shape = {
transformed_output.dims()[1],
transformed_output.numel() /
(transformed_output.dims()[0] * transformed_output.dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_output.dims()[1]) / groups;
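  // Grouped convolution: input and output channels are split into `groups`
  // equal slices and each group is convolved independently in the loops below.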
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
for (int i = 0; i < batch_size; i++) {
DenseTensor in_batch =
transformed_input.Slice(i, i + 1).Resize(in_matrix_shape);
DenseTensor out_batch =
transformed_output.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
in_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
}
// gemm
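      // Forward GEMM for this group: Out_g = W_g * col_matrix, i.e.
      // (o_c/g, i_c/g*k_h*k_w) x (i_c/g*k_h*k_w, o_h*o_w) -> (o_c/g, o_h*o_w),
      // written with beta = T(0.0) (no accumulation into the output slice).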
DenseTensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
filter_slice, false, col_matrix, false, T(1.0), &out_slice, T(0.0));
}
}
if (channel_last) {
TransToChannelLast<Context, T>(dev_ctx, &transformed_output, output);
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("conv2d",
{"Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{"Output"});
}
KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("conv2d_grad",
{GradVarName("Output"), "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{GradVarName("Input"), GradVarName("Filter")});
}
KernelSignature Conv2dDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("conv2d_grad_grad",
{"DDInput", "DDFilter", "DOutput", "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{"DDOutput", "DInput", "DFilter"});
}
} // namespace phi
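// Register the mappings above so the dispatch layer can translate the fluid
// conv2d op's Input/Filter/Output names and attributes into the corresponding
// phi kernel signatures.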
PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv2d_grad, phi::Conv2dGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv2d_grad_grad,
phi::Conv2dDoubleGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("conv3d",
{"Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{"Output"});
}
KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("conv3d_grad",
{GradVarName("Output"), "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{GradVarName("Input"), GradVarName("Filter")});
}
KernelSignature Conv3dDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("conv3d_grad_grad",
{"DDInput", "DDFilter", "DOutput", "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{"DDOutput", "DInput", "DFilter"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(conv3d, phi::Conv3dOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv3d_grad, phi::Conv3dGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv3d_grad_grad,
phi::Conv3dDoubleGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature DepthwiseConv2dOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("depthwise_conv2d",
{"Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search",
"fuse_relu_before_depthwise_conv"},
{"Output"});
}
KernelSignature DepthwiseConv2dGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("depthwise_conv2d_grad",
{GradVarName("Output"), "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search",
"fuse_relu_before_depthwise_conv"},
{GradVarName("Input"), GradVarName("Filter")});
}
KernelSignature DepthwiseConv2dDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("depthwise_conv2d_grad_grad",
{"DDInput", "DDFilter", "DOutput", "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search",
"fuse_relu_before_depthwise_conv"},
{"DDOutput", "DInput", "DFilter"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d,
phi::DepthwiseConv2dOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad,
phi::DepthwiseConv2dGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad_grad,
phi::DepthwiseConv2dDoubleGradOpArgumentMapping);
@@ -230,4 +230,5 @@ def load_tests(loader, standard_tests, pattern):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
@@ -18,6 +18,7 @@ import paddle.fluid.dygraph as dg
 import paddle.nn.functional as F
 import paddle.fluid.initializer as I
 import unittest
+import paddle
 def _reverse_repeat_list(t, n):
@@ -284,4 +285,5 @@ def load_tests(loader, standard_tests, pattern):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
@@ -604,7 +604,7 @@ class TestWithInput1x1Filter1x1(TestConv2DOp):
         self.groups = 3
-#----------------Conv2DCUDNN----------------
+# #----------------Conv2DCUDNN----------------
 create_test_cudnn_class(TestConv2DOp)
 create_test_cudnn_class(TestWithPad)
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle
 def conv3d_forward_naive(input,
@@ -1001,4 +1002,5 @@ class TestConv3DAPI_Error(unittest.TestCase):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
@@ -44,7 +45,6 @@ class TestConvDoubleGradCheck(unittest.TestCase):
     def test_grad(self):
         places = [fluid.CPUPlace()]
-        places = []
         if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
@@ -120,7 +120,8 @@ class TestConv3DDoubleGradCheck(unittest.TestCase):
             [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
     def test_grad(self):
-        places = [fluid.CPUPlace()]
+        #places = [fluid.CPUPlace()]
+        places = []
         if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
@@ -503,4 +504,5 @@ class TestDepthWiseConvDoubleGradCheck(unittest.TestCase):
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
@@ -534,4 +534,5 @@ class TestFunctionalConv2DErrorCase13(TestFunctionalConv2DErrorCase12):
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
@@ -509,4 +509,5 @@ class TestFunctionalConv3DErrorCase12(TestFunctionalConv3DErrorCase11):
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
@@ -117,4 +117,5 @@ class TestMNIST(TestParallelExecutorBase):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
@@ -617,4 +617,5 @@ class TestStarGANWithGradientPenalty(unittest.TestCase):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()