Unverified commit d50fb43e authored by H hong, committed by GitHub

Move conv to pten (#39354)

* move conv to pten

* move conv to pten; test=develop

* fix bug;

* add conv cudnn impl; test=develop

* update

* update operator; test=develop

* fix bug; test=develop

* move operator and prepared_operator to develop; test=develop

* resolve conflict; test=develop

* remove useless code;test=develop

* add dependency; test=develop

* fix bug;

* add sig.cc; test=develop

* fix use_op error; test=develop

* fix bug; test=develop

* fix bug; test=develop

* add conv3d register; test=develop

* fix StarGAN and conv_nn_grad test failures; test=develop

* add header; test=develop

* manually recover to develop;

* resolve conflict; test=develop

* remove useless code

* fix bug;

* remove conv2d_cudnn; test=develop

* fix bugs; test=develop

* fix cpu rocm compile bugs; test=develop

* fix blas error; test=develop

* fix compile bug; test=develop

* fix windows compile error; test=develop

* fix windows error; test=develop

* resolve conflict; test=develop
Parent eaacf8bf
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
USE_OP(conv2d);
USE_OP_ITSELF(conv2d);
USE_OP(conv2d_transpose);
namespace paddle {
......
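For context on this recurring macro change: USE_OP pulls in the operator definition together with its fluid kernel registrations, while USE_OP_ITSELF references only the operator itself, which is all a test needs once the compute kernels live in phi. A minimal sketch of the before/after pattern, assuming the phi kernel still has to be declared wherever a test actually runs it (the backend argument here is illustrative; the fused-op test hunk further down shows the real usage):
// Before: also drags in the fluid conv2d kernel registrations.
USE_OP(conv2d);
// After: reference the operator only; declare the phi kernel separately if needed.
USE_OP_ITSELF(conv2d);
PD_DECLARE_KERNEL(conv2d, CPU, ALL_LAYOUT);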
......@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace paddle {
namespace operators {
......@@ -53,12 +54,11 @@ static inline void GetNCDHW(const framework::DDim& dims,
}
template <typename DeviceContext, typename T, size_t D>
static void RemovePaddingSlice(const framework::ExecutionContext& context,
static void RemovePaddingSlice(const phi::GPUContext& context,
const Tensor* input, Tensor* out,
const std::vector<int>& starts,
const std::vector<int>& axes) {
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto& place = *context.eigen_device();
auto in_dims = input->dims();
auto new_out_dims = out->dims();
auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
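A hedged sketch of a call site after this signature change, with hypothetical tensors `padded` and `sliced`: the caller no longer threads a framework::ExecutionContext through, it hands over the phi::GPUContext it already holds.
// Assuming a phi::GPUContext `dev_ctx` is in scope and T/D match the tensors:
std::vector<int> starts = {0, 0};
std::vector<int> axes = {2, 3};
RemovePaddingSlice<phi::GPUContext, T, 4>(dev_ctx, &padded, &sliced, starts, axes);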
......@@ -171,11 +171,10 @@ void ChooseAlgo(const std::vector<PerfType>& perf_results,
using framework::ConvSearchCache;
static void SetConvMathType(const framework::ExecutionContext& ctx,
cudnnDataType_t dtype,
static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx;
if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
......@@ -231,8 +230,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic,
const framework::ExecutionContext& ctx) {
bool deterministic, const phi::GPUContext& ctx) {
auto dtype = platform::CudnnDataType<T>::type;
bool has_got_workspace_size = true;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
......@@ -284,8 +282,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
} else if (deterministic) {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(1);
} else {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
......@@ -346,8 +343,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic,
const framework::ExecutionContext& ctx) {
bool deterministic, const phi::GPUContext& ctx) {
auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0;
......@@ -413,8 +409,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
} else if (deterministic) {
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
......@@ -478,8 +473,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic,
const framework::ExecutionContext& ctx) {
bool deterministic, const phi::GPUContext& ctx) {
platform::CUDAGraphCaptureModeGuard guard;
auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
......@@ -534,8 +528,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
} else if (deterministic) {
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetBackwardFilter());
......
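The matching call-site change, sketched from the conv-transpose hunks below: the caller now resolves the device context once and passes it in, relying on platform::CUDADeviceContext being usable where a const phi::GPUContext& is expected.
// Before: Find took the ExecutionContext and dug the device context out itself.
// algo = search::Find<T>(args, false, deterministic, ctx);
// After:
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
algo = search::Find<T>(args, false, deterministic, dev_ctx);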
This diff is collapsed.
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace paddle {
namespace operators {
......@@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims,
}
template <typename DeviceContext, typename T, size_t D>
static void RemovePaddingSlice(const framework::ExecutionContext& context,
static void RemovePaddingSlice(const phi::GPUContext& context,
const Tensor* input, Tensor* out,
const std::vector<int>& starts,
const std::vector<int>& axes) {
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto& place = *context.eigen_device();
auto in_dims = input->dims();
auto new_out_dims = out->dims();
auto offsets = Eigen::array<int, D>();
......@@ -128,11 +128,10 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, size_t workspace_size,
const framework::ExecutionContext& ctx) {
const phi::GPUContext& ctx) {
algo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
int find_count;
miopenConvAlgoPerf_t find_result;
......@@ -170,11 +169,10 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, size_t workspace_size,
const framework::ExecutionContext& ctx) {
const phi::GPUContext& ctx) {
algo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
int find_count;
miopenConvAlgoPerf_t find_result;
......@@ -212,11 +210,10 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, size_t workspace_size,
const framework::ExecutionContext& ctx) {
const phi::GPUContext& ctx) {
algo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
int find_count;
miopenConvAlgoPerf_t find_result;
......
......@@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
paddle::framework::DataTypeToString(input_data_type),
paddle::framework::DataTypeToString(filter_data_type)));
}
#ifndef PADDLE_WITH_ASCEND_CL
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(
library, framework::LibraryType::kCUDNN,
platform::errors::InvalidArgument(
"float16 can only be used when CUDNN or NPU is used"));
}
#endif
// #ifndef PADDLE_WITH_ASCEND_CL
// if (input_data_type == framework::proto::VarType::FP16) {
// PADDLE_ENFORCE_EQ(
// library, framework::LibraryType::kCUDNN,
// platform::errors::InvalidArgument(
// "float16 can only be used when CUDNN or NPU is used"));
// }
// #endif
#if PADDLE_WITH_CUDA
if (input_data_type == framework::proto::VarType::BF16 &&
library == framework::LibraryType::kCUDNN) {
......@@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad,
ops::Conv3DDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad);
// depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_grad_grad,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv3d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv3d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv3d_grad_grad,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_VERSION(conv2d)
.AddCheckpoint(
R"ROC(
......
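These REGISTER_OP_CPU_KERNEL blocks are deleted because the same kernels are re-registered on the phi side later in this commit; the replacement pattern, taken from the new CPU conv kernel files shown below, is:
PD_REGISTER_KERNEL(conv2d, CPU, ALL_LAYOUT, phi::ConvKernel, float, double) {}
PD_REGISTER_KERNEL(
    conv2d_grad, CPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}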
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv3d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv3d_grad,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
This diff is collapsed.
......@@ -244,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_HIP
using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
algo = search::Find<T>(args, false, deterministic, workspace_size, ctx);
algo = search::Find<T>(
args, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
algo = search::Find<T>(args, false, deterministic, ctx);
algo = search::Find<T>(
args, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, algo));
#endif
......@@ -501,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size =
std::max(workspace_size, search1::GetWorkspaceSize(args1));
data_algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
data_algo = search1::Find<T>(
args1, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search1::Find<T>(args1, false, deterministic, ctx);
data_algo = search1::Find<T>(
args1, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size =
std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
#endif
......@@ -523,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2));
filter_algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
filter_algo = search2::Find<T>(
args2, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
filter_algo = search2::Find<T>(
args2, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size,
search2::GetWorkspaceSize(args2, filter_algo));
#endif
......@@ -944,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1);
bwd_algo1 =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
bwd_algo1 = search1::Find<T>(
args1, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx);
bwd_algo1 = search1::Find<T>(
args1, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
#endif
}
......@@ -965,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2));
bwd_algo2 =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
bwd_algo2 = search2::Find<T>(
args2, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx);
bwd_algo2 = search2::Find<T>(
args2, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size,
search2::GetWorkspaceSize(args2, bwd_algo2));
#endif
......@@ -990,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size =
std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo =
search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
filter_algo = search3::Find<T>(
args3, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search3::Find<T>(args3, false, deterministic, ctx);
filter_algo = search3::Find<T>(
args3, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size,
search3::GetWorkspaceSize(args3, filter_algo));
#endif
......@@ -1013,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo =
search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
data_algo = search4::Find<T>(
args4, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search4::Find<T>(args4, false, deterministic, ctx);
data_algo = search4::Find<T>(
args4, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
#endif
......
......@@ -13,10 +13,150 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
PADDLE_ENFORCE_EQ(
groups, filter.dims()[0],
platform::errors::InvalidArgument(
"groups should be error to the 1st dimension of filter. But "
"received groups is %d and filter dimension[0] is %d",
groups, filter.dims()[0]));
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
for (auto v : dilations) {
PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
"dilations should be 1 in depthwise conv. "
"But received dilations is %d",
v));
}
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, output, static_cast<T>(0));
math::DepthwiseConvInputGradFunctor<phi::GPUContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output, filter, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, output, data_layout);
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
if (input_grad) {
math::DepthwiseConvFunctor<phi::GPUContext, T> depthwiseConv;
depthwiseConv(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output_grad, filter, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, input_grad, data_layout);
}
if (filter_grad) {
phi::funcs::SetConstant<DeviceContext, T> set_zero;
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::DepthwiseConvFilterGradFunctor<phi::GPUContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output_grad, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, filter_grad, data_layout);
}
}
};
} // namespace operators
} // namespace paddle
// conv2d
REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
ops::GemmConvTransposeKernel<CUDA, float>,
......
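The static_cast through framework::ConvertToPhiContext in these kernels is the bridge while they still compile against fluid device contexts: the trait maps platform::CPUDeviceContext to phi::CPUContext and platform::CUDADeviceContext to phi::GPUContext. A minimal sketch of the idea, with names taken from the kernel above (the padding reorder is elided):
// The phi functor wants a phi context; the fluid kernel holds a fluid one.
auto& dev_ctx = context.template device_context<DeviceContext>();
math::DepthwiseConvFunctor<phi::GPUContext, T> depthwise_conv;
depthwise_conv(static_cast<const typename framework::ConvertToPhiContext<
                   DeviceContext>::TYPE&>(dev_ctx),
               *output_grad, filter, strides, paddings, dilations, input_grad,
               data_layout);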
......@@ -21,7 +21,6 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
......@@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
PADDLE_ENFORCE_EQ(
groups, filter.dims()[0],
platform::errors::InvalidArgument(
"groups should be error to the 1st dimension of filter. But "
"received groups is %d and filter dimension[0] is %d",
groups, filter.dims()[0]));
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
for (auto v : dilations) {
PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
"dilations should be 1 in depthwise conv. "
"But received dilations is %d",
v));
}
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, output, static_cast<T>(0));
math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(
dev_ctx, *output, filter, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, output, data_layout);
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
if (input_grad) {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
depthwiseConv(
dev_ctx, *output_grad, filter, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, input_grad, data_layout);
}
if (filter_grad) {
phi::funcs::SetConstant<DeviceContext, T> set_zero;
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(
dev_ctx, *output_grad, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, filter_grad, data_layout);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace framework = paddle::framework;
......@@ -29,10 +30,10 @@ namespace platform = paddle::platform;
namespace op = paddle::operators;
using Tensor = paddle::framework::Tensor;
USE_OP(conv2d);
USE_OP(conv2d_grad);
USE_OP_DEVICE_KERNEL(conv2d, CUDNN);
USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN);
USE_OP_ITSELF(conv2d);
USE_OP_ITSELF(conv2d_grad);
PD_DECLARE_KERNEL(conv2d, GPUDNN, ALL_LAYOUT);
PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT);
template <typename T>
void InitRandomTensor(const std::vector<int64_t> &dims,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/hostdevice.h"
namespace paddle {
namespace operators {
namespace math {
using DataLayout = framework::DataLayout;
/*
* \brief Compute the depthwise convolution which include
* forward process and backpropagation process
*/
template <typename DeviceContext, typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations, framework::Tensor* output,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <typename DeviceContext, typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvInputGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <typename DeviceContext, typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvFilterGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad,
const DataLayout data_layout = DataLayout::kNCHW);
};
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
......@@ -141,6 +143,116 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
}
};
template <class T>
class Vol2ColFunctor<phi::CPUContext, T> {
public:
void operator()(const phi::CPUContext& context, const framework::Tensor& vol,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
platform::errors::InvalidArgument(
"The dimension of vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(col->dims().size(), 7,
platform::errors::InvalidArgument(
"The dimension of col should be 7, but received %d.",
col->dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]);
int filter_depth = col->dims()[1];
int filter_height = col->dims()[2];
int filter_width = col->dims()[3];
int output_depth = col->dims()[4];
int output_height = col->dims()[5];
int output_width = col->dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
// changed
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(
input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
const T* vol_data = vol.data<T>();
T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int c_in = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
int col_idx =
((c * output_depth + d) * output_height + h) * output_width + w;
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
c_in;
}
col_data[col_idx] =
(h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
? static_cast<T>(0)
: vol_data[vol_idx];
}
}
}
}
}
};
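A quick worked example of the shapes this loop ties together, for a 1-channel 4x4x4 volume, a 2x2x2 filter, stride 1, no padding, dilation 1:
vol:  [C=1, D=4, H=4, W=4]
col:  [C=1, fd=2, fh=2, fw=2, od=3, oh=3, ow=3]
od = (4 + 0 + 0 - (1 * (2 - 1) + 1)) / 1 + 1 = 3, and likewise oh = ow = 3
channels_col = 1 * 2 * 2 * 2 = 8
Each of the 8 filter taps gets its own 3x3x3 slab in col, filled from vol (or with zeros where the tap lands in padding), which is exactly what the enforce checks and the innermost assignment above implement.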
/*
* vol = [input_channels,input_depth, input_height, input_width]
* col =
......@@ -258,10 +370,125 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
}
};
template <class T>
class Col2VolFunctor<phi::CPUContext, T> {
public:
void operator()(const phi::CPUContext& context, const framework::Tensor& col,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
platform::errors::InvalidArgument(
"The dimension of vol should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(col.dims().size(), 7,
platform::errors::InvalidArgument(
"The dimension of col should be 7, but received %d.",
col.dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol->dims()[3] : vol->dims()[2]);
int filter_depth = col.dims()[1];
int filter_height = col.dims()[2];
int filter_width = col.dims()[3];
int output_depth = col.dims()[4];
int output_height = col.dims()[5];
int output_width = col.dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(
input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
T* vol_data = vol->data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int cIm = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx =
((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
cIm;
}
int col_idx =
((c * output_depth + d) * output_height + h) * output_width +
w;
vol_data[vol_idx] += col_data[col_idx];
}
}
}
}
}
}
};
template class Vol2ColFunctor<platform::CPUDeviceContext, float>;
template class Vol2ColFunctor<platform::CPUDeviceContext, double>;
template class Vol2ColFunctor<phi::CPUContext, float>;
template class Vol2ColFunctor<phi::CPUContext, double>;
template class Col2VolFunctor<platform::CPUDeviceContext, float>;
template class Col2VolFunctor<platform::CPUDeviceContext, double>;
template class Col2VolFunctor<phi::CPUContext, float>;
template class Col2VolFunctor<phi::CPUContext, double>;
} // namespace math
} // namespace operators
......
......@@ -33,7 +33,7 @@ USE_OP(relu);
USE_OP_DEVICE_KERNEL(relu, MKLDNN);
USE_OP_ITSELF(softmax);
USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
USE_OP(conv2d);
USE_OP_ITSELF(conv2d);
USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);
namespace paddle {
......@@ -55,7 +55,7 @@ class CacheTester {
onednn_dev_ctx_->ResetBlobMap(nullptr);
}
bool Analyze(unsigned short int num_entries) {
bool Analyze(uint16_t num_entries) {
// Number of created objects in cache should be as expected (num_entries)
return onednn_dev_ctx_->GetCachedObjectsNumber() == num_entries;
}
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <string>
#include <tuple>
#include "paddle/phi/common/place.h"
#include "paddle/utils/any.h"
#include "paddle/utils/flat_hash_map.h"
#include "paddle/utils/small_vector.h"
......
......@@ -10,7 +10,7 @@ add_subdirectory(funcs)
set_property(GLOBAL PROPERTY PHI_KERNELS "")
set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor softmax)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax)
# remove this dep after removing fluid deps on tensor creation
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ConvGradGradKernel(const Context& dev_ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad);
template <typename T, typename Context>
void Conv3DGradGradKernel(const Context& dev_ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad);
template <typename T, typename Context>
void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad);
template <typename T, typename Context>
void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* input_grad,
DenseTensor* filter_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out);
template <typename T, typename Context>
void Conv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out);
template <typename T, typename Context>
void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* out);
} // namespace phi
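These declarations take a Context template parameter instead of fluid's DeviceContext, so every backend instantiates the same template. A hedged sketch of a direct CPU invocation, with argument names matching the ConvKernel declaration above:
// Assuming a phi::CPUContext `dev_ctx`, DenseTensors `input`/`filter`/`out`,
// and the attribute values already extracted:
phi::ConvKernel<float, phi::CPUContext>(
    dev_ctx, input, filter, strides, paddings, padding_algorithm, groups,
    dilations, data_format, use_addto, workspace_size_MB, exhaustive_search,
    &out);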
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void Conv3DGradGradKernel(const Context& ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradGradKernel<T>(ctx,
input_grad_grad,
filter_grad_grad,
out_grad,
input,
filter,
strides,
paddings_t,
padding_algorithm,
groups,
dilations_t,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search_t,
out_grad_grad,
input_grad,
filter_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
}
PD_REGISTER_KERNEL(conv3d_grad_grad,
CPU,
ALL_LAYOUT,
phi::Conv3DGradGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
template <typename T, typename Context>
void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_grad, CPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}
PD_REGISTER_KERNEL(depthwise_conv2d_grad,
CPU,
ALL_LAYOUT,
phi::DepthwiseConvGradKernel,
float,
double) {}
PD_REGISTER_KERNEL(
conv3d_grad, CPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
template <typename T, typename Context>
void Conv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
} // namespace phi
PD_REGISTER_KERNEL(conv2d, CPU, ALL_LAYOUT, phi::ConvKernel, float, double) {}
PD_REGISTER_KERNEL(depthwise_conv2d,
CPU,
ALL_LAYOUT,
phi::DepthwiseConvKernel,
float,
double) {}
PD_REGISTER_KERNEL(conv3d, CPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {}
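The commit message's "add sig.cc" refers to the argument-mapping files that let the old fluid op names and attribute lists resolve to these phi kernels. A hedged sketch of what such a mapping looks like for conv2d (the file in this commit may differ in detail; the attribute list follows the ConvKernel declaration above):
KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("conv2d",
                         {"Input", "Filter"},
                         {"strides", "paddings", "padding_algorithm", "groups",
                          "dilations", "data_format", "use_addto",
                          "workspace_size_MB", "exhaustive_search"},
                         {"Output"});
}
PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping);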
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/ddim.h"
namespace phi {
template <typename T = int>
inline void UpdatePaddingAndDilation(std::vector<T>* paddings,
std::vector<T>* dilation,
const std::string padding_algorithm,
const DDim data_dims,
const std::vector<T>& strides,
const std::vector<T>& ksize) {
// set padding size == data_dims.size() * 2
auto data_shape = vectorize<T>(data_dims);
if (static_cast<int>(paddings->size()) == data_dims.size()) {
for (int i = 0; i < data_dims.size(); ++i) {
T copy_pad = *(paddings->begin() + 2 * i);
paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
}
} else {
PADDLE_ENFORCE_EQ(
data_dims.size() * 2,
paddings->size(),
phi::errors::InvalidArgument(
"Attribute padding's size should be the same or twice as the "
"input's dimension. "
"But recieved: padding's size is %d, padding is [%s]; input's "
"dimension is %d, input's shape is [%s].",
paddings->size(),
make_ddim(*paddings),
data_dims.size(),
data_dims));
}
// when padding_algorithm is "VALID" or "SAME"
if (padding_algorithm == "SAME") {
for (int i = 0; i < data_dims.size(); ++i) {
T out_size = (data_dims[i] + strides[i] - 1) / strides[i];
T pad_sum =
std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i],
static_cast<T>(0));
T pad_0 = pad_sum / 2;
T pad_1 = pad_sum - pad_0;
*(paddings->begin() + i * 2) = pad_0;
*(paddings->begin() + i * 2 + 1) = pad_1;
// dilation
*(dilation->begin() + i) = 1;
}
} else if (padding_algorithm == "VALID") {
for (auto it = paddings->begin(); it != paddings->end(); it++) {
*it = 0;
}
}
}
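A concrete instance of the SAME branch: with a single spatial dimension of size 7, stride 2, and kernel size 3,
out_size = (7 + 2 - 1) / 2 = 4
pad_sum  = max((4 - 1) * 2 + 3 - 7, 0) = 2
pad_0 = 2 / 2 = 1,  pad_1 = 2 - 1 = 1
so the padding becomes {1, 1}, the dilation is forced to 1, and the output size is exactly ceil(7 / 2) = 4.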
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
if (paddings.size() != strides.size()) {
for (size_t j = 0; j < paddings.size(); ++j) {
padding_0 = padding_0 && (paddings[j] == 0);
}
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
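// Editorial sketch of IsExpand semantics: the im2col expansion is skipped
// only for a pure 1x1 convolution. For example,
//   filter_dim = {64, 3, 1, 1}, strides = {1, 1},
//   paddings = {0, 0}, dilations = {1, 1}
// yields IsExpand(...) == false, so the kernel can reduce to a plain GEMM;
// any non-unit filter size, stride, or dilation, or nonzero padding, makes
// it return true.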
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
using Tensor = DenseTensor;
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
VLOG(5) << "Why am I called?";
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 4, 1, 2, 3};
phi::funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 3, 1, 2};
phi::funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
phi::funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 2, 3, 4, 1};
phi::funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 2, 3, 1};
phi::funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
phi::funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
} // namespace phi
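// Editorial note: the helpers above are thin wrappers over
// phi::funcs::Transpose with fixed axis permutations; for 4-D data the
// mapping is NHWC -> NCHW via axis {0, 3, 1, 2} (TransToChannelFirst) and
// NCHW -> NHWC via axis {0, 2, 3, 1} (TransToChannelLast). A hypothetical
// call pattern (names illustrative):
//
//   DenseTensor channel_first;
//   ResizeToChannelFirst<phi::GPUContext, float>(ctx, &input, &channel_first);
//   TransToChannelFirst<phi::GPUContext, float>(ctx, &input, &channel_first);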
......@@ -15,10 +15,10 @@ limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace phi {
namespace funcs {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(
conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
                      const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
                    padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_grad, GPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}
PD_REGISTER_KERNEL(
conv3d_grad, GPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void Conv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
} // namespace phi
PD_REGISTER_KERNEL(conv2d, GPU, ALL_LAYOUT, phi::ConvKernel, float, double) {}
PD_REGISTER_KERNEL(conv3d, GPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
const DenseTensor* output_grad = &out_grad;
if (!input_grad && !filter_grad) return;
std::vector<int> strides = strides_t;
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
// update padding and dilation
auto in_dims = input.dims();
auto filter_dims = filter.dims();
DDim in_data_dims;
const paddle::framework::DataLayout data_layout =
paddle::framework::StringToDataLayout(data_format);
if (data_layout != paddle::framework::DataLayout::kNHWC) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
  bool is_sys_pad = strides.size() * 2 != paddings.size();
if (!is_sys_pad) {
for (size_t i = 0; i < strides.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
phi::funcs::SetConstant<Context, T> set_zero;
if (input_grad) {
input_grad->mutable_data<T>(dev_ctx.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
if (fuse_relu) {
paddle::operators::math::DepthwiseConvInputGradFunctor<Context, T, true>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx,
input,
filter,
*output_grad,
strides,
paddings,
dilations,
input_grad,
data_layout);
} else {
paddle::operators::math::DepthwiseConvInputGradFunctor<Context, T, false>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx,
input,
filter,
*output_grad,
strides,
paddings,
dilations,
input_grad,
data_layout);
}
}
if (filter_grad) {
filter_grad->mutable_data<T>(dev_ctx.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
if (fuse_relu) {
paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T, true>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx,
input,
*output_grad,
strides,
paddings,
dilations,
filter_grad,
data_layout);
} else {
paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T, false>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx,
input,
*output_grad,
strides,
paddings,
dilations,
filter_grad,
data_layout);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(depthwise_conv2d_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvGradKernel,
float,
double) {}
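// Editorial note: the boolean template parameter on the depthwise functors
// (e.g. DepthwiseConvInputGradFunctor<Context, T, true>) selects the
// fused-ReLU code path at compile time, so the `fuse_relu` branch is
// resolved once per launch rather than per element.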
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* out) {
DenseTensor* output = out;
output->mutable_data<T>(dev_ctx.GetPlace());
const std::vector<int> strides = strides_t;
std::vector<int> dilations = dilations_t;
std::vector<int> paddings = paddings_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
if (channel_last) {
PADDLE_ENFORCE_EQ(
output->dims()[output->dims().size() - 1] %
input.dims()[input.dims().size() - 1],
0,
phi::errors::InvalidArgument(
"ShapeError: The output channels must be a multiple of the "
"input channels. But receivced output channel number is %d "
"and input channel number is %d",
output->dims()[output->dims().size() - 1],
input.dims()[input.dims().size() - 1]));
} else {
PADDLE_ENFORCE_EQ(
output->dims()[1] % input.dims()[1],
0,
phi::errors::InvalidArgument(
"ShapeError: The output channels must be a multiple of the "
"input channels. But receivced output channel number is %d "
"and input channel number is %d",
output->dims()[1],
input.dims()[1]));
}
// update padding and dilation
auto in_dims = input.dims();
auto filter_dims = filter.dims();
DDim in_data_dims;
const paddle::framework::DataLayout data_layout =
paddle::framework::StringToDataLayout(data_format);
if (data_layout != paddle::framework::DataLayout::kNHWC) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
  bool is_sys_pad = strides.size() * 2 != paddings.size();
if (!is_sys_pad) {
for (size_t i = 0; i < strides.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
if (fuse_relu) {
paddle::operators::math::DepthwiseConvFunctor<Context, T, true>
depthwiseConv;
depthwiseConv(dev_ctx,
input,
filter,
strides,
paddings,
dilations,
output,
data_layout);
} else {
paddle::operators::math::DepthwiseConvFunctor<Context, T, false>
depthwiseConv;
depthwiseConv(dev_ctx,
input,
filter,
strides,
paddings,
dilations,
output,
data_layout);
}
}
} // namespace phi
PD_REGISTER_KERNEL(depthwise_conv2d,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvKernel,
float,
double) {}
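// Editorial note on the padding collapse above: UpdatePaddingAndDilation may
// leave `paddings` in the expanded {before, after}-per-dim form, while the
// depthwise functors expect one value per spatial dim, so the erase loop
// keeps the first value of each pair. A sketch:
//
//   std::vector<int> paddings = {1, 1, 2, 2};  // H: {1, 1}, W: {2, 2}
//   for (size_t i = 0; i < 2; ++i)             // strides.size() == 2
//     paddings.erase(paddings.begin() + i + 1);
//   // paddings == {1, 2}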
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
namespace phi {
template <typename T, typename Context>
void ConvCudnnKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* output) {
output->mutable_data<T>(ctx.GetPlace());
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic,
false,
phi::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
  // HIP MIOPEN only supports the NCHW format.
auto compute_format = paddle::platform::DataLayout::kNCHW;
#else
  // Tensor Cores, introduced with Volta GPUs, support faster conv ops
  // with FP16 in the NHWC data format.
  const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx);
  // We only do data format conversion from NHWC to NCHW here;
  // cuDNN converts NCHW to NHWC automatically on Tensor Cores.
auto compute_format = compute_in_nhwc && channel_last
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
#endif
VLOG(3) << "Compute ConvOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == paddle::platform::DataLayout::kNHWC ? "NHWC"
: "NCHW");
// ------------ transformed tensor -----------
DenseTensor transformed_input_channel(input.type());
DenseTensor transformed_output(output->type());
DenseTensor transformed_filter_channel(filter.type());
T* output_data = nullptr;
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
TransToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
ResizeToChannelFirst<Context, T>(ctx, output, &transformed_output);
} else {
transformed_input_channel.ShareDataWith(input);
transformed_output.ShareDataWith(*output);
}
if (compute_format == paddle::platform::DataLayout::kNHWC) {
VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
ResizeToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
TransToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
} else {
transformed_filter_channel.ShareDataWith(filter);
}
output_data = transformed_output.data<T>();
// update padding and dilation
auto in_dims = transformed_input_channel.dims();
auto filter_dims = transformed_filter_channel.dims();
DDim in_data_dims;
DDim filter_data_dims;
if (compute_format == paddle::platform::DataLayout::kNCHW) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
DenseTensor transformed_input;
std::vector<int> padding_common(data_dim, 0);
if (!is_sys_pad) {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
if (compute_format == paddle::platform::DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
if (compute_format == paddle::platform::DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == paddle::platform::DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
DDim new_input_shape(make_ddim(new_input_shape_vec));
transformed_input.Resize(new_input_shape);
transformed_input.mutable_data<T>(ctx.GetPlace());
const int rank = transformed_input_channel.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
funcs::PadFunction<Context, T, 4>(ctx,
input_pad,
transformed_input_channel,
pad_value,
&transformed_input);
} break;
case 5: {
funcs::PadFunction<Context, T, 5>(ctx,
input_pad,
transformed_input_channel,
pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_input.ShareDataWith(transformed_input_channel);
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* input_data = transformed_input.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
// ------------------- cudnn descriptors ---------------------
paddle::operators::ConvArgs args{&transformed_input,
&transformed_filter_channel,
&transformed_output,
strides,
padding_common,
dilations,
dtype};
auto handle = ctx.cudnn_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
paddle::platform::DataLayout layout =
compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNDHWC
: paddle::platform::DataLayout::kNCDHW;
}
auto layout_format = paddle::platform::GetCudnnTensorFormat(layout);
args.handle = handle;
#ifdef PADDLE_WITH_HIP
  // MIOPEN needs groups set in cdesc; see miopen_desc.h.
args.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
groups);
#else
args.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn());
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
  // cuDNN 7 supports groups natively, so there is no need to handle them
  // manually.
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetConvolutionGroupCount(
args.cdesc.desc(), groups));
groups = 1;
#endif
#ifdef PADDLE_WITH_HIP
  // MIOPEN does not set groups in wdesc once groups are set in cdesc.
groups = 1;
#endif
args.idesc.set(transformed_input, layout_format);
args.wdesc.set(transformed_filter_channel, layout_format, groups);
args.odesc.set(transformed_output, layout_format);
int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == paddle::platform::DataLayout::kNHWC) {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNHWC,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
} else {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = transformed_filter_channel.numel() / groups;
// ------------------- cudnn conv workspace ---------------------
size_t workspace_size = 0; // final workspace to allocate.
// ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t algo{};
using search = paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search::GetWorkspaceSize(args);
algo = search::Find<T>(
args, exhaustive_search, deterministic, workspace_size, ctx);
#else
cudnnConvolutionFwdAlgo_t algo{};
using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
algo = search::Find<T>(args, exhaustive_search, deterministic, ctx);
workspace_size = search::GetWorkspaceSize(args, algo);
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
  // When groups > 1, SearchAlgorithm may pick
  // CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, which is unstable in
  // forward computation, so manually fall back to
  // CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM.
  if (groups > 1) {
    algo = static_cast<cudnnConvolutionFwdAlgo_t>(0);  // IMPLICIT_GEMM
}
#endif
// ------------------- cudnn conv forward ---------------------
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
  // NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
// VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
#ifdef PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args.idesc.desc(),
input_data,
args.wdesc.desc(),
filter_data,
args.cdesc.desc(),
algo,
&beta,
args.odesc.desc(),
output_data,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
handle,
&alpha,
args.idesc.desc(),
input_data + i * group_offset_in,
args.wdesc.desc(),
filter_data + i * group_offset_filter,
args.cdesc.desc(),
algo,
workspace_ptr,
workspace_size,
&beta,
args.odesc.desc(),
output_data + i * group_offset_out));
},
workspace_size);
}
#endif
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
TransToChannelLast<Context, T>(ctx, &transformed_output, output);
}
}
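// Editorial note: a worked example of the asymmetric-padding branch above.
// For one spatial dim with paddings = {1, 2}:
//   padding_diff   = |1 - 2| = 1
//   padding_common = min(1, 2) = 1
// so the input is first zero-padded explicitly with input_pad = {0, 1} on
// that dim (via funcs::PadFunction), and cuDNN then runs with the symmetric
// padding_common.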
template <typename T, typename Context>
void Conv3DCudnnKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvCudnnKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(conv2d,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnKernel,
float,
phi::dtype::float16) {}
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(conv2d,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(conv3d,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else
PD_REGISTER_KERNEL(conv2d,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnKernel,
float,
double,
phi::dtype::float16) {}
#endif
#endif
// TODO: register bfloat16
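// Editorial note: the bfloat16 registrations above are gated on
// CUDNN_VERSION_MIN(8, 1, 0), since bfloat16 conv support arrives with
// cuDNN 8.1; ROCm (PADDLE_WITH_HIP) builds currently register only float
// and float16.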