未验证 提交 31f57f29 编写于 作者: Y Yiqun Liu 提交者: GitHub

Move the header file of conv cudnn and miopen to phi directory. (#47248)

上级 a5f556f0
...@@ -26,15 +26,14 @@ ...@@ -26,15 +26,14 @@
#endif #endif
#include <cudnn.h> #include <cudnn.h>
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
#endif #endif
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT #include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT
#endif #endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h" #include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif #endif
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif #endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif #endif
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -34,7 +33,6 @@ ...@@ -34,7 +33,6 @@
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT #include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT
#endif #endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h" #include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif #endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
......
...@@ -16,10 +16,10 @@ limitations under the License. */ ...@@ -16,10 +16,10 @@ limitations under the License. */
#include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/padding.h" #include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
DECLARE_int64(cudnn_exhaustive_search_times); DECLARE_int64(cudnn_exhaustive_search_times);
...@@ -216,7 +216,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -216,7 +216,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc, cudnn_conv_desc,
cudnn_output_desc, cudnn_output_desc,
output_data, output_data,
kNUM_CUDNN_FWD_ALGS, phi::kNUM_CUDNN_FWD_ALGS,
&find_count, &find_count,
&find_result, &find_result,
cudnn_workspace_ptr, cudnn_workspace_ptr,
...@@ -337,7 +337,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -337,7 +337,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
int best_algo_idx = 0; int best_algo_idx = 0;
size_t tmp_size = 0; size_t tmp_size = 0;
std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results( std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); new cudnnConvolutionFwdAlgoPerf_t[phi::kNUM_CUDNN_FWD_ALGS]);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
handle, handle,
...@@ -345,7 +345,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -345,7 +345,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_filter_desc, cudnn_filter_desc,
cudnn_conv_desc, cudnn_conv_desc,
cudnn_output_desc, cudnn_output_desc,
kNUM_CUDNN_FWD_ALGS, phi::kNUM_CUDNN_FWD_ALGS,
&perf_count, &perf_count,
perf_results.get())); perf_results.get()));
algo = (perf_results.get())[best_algo_idx].algo; algo = (perf_results.get())[best_algo_idx].algo;
...@@ -378,7 +378,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -378,7 +378,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
[&]() -> SearchFuseResult<cudnnConvolutionFwdAlgo_t> { [&]() -> SearchFuseResult<cudnnConvolutionFwdAlgo_t> {
int returned_algo_count; int returned_algo_count;
SearchFuseResult<cudnnConvolutionFwdAlgo_t> fwd_result; SearchFuseResult<cudnnConvolutionFwdAlgo_t> fwd_result;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS> std::array<cudnnConvolutionFwdAlgoPerf_t, phi::kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat; fwd_perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace) { auto cudnn_find_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
...@@ -391,7 +391,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -391,7 +391,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc, cudnn_conv_desc,
cudnn_output_desc, cudnn_output_desc,
output_data, output_data,
kNUM_CUDNN_FWD_ALGS, phi::kNUM_CUDNN_FWD_ALGS,
&returned_algo_count, &returned_algo_count,
fwd_perf_stat.data(), fwd_perf_stat.data(),
cudnn_workspace, cudnn_workspace,
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -206,7 +206,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -206,7 +206,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
int best_algo_idx = 0; int best_algo_idx = 0;
size_t tmp_size = 0; size_t tmp_size = 0;
std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results( std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); new cudnnConvolutionFwdAlgoPerf_t[phi::kNUM_CUDNN_FWD_ALGS]);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
handle, handle,
...@@ -214,7 +214,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -214,7 +214,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
filter_desc[i], filter_desc[i],
conv_desc[i], conv_desc[i],
out_desc[i], out_desc[i],
kNUM_CUDNN_FWD_ALGS, phi::kNUM_CUDNN_FWD_ALGS,
&perf_count, &perf_count,
perf_results.get())); perf_results.get()));
algo[i] = (perf_results.get())[best_algo_idx].algo; algo[i] = (perf_results.get())[best_algo_idx].algo;
......
...@@ -14,52 +14,15 @@ limitations under the License. */ ...@@ -14,52 +14,15 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/conv_base_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace paddle { namespace phi {
namespace operators {
using ConvArgs = ConvArgsBase<cudnnHandle_t, cudnnDataType_t>; using ConvArgs = ConvArgsBase<cudnnHandle_t, cudnnDataType_t>;
template <typename DeviceContext, typename T, size_t D>
static void RemovePaddingSlice(const phi::GPUContext& context,
const phi::DenseTensor* input,
phi::DenseTensor* out,
const std::vector<int>& starts,
const std::vector<int>& axes) {
auto& place = *context.eigen_device();
auto in_dims = input->dims();
auto new_out_dims = out->dims();
auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = new_out_dims[i];
}
for (size_t i = 0; i < axes.size(); ++i) {
int start = starts[i];
if (start < 0) {
start = (start + in_dims[axes[i]]);
}
start = std::max(start, 0);
offsets[axes[i]] = start;
}
auto in_t =
phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(*input);
auto out_t = phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, new_out_dims);
phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
place, out_t, in_t, offsets, extents);
}
static inline double ToMegaBytes(size_t bytes) { static inline double ToMegaBytes(size_t bytes) {
return static_cast<double>(bytes) / (1 << 20); return static_cast<double>(bytes) / (1 << 20);
} }
...@@ -70,12 +33,12 @@ static inline bool UseFixedWorkspace() { ...@@ -70,12 +33,12 @@ static inline bool UseFixedWorkspace() {
static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
if (!use_fixed_workspace) { if (!use_fixed_workspace) {
int device_id = platform::GetCurrentDeviceId(); int device_id = phi::backends::gpu::GetCurrentDeviceId();
int64_t allocated = int64_t allocated =
memory::DeviceMemoryStatCurrentValue("Allocated", device_id); paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved = int64_t reserved =
memory::DeviceMemoryStatCurrentValue("Reserved", device_id); paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = platform::GpuAvailableMemToAlloc(); int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved) << " MB, reserved=" << ToMegaBytes(reserved)
<< " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB."; << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
...@@ -164,14 +127,13 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -164,14 +127,13 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
cudnnConvolutionFwdAlgo_t algo) { cudnnConvolutionFwdAlgo_t algo) {
size_t workspace_size = 0; size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(args.handle,
args.handle, args.idesc.desc(),
args.idesc.desc(), args.wdesc.desc(),
args.wdesc.desc(), args.cdesc.desc(),
args.cdesc.desc(), args.odesc.desc(),
args.odesc.desc(), algo,
algo, &workspace_size));
&workspace_size));
return workspace_size; return workspace_size;
} }
...@@ -193,7 +155,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -193,7 +155,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
int best_algo_idx = 0; int best_algo_idx = 0;
std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS); std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( phi::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.wdesc.desc(), args.wdesc.desc(),
...@@ -220,7 +182,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -220,7 +182,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
<< result.workspace_size << ") exceeds the limit(" << result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")"; << workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm( phi::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.wdesc.desc(), args.wdesc.desc(),
...@@ -233,7 +195,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -233,7 +195,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
} }
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm( phi::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.wdesc.desc(), args.wdesc.desc(),
...@@ -261,7 +223,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -261,7 +223,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS); std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
auto cudnn_find_func = [&](void* workspace_ptr) { auto cudnn_find_func = [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( phi::dynload::cudnnFindConvolutionForwardAlgorithmEx(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.x->data<T>(), args.x->data<T>(),
...@@ -299,15 +261,14 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -299,15 +261,14 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
size_t max_workspace_size = 0; size_t max_workspace_size = 0;
for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) { for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) {
size_t workspace_size = 0; size_t workspace_size = 0;
auto status = auto status = phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( args.handle,
args.handle, args.idesc.desc(),
args.idesc.desc(), args.wdesc.desc(),
args.wdesc.desc(), args.cdesc.desc(),
args.cdesc.desc(), args.odesc.desc(),
args.odesc.desc(), static_cast<cudnnConvolutionFwdAlgo_t>(algo),
static_cast<cudnnConvolutionFwdAlgo_t>(algo), &workspace_size);
&workspace_size);
if (status == CUDNN_STATUS_SUCCESS && if (status == CUDNN_STATUS_SUCCESS &&
workspace_size <= workspace_size_limit) { workspace_size <= workspace_size_limit) {
max_workspace_size = std::max(workspace_size, max_workspace_size); max_workspace_size = std::max(workspace_size, max_workspace_size);
...@@ -339,7 +300,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> { ...@@ -339,7 +300,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
cudnnConvolutionBwdDataAlgo_t algo) { cudnnConvolutionBwdDataAlgo_t algo) {
size_t workspace_size = 0; size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
args.handle, args.handle,
args.wdesc.desc(), args.wdesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -369,7 +330,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> { ...@@ -369,7 +330,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
int best_algo_idx = 0; int best_algo_idx = 0;
std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS); std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
args.handle, args.handle,
args.wdesc.desc(), args.wdesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -404,7 +365,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> { ...@@ -404,7 +365,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
<< result.workspace_size << ") exceeds the limit(" << result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")"; << workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle, args.handle,
args.wdesc.desc(), args.wdesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -417,7 +378,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> { ...@@ -417,7 +378,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
} }
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle, args.handle,
args.wdesc.desc(), args.wdesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -445,7 +406,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> { ...@@ -445,7 +406,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS); std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
auto cudnn_find_func = [&](void* workspace_ptr) { auto cudnn_find_func = [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx( phi::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
args.handle, args.handle,
args.wdesc.desc(), args.wdesc.desc(),
args.w->data<T>(), args.w->data<T>(),
...@@ -484,7 +445,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> { ...@@ -484,7 +445,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) { for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) {
size_t workspace_size = 0; size_t workspace_size = 0;
auto status = auto status =
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
args.handle, args.handle,
args.wdesc.desc(), args.wdesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -519,10 +480,10 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -519,10 +480,10 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
static size_t GetWorkspaceSize(const ConvArgs& args, static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionBwdFilterAlgo_t algo) { cudnnConvolutionBwdFilterAlgo_t algo) {
platform::CUDAGraphCaptureModeGuard guard; paddle::platform::CUDAGraphCaptureModeGuard guard;
size_t workspace_size = 0; size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -552,7 +513,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -552,7 +513,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
int best_algo_idx = 0; int best_algo_idx = 0;
std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -575,7 +536,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -575,7 +536,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
<< result.workspace_size << ") exceeds the limit(" << result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")"; << workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -588,7 +549,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -588,7 +549,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
} }
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -612,7 +573,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -612,7 +573,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
size_t workspace_size_limit = size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace()); CalcWorkspaceLimitInBytes(UseFixedWorkspace());
auto workspace_handle = ctx.cudnn_workspace_handle(); auto workspace_handle = ctx.cudnn_workspace_handle();
if (platform::CudnnDataType<T>::type != CUDNN_DATA_HALF) { if (paddle::platform::CudnnDataType<T>::type != CUDNN_DATA_HALF) {
size_t max_workspace_size = size_t max_workspace_size =
GetMaxWorkspaceSize(args, workspace_size_limit); GetMaxWorkspaceSize(args, workspace_size_limit);
VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
...@@ -620,7 +581,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -620,7 +581,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
auto cudnn_find_func = [&](void* workspace_ptr) { auto cudnn_find_func = [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx( phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.x->data<T>(), args.x->data<T>(),
...@@ -649,7 +610,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -649,7 +610,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
int max_algos = GetAlgorithmMaxCount(args.handle); int max_algos = GetAlgorithmMaxCount(args.handle);
std::vector<PerfT> perf_results(max_algos); std::vector<PerfT> perf_results(max_algos);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm( phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -676,7 +637,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -676,7 +637,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
#if CUDNN_VERSION_MIN(7, 0, 1) #if CUDNN_VERSION_MIN(7, 0, 1)
int max_algos = 0; int max_algos = 0;
auto status = auto status =
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
handle, &max_algos); handle, &max_algos);
if (status == gpuSuccess) { if (status == gpuSuccess) {
VLOG(5) << "[BackwardFilter] max_algos: predefined=" VLOG(5) << "[BackwardFilter] max_algos: predefined="
...@@ -694,7 +655,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -694,7 +655,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) { for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) {
size_t workspace_size = 0; size_t workspace_size = 0;
auto status = auto status =
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.odesc.desc(), args.odesc.desc(),
...@@ -762,7 +723,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> { ...@@ -762,7 +723,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
bool enable_autotune = true) { bool enable_autotune = true) {
SearchResult<AlgoT> result; SearchResult<AlgoT> result;
bool use_autotune = false; bool use_autotune = false;
auto dtype = platform::CudnnDataType<T>::type; auto dtype = paddle::platform::CudnnDataType<T>::type;
SetConvMathType(ctx, dtype, args.cdesc); SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) { if (deterministic) {
...@@ -819,12 +780,13 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> { ...@@ -819,12 +780,13 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
return result; return result;
} }
static void SetConvMathType(const phi::GPUContext& ctx, static void SetConvMathType(
cudnnDataType_t dtype, const phi::GPUContext& ctx,
const platform::ConvolutionDescriptor& cdesc) { cudnnDataType_t dtype,
const paddle::platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH)); cdesc.desc(), CUDNN_TENSOR_OP_MATH));
VLOG(5) << "Enable Tensor Core for FLOAT16"; VLOG(5) << "Enable Tensor Core for FLOAT16";
#if CUDA_VERSION >= 11000 #if CUDA_VERSION >= 11000
...@@ -832,21 +794,20 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> { ...@@ -832,21 +794,20 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
} else if (ctx.GetComputeCapability() >= 80 && } else if (ctx.GetComputeCapability() >= 80 &&
dtype == CUDNN_DATA_BFLOAT16) { dtype == CUDNN_DATA_BFLOAT16) {
VLOG(5) << "Enable Tensor Core for BFLOAT16"; VLOG(5) << "Enable Tensor Core for BFLOAT16";
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH)); cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif // CUDNN_VERSION_MIN(8, 1, 0) #endif // CUDNN_VERSION_MIN(8, 1, 0)
} else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT"; VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_FMA_MATH)); cdesc.desc(), CUDNN_FMA_MATH));
#endif // CUDA_VERSION >= 11000 #endif // CUDA_VERSION >= 11000
} else { } else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_DEFAULT_MATH)); cdesc.desc(), CUDNN_DEFAULT_MATH));
} }
#endif #endif
} }
}; };
} // namespace operators } // namespace phi
} // namespace paddle
...@@ -20,21 +20,19 @@ limitations under the License. */ ...@@ -20,21 +20,19 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
namespace paddle { namespace phi {
namespace operators {
using Tensor = phi::DenseTensor; using GPUDNNDataLayout = paddle::platform::DataLayout;
using DataLayout = platform::DataLayout;
using framework::AlgorithmsCache;
using framework::ConvSearchCache;
template <typename T> template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType; using ScalingParamType =
typename paddle::platform::CudnnDataType<T>::ScalingParamType;
// As the container of searchAlgorithm::Find() result. // As the container of searchAlgorithm::Find() result.
template <typename AlgoT> template <typename AlgoT>
...@@ -71,9 +69,9 @@ static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) { ...@@ -71,9 +69,9 @@ static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
template <typename HandleT, typename DataT> template <typename HandleT, typename DataT>
struct ConvArgsBase { struct ConvArgsBase {
HandleT handle; HandleT handle;
platform::TensorDescriptor idesc, odesc; paddle::platform::TensorDescriptor idesc, odesc;
platform::FilterDescriptor wdesc; paddle::platform::FilterDescriptor wdesc;
platform::ConvolutionDescriptor cdesc; paddle::platform::ConvolutionDescriptor cdesc;
const phi::DenseTensor *x, *w, *o; const phi::DenseTensor *x, *w, *o;
DataT cudnn_dtype; DataT cudnn_dtype;
...@@ -88,7 +86,7 @@ struct ConvArgsBase { ...@@ -88,7 +86,7 @@ struct ConvArgsBase {
int group; int group;
// data foramt // data foramt
DataLayout data_layout; GPUDNNDataLayout data_layout;
ConvArgsBase(const phi::DenseTensor* x, ConvArgsBase(const phi::DenseTensor* x,
const phi::DenseTensor* w, const phi::DenseTensor* w,
...@@ -98,7 +96,7 @@ struct ConvArgsBase { ...@@ -98,7 +96,7 @@ struct ConvArgsBase {
const std::vector<int> d, const std::vector<int> d,
DataT dtype, DataT dtype,
int g, int g,
DataLayout layout) GPUDNNDataLayout layout)
: x(x), : x(x),
w(w), w(w),
o(o), o(o),
...@@ -131,16 +129,16 @@ struct ConvArgsBase { ...@@ -131,16 +129,16 @@ struct ConvArgsBase {
} }
}; };
static inline void GetNCDHW(const framework::DDim& dims, static inline void GetNCDHW(const phi::DDim& dims,
const DataLayout& layout, const GPUDNNDataLayout& layout,
int* N, int* N,
int* C, int* C,
int* D, int* D,
int* H, int* H,
int* W) { int* W) {
*N = dims[0]; *N = dims[0];
*C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; *C = layout == GPUDNNDataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
int i = layout == DataLayout::kNCHW ? 0 : 1; int i = layout == GPUDNNDataLayout::kNCHW ? 0 : 1;
if (dims.size() == 5) { if (dims.size() == 5) {
*D = dims[2 - i]; *D = dims[2 - i];
*H = dims[3 - i]; *H = dims[3 - i];
...@@ -152,5 +150,38 @@ static inline void GetNCDHW(const framework::DDim& dims, ...@@ -152,5 +150,38 @@ static inline void GetNCDHW(const framework::DDim& dims,
} }
} }
} // namespace operators template <typename DeviceContext, typename T, size_t D>
} // namespace paddle static void RemovePaddingSlice(const phi::GPUContext& context,
const phi::DenseTensor* input,
phi::DenseTensor* out,
const std::vector<int>& starts,
const std::vector<int>& axes) {
auto& place = *context.eigen_device();
auto in_dims = input->dims();
auto new_out_dims = out->dims();
auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = new_out_dims[i];
}
for (size_t i = 0; i < axes.size(); ++i) {
int start = starts[i];
if (start < 0) {
start = (start + in_dims[axes[i]]);
}
start = std::max(start, 0);
offsets[axes[i]] = start;
}
auto in_t =
phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(*input);
auto out_t = phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, new_out_dims);
phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
place, out_t, in_t, offsets, extents);
}
} // namespace phi
...@@ -18,15 +18,14 @@ limitations under the License. */ ...@@ -18,15 +18,14 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
DECLARE_int64(conv_workspace_size_limit); DECLARE_int64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search); DECLARE_bool(cudnn_exhaustive_search);
DECLARE_int64(cudnn_exhaustive_search_times); DECLARE_int64(cudnn_exhaustive_search_times);
namespace paddle { namespace phi {
namespace operators {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 1; static constexpr size_t kNUM_CUDNN_FWD_ALGS = 1;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 1; static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 1;
...@@ -39,5 +38,4 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = ...@@ -39,5 +38,4 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
#endif #endif
} // namespace operators } // namespace phi
} // namespace paddle
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h" #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else #else
#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif #endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h"
...@@ -257,55 +257,53 @@ void ConvCudnnGradGradKernel( ...@@ -257,55 +257,53 @@ void ConvCudnnGradGradKernel(
auto layout = paddle::platform::GetCudnnTensorFormat( auto layout = paddle::platform::GetCudnnTensorFormat(
paddle::platform::DataLayout::kNCHW); paddle::platform::DataLayout::kNCHW);
paddle::operators::ConvArgs args1{&transformed_ddX, ConvArgs args1{&transformed_ddX,
W, W,
&transformed_ddO_channel, &transformed_ddO_channel,
strides, strides,
padding_common, padding_common,
dilations, dilations,
dtype, dtype,
groups, groups,
paddle::platform::DataLayout::kNCHW}; paddle::platform::DataLayout::kNCHW};
paddle::operators::ConvArgs args2{&transformed_X, ConvArgs args2{&transformed_X,
ddW, ddW,
&transformed_ddO_channel, &transformed_ddO_channel,
strides, strides,
padding_common, padding_common,
dilations, dilations,
dtype, dtype,
groups, groups,
paddle::platform::DataLayout::kNCHW}; paddle::platform::DataLayout::kNCHW};
paddle::operators::ConvArgs args3{&transformed_ddX, ConvArgs args3{&transformed_ddX,
dW, dW,
&transformed_dO_channel, &transformed_dO_channel,
strides, strides,
padding_common, padding_common,
dilations, dilations,
dtype, dtype,
groups, groups,
paddle::platform::DataLayout::kNCHW}; paddle::platform::DataLayout::kNCHW};
paddle::operators::ConvArgs args4{&transformed_dX, ConvArgs args4{&transformed_dX,
ddW, ddW,
&transformed_dO_channel, &transformed_dO_channel,
strides, strides,
padding_common, padding_common,
dilations, dilations,
dtype, dtype,
groups, groups,
paddle::platform::DataLayout::kNCHW}; paddle::platform::DataLayout::kNCHW};
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result1; SearchResult<miopenConvFwdAlgorithm_t> fwd_result1;
paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result2; SearchResult<miopenConvFwdAlgorithm_t> fwd_result2;
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> data_result; SearchResult<miopenConvBwdDataAlgorithm_t> data_result;
paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t> SearchResult<miopenConvBwdWeightsAlgorithm_t> filter_result;
filter_result;
#else #else
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result1; SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result1;
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result2; SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result2;
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> data_result; SearchResult<cudnnConvolutionBwdDataAlgo_t> data_result;
paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t> SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
filter_result;
#endif #endif
// ddo = conv(ddI, W) + conv(I, ddW) // ddo = conv(ddI, W) + conv(I, ddW)
...@@ -328,14 +326,12 @@ void ConvCudnnGradGradKernel( ...@@ -328,14 +326,12 @@ void ConvCudnnGradGradKernel(
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search1 = using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1); workspace_size = search1::GetWorkspaceSize(args1);
fwd_result1.algo = search1::Find<T>( fwd_result1.algo = search1::Find<T>(
args1, exhaustive_search, false, workspace_size, ctx); args1, exhaustive_search, false, workspace_size, ctx);
#else #else
using search1 = using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result1 = search1::Find<T>(ctx, args1, exhaustive_search, false); fwd_result1 = search1::Find<T>(ctx, args1, exhaustive_search, false);
workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo);
#endif #endif
...@@ -355,15 +351,13 @@ void ConvCudnnGradGradKernel( ...@@ -355,15 +351,13 @@ void ConvCudnnGradGradKernel(
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search2 = using search2 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2)); std::max(workspace_size, search2::GetWorkspaceSize(args2));
fwd_result2.algo = search2::Find<T>( fwd_result2.algo = search2::Find<T>(
args2, exhaustive_search, false, workspace_size, ctx); args2, exhaustive_search, false, workspace_size, ctx);
#else #else
using search2 = using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result2 = search2::Find<T>(ctx, args2, exhaustive_search, false); fwd_result2 = search2::Find<T>(ctx, args2, exhaustive_search, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo));
...@@ -385,14 +379,12 @@ void ConvCudnnGradGradKernel( ...@@ -385,14 +379,12 @@ void ConvCudnnGradGradKernel(
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search3 = using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_result.algo = search3::Find<T>( filter_result.algo = search3::Find<T>(
args3, exhaustive_search, deterministic, workspace_size, ctx); args3, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
using search3 = using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = filter_result =
search3::Find<T>(ctx, args3, exhaustive_search, deterministic); search3::Find<T>(ctx, args3, exhaustive_search, deterministic);
workspace_size = std::max( workspace_size = std::max(
...@@ -415,14 +407,12 @@ void ConvCudnnGradGradKernel( ...@@ -415,14 +407,12 @@ void ConvCudnnGradGradKernel(
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search4 = using search4 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_result.algo = search4::Find<T>( data_result.algo = search4::Find<T>(
args4, exhaustive_search, deterministic, workspace_size, ctx); args4, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
using search4 = using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_result = data_result =
search4::Find<T>(ctx, args4, exhaustive_search, deterministic); search4::Find<T>(ctx, args4, exhaustive_search, deterministic);
workspace_size = std::max( workspace_size = std::max(
...@@ -447,8 +437,8 @@ void ConvCudnnGradGradKernel( ...@@ -447,8 +437,8 @@ void ConvCudnnGradGradKernel(
int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = W->numel() / groups; int group_offset_filter = W->numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f; ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f; ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supportted in double grad yet. // NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : // ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
...@@ -657,10 +647,10 @@ void ConvCudnnGradGradKernel( ...@@ -657,10 +647,10 @@ void ConvCudnnGradGradKernel(
axes[i] = i; axes[i] = i;
} }
if (X->dims().size() == 4) { if (X->dims().size() == 4) {
paddle::operators::RemovePaddingSlice<Context, T, 4>( RemovePaddingSlice<Context, T, 4>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes); ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
} else { } else {
paddle::operators::RemovePaddingSlice<Context, T, 5>( RemovePaddingSlice<Context, T, 5>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes); ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
} }
} }
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h" #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else #else
#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif #endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h"
...@@ -256,24 +256,24 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -256,24 +256,24 @@ void ConvCudnnGradKernel(const Context& ctx,
? paddle::platform::DataLayout::kNHWC ? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW; : paddle::platform::DataLayout::kNCHW;
paddle::operators::ConvArgs args1{&transformed_input_grad, ConvArgs args1{&transformed_input_grad,
&transformed_filter_channel, &transformed_filter_channel,
&transformed_output_grad_channel, &transformed_output_grad_channel,
strides, strides,
padding_common, padding_common,
dilations, dilations,
dtype, dtype,
groups, groups,
layout}; layout};
paddle::operators::ConvArgs args2{&transformed_input, ConvArgs args2{&transformed_input,
&transformed_filter_grad_channel, &transformed_filter_grad_channel,
&transformed_output_grad_channel, &transformed_output_grad_channel,
strides, strides,
padding_common, padding_common,
dilations, dilations,
dtype, dtype,
groups, groups,
layout}; layout};
auto handle = ctx.cudnn_handle(); auto handle = ctx.cudnn_handle();
// TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
...@@ -289,35 +289,35 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -289,35 +289,35 @@ void ConvCudnnGradKernel(const Context& ctx,
int i_n, i_c, i_d, i_h, i_w; int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w; int o_n, o_c, o_d, o_h, o_w;
if (compute_format == paddle::platform::DataLayout::kNHWC) { if (compute_format == paddle::platform::DataLayout::kNHWC) {
paddle::operators::GetNCDHW(transformed_input.dims(), GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC, paddle::platform::DataLayout::kNHWC,
&i_n, &i_n,
&i_c, &i_c,
&i_d, &i_d,
&i_h, &i_h,
&i_w); &i_w);
paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(), GetNCDHW(transformed_output_grad_channel.dims(),
paddle::platform::DataLayout::kNHWC, paddle::platform::DataLayout::kNHWC,
&o_n, &o_n,
&o_c, &o_c,
&o_d, &o_d,
&o_h, &o_h,
&o_w); &o_w);
} else { } else {
paddle::operators::GetNCDHW(transformed_input.dims(), GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW, paddle::platform::DataLayout::kNCHW,
&i_n, &i_n,
&i_c, &i_c,
&i_d, &i_d,
&i_h, &i_h,
&i_w); &i_w);
paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(), GetNCDHW(transformed_output_grad_channel.dims(),
paddle::platform::DataLayout::kNCHW, paddle::platform::DataLayout::kNCHW,
&o_n, &o_n,
&o_c, &o_c,
&o_d, &o_d,
&o_h, &o_h,
&o_w); &o_w);
} }
int group_offset_in = i_c / groups * i_h * i_w * i_d; int group_offset_in = i_c / groups * i_h * i_w * i_d;
...@@ -326,13 +326,11 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -326,13 +326,11 @@ void ConvCudnnGradKernel(const Context& ctx,
// ------------------- cudnn backward algorithm --------------------- // ------------------- cudnn backward algorithm ---------------------
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result; SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result;
paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t> SearchResult<miopenConvBwdWeightsAlgorithm_t> filter_result;
filter_result;
#else #else
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result; SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t> SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
filter_result;
#endif #endif
// input data workspace_size // input data workspace_size
size_t workspace_size_d = 0; size_t workspace_size_d = 0;
...@@ -364,15 +362,13 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -364,15 +362,13 @@ void ConvCudnnGradKernel(const Context& ctx,
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search1 = using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size_d = workspace_size_d =
std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
bwd_result.algo = search1::Find<T>( bwd_result.algo = search1::Find<T>(
args1, exhaustive_search, deterministic, workspace_size_d, ctx); args1, exhaustive_search, deterministic, workspace_size_d, ctx);
#else #else
using search1 = using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic); bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic);
workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size); workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size);
#endif #endif
...@@ -392,15 +388,13 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -392,15 +388,13 @@ void ConvCudnnGradKernel(const Context& ctx,
paddle::platform::AllowTF32Cudnn(), paddle::platform::AllowTF32Cudnn(),
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search2 = using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size_w = workspace_size_w =
std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
filter_result.algo = search2::Find<T>( filter_result.algo = search2::Find<T>(
args2, exhaustive_search, deterministic, workspace_size_w, ctx); args2, exhaustive_search, deterministic, workspace_size_w, ctx);
#else #else
using search2 = using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = filter_result =
search2::Find<T>(ctx, args2, exhaustive_search, deterministic); search2::Find<T>(ctx, args2, exhaustive_search, deterministic);
VLOG(3) << "filter algo: " << filter_result.algo << ", time " VLOG(3) << "filter algo: " << filter_result.algo << ", time "
...@@ -410,12 +404,12 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -410,12 +404,12 @@ void ConvCudnnGradKernel(const Context& ctx,
} }
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
paddle::operators::ScalingParamType<T> alpha = 1.0f; ScalingParamType<T> alpha = 1.0f;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
// MIOPEN ONLY support beta to be 0.0f // MIOPEN ONLY support beta to be 0.0f
paddle::operators::ScalingParamType<T> beta = 0.0f; ScalingParamType<T> beta = 0.0f;
#else #else
paddle::operators::ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f; ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
#endif #endif
VLOG(4) << "Conv_grad: use_addto = " << use_addto; VLOG(4) << "Conv_grad: use_addto = " << use_addto;
...@@ -515,19 +509,17 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -515,19 +509,17 @@ void ConvCudnnGradKernel(const Context& ctx,
ctx.template Alloc<T>(&transformed_input_grad_channel); ctx.template Alloc<T>(&transformed_input_grad_channel);
if (transformed_input_channel.dims().size() == 4) { if (transformed_input_channel.dims().size() == 4) {
paddle::operators::RemovePaddingSlice<Context, T, 4>( RemovePaddingSlice<Context, T, 4>(ctx,
ctx, &transformed_input_grad,
&transformed_input_grad, &transformed_input_grad_channel,
&transformed_input_grad_channel, starts,
starts, axes);
axes);
} else { } else {
paddle::operators::RemovePaddingSlice<Context, T, 5>( RemovePaddingSlice<Context, T, 5>(ctx,
ctx, &transformed_input_grad,
&transformed_input_grad, &transformed_input_grad_channel,
&transformed_input_grad_channel, starts,
starts, axes);
axes);
} }
} }
...@@ -538,7 +530,7 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -538,7 +530,7 @@ void ConvCudnnGradKernel(const Context& ctx,
} }
// filter_grad do not use inplace addto. // filter_grad do not use inplace addto.
paddle::operators::ScalingParamType<T> beta_filter = 0.0f; ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter --------------------- // ------------------- cudnn conv backward filter ---------------------
if (filter_grad) { if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad. // Because beta is zero, it is unnecessary to reset filter_grad.
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h" #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else #else
#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif #endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h"
...@@ -205,15 +205,15 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -205,15 +205,15 @@ void ConvCudnnKernel(const Context& ctx,
const T* filter_data = transformed_filter_channel.data<T>(); const T* filter_data = transformed_filter_channel.data<T>();
// ------------------- cudnn descriptors --------------------- // ------------------- cudnn descriptors ---------------------
paddle::operators::ConvArgs args{&transformed_input, ConvArgs args{&transformed_input,
&transformed_filter_channel, &transformed_filter_channel,
&transformed_output, &transformed_output,
strides, strides,
padding_common, padding_common,
dilations, dilations,
dtype, dtype,
groups, groups,
compute_format}; compute_format};
auto handle = ctx.cudnn_handle(); auto handle = ctx.cudnn_handle();
auto workspace_handle = ctx.cudnn_workspace_handle(); auto workspace_handle = ctx.cudnn_workspace_handle();
...@@ -266,35 +266,35 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -266,35 +266,35 @@ void ConvCudnnKernel(const Context& ctx,
int o_n, o_c, o_d, o_h, o_w; int o_n, o_c, o_d, o_h, o_w;
if (compute_format == paddle::platform::DataLayout::kNHWC) { if (compute_format == paddle::platform::DataLayout::kNHWC) {
paddle::operators::GetNCDHW(transformed_input.dims(), GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC, paddle::platform::DataLayout::kNHWC,
&i_n, &i_n,
&i_c, &i_c,
&i_d, &i_d,
&i_h, &i_h,
&i_w); &i_w);
paddle::operators::GetNCDHW(transformed_output.dims(), GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNHWC, paddle::platform::DataLayout::kNHWC,
&o_n, &o_n,
&o_c, &o_c,
&o_d, &o_d,
&o_h, &o_h,
&o_w); &o_w);
} else { } else {
paddle::operators::GetNCDHW(transformed_input.dims(), GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW, paddle::platform::DataLayout::kNCHW,
&i_n, &i_n,
&i_c, &i_c,
&i_d, &i_d,
&i_h, &i_h,
&i_w); &i_w);
paddle::operators::GetNCDHW(transformed_output.dims(), GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNCHW, paddle::platform::DataLayout::kNCHW,
&o_n, &o_n,
&o_c, &o_c,
&o_d, &o_d,
&o_h, &o_h,
&o_w); &o_w);
} }
int group_offset_in = i_c / groups * i_h * i_w * i_d; int group_offset_in = i_c / groups * i_h * i_w * i_d;
...@@ -304,15 +304,14 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -304,15 +304,14 @@ void ConvCudnnKernel(const Context& ctx,
size_t workspace_size = 0; // final workspace to allocate. size_t workspace_size = 0; // final workspace to allocate.
// ------------------- cudnn conv algorithm --------------------- // ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result; SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
using search = paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>; using search = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search::GetWorkspaceSize(args); workspace_size = search::GetWorkspaceSize(args);
fwd_result.algo = search::Find<T>( fwd_result.algo = search::Find<T>(
args, exhaustive_search, deterministic, workspace_size, ctx); args, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result; SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
using search = using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search::Find<T>(ctx, args, exhaustive_search, deterministic); fwd_result = search::Find<T>(ctx, args, exhaustive_search, deterministic);
workspace_size = fwd_result.workspace_size; workspace_size = fwd_result.workspace_size;
#endif #endif
...@@ -328,8 +327,8 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -328,8 +327,8 @@ void ConvCudnnKernel(const Context& ctx,
#endif #endif
// ------------------- cudnn conv forward --------------------- // ------------------- cudnn conv forward ---------------------
paddle::operators::ScalingParamType<T> alpha = 1.0f; ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f; ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supportted in double grad yet. // NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f; // ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
......
...@@ -14,48 +14,12 @@ limitations under the License. */ ...@@ -14,48 +14,12 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
#include "paddle/fluid/operators/conv_base_helper.h"
namespace paddle { namespace phi {
namespace operators {
using ConvArgs = ConvArgsBase<miopenHandle_t, miopenDataType_t>; using ConvArgs = ConvArgsBase<miopenHandle_t, miopenDataType_t>;
template <typename DeviceContext, typename T, size_t D>
static void RemovePaddingSlice(const phi::GPUContext& context,
const phi::DenseTensor* input,
phi::DenseTensor* out,
const std::vector<int>& starts,
const std::vector<int>& axes) {
auto& place = *context.eigen_device();
auto in_dims = input->dims();
auto new_out_dims = out->dims();
auto offsets = Eigen::array<int, D>();
auto extents = Eigen::array<int, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = new_out_dims[i];
}
for (size_t i = 0; i < axes.size(); ++i) {
int start = starts[i];
if (start < 0) {
start = (start + in_dims[axes[i]]);
}
start = std::max(start, 0);
offsets[axes[i]] = start;
}
auto in_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*input);
auto out_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, new_out_dims);
out_t.device(place) = in_t.slice(offsets, extents);
}
template <typename PerfT> template <typename PerfT>
struct SearchAlgorithm {}; struct SearchAlgorithm {};
...@@ -78,7 +42,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> { ...@@ -78,7 +42,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
miopenConvAlgoPerf_t find_result; miopenConvAlgoPerf_t find_result;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenFindConvolutionForwardAlgorithm( phi::dynload::miopenFindConvolutionForwardAlgorithm(
args.handle, args.handle,
args.idesc.desc(), args.idesc.desc(),
args.x->data<T>(), args.x->data<T>(),
...@@ -104,7 +68,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> { ...@@ -104,7 +68,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
static size_t GetWorkspaceSize(const ConvArgs& args) { static size_t GetWorkspaceSize(const ConvArgs& args) {
size_t workspace_size = 0; size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( phi::dynload::miopenConvolutionForwardGetWorkSpaceSize(
args.handle, args.handle,
args.wdesc.desc(), args.wdesc.desc(),
args.idesc.desc(), args.idesc.desc(),
...@@ -134,7 +98,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> { ...@@ -134,7 +98,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
miopenConvAlgoPerf_t find_result; miopenConvAlgoPerf_t find_result;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( phi::dynload::miopenFindConvolutionBackwardDataAlgorithm(
args.handle, args.handle,
args.odesc.desc(), args.odesc.desc(),
args.o->data<T>(), args.o->data<T>(),
...@@ -160,7 +124,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> { ...@@ -160,7 +124,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
static size_t GetWorkspaceSize(const ConvArgs& args) { static size_t GetWorkspaceSize(const ConvArgs& args) {
size_t workspace_size = 0; size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( phi::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize(
args.handle, args.handle,
args.odesc.desc(), args.odesc.desc(),
args.wdesc.desc(), args.wdesc.desc(),
...@@ -190,7 +154,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> { ...@@ -190,7 +154,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
miopenConvAlgoPerf_t find_result; miopenConvAlgoPerf_t find_result;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( phi::dynload::miopenFindConvolutionBackwardWeightsAlgorithm(
args.handle, args.handle,
args.odesc.desc(), args.odesc.desc(),
args.o->data<T>(), args.o->data<T>(),
...@@ -216,7 +180,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> { ...@@ -216,7 +180,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
static size_t GetWorkspaceSize(const ConvArgs& args) { static size_t GetWorkspaceSize(const ConvArgs& args) {
size_t workspace_size = 0; size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( phi::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize(
args.handle, args.handle,
args.odesc.desc(), args.odesc.desc(),
args.idesc.desc(), args.idesc.desc(),
...@@ -227,5 +191,4 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> { ...@@ -227,5 +191,4 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
} }
}; };
} // namespace operators } // namespace phi
} // namespace paddle
...@@ -28,11 +28,11 @@ limitations under the License. */ ...@@ -28,11 +28,11 @@ limitations under the License. */
#include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h"
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" #include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else #else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif #endif
namespace phi { namespace phi {
...@@ -173,33 +173,31 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -173,33 +173,31 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
auto dtype = paddle::platform::CudnnDataType<T>::type; auto dtype = paddle::platform::CudnnDataType<T>::type;
paddle::operators::ConvArgs args1{&transformed_dout, ConvArgs args1{&transformed_dout,
&filter, &filter,
&x_transpose, &x_transpose,
strides, strides,
padding_common, padding_common,
dilations_, dilations_,
dtype, dtype,
groups, groups,
layout}; layout};
paddle::operators::ConvArgs args2{&transformed_dout, ConvArgs args2{&transformed_dout,
&filter, &filter,
&x_transpose, &x_transpose,
strides, strides,
padding_common, padding_common,
dilations_, dilations_,
dtype, dtype,
groups, groups,
layout}; layout};
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result; SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t> SearchResult<miopenConvBwdWeightsAlgorithm_t> filter_result;
filter_result;
#else #else
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result; SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t> SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
filter_result;
#endif #endif
auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
...@@ -222,14 +220,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -222,14 +220,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
paddle::platform::AllowTF32Cudnn(), paddle::platform::AllowTF32Cudnn(),
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search1 = using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1));
fwd_result.algo = fwd_result.algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx); search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else #else
using search1 = using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search1::Find<T>(ctx, args1, false, deterministic, false); fwd_result = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo));
...@@ -249,14 +245,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -249,14 +245,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
paddle::platform::AllowTF32Cudnn(), paddle::platform::AllowTF32Cudnn(),
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search2 = using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
filter_result.algo = filter_result.algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx); search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else #else
using search2 = using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = search2::Find<T>(ctx, args2, false, deterministic, false); filter_result = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo));
...@@ -269,8 +263,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -269,8 +263,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
int dout_offset = int dout_offset =
transformed_dout.numel() / transformed_dout.dims()[0] / groups; transformed_dout.numel() / transformed_dout.dims()[0] / groups;
int filter_offset = filter.numel() / groups; int filter_offset = filter.numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f; ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f; ScalingParamType<T> beta = 0.0f;
auto workspace_handle = ctx.cudnn_workspace_handle(); auto workspace_handle = ctx.cudnn_workspace_handle();
if (dx) { if (dx) {
// Because beta is zero, it is unnecessary to reset dx. // Because beta is zero, it is unnecessary to reset dx.
...@@ -631,55 +625,53 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -631,55 +625,53 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
auto handle = ctx.cudnn_handle(); auto handle = ctx.cudnn_handle();
auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);
paddle::operators::ConvArgs args1{&transformed_ddout_channel, ConvArgs args1{&transformed_ddout_channel,
&filter, &filter,
&transformed_ddx, &transformed_ddx,
strides, strides,
padding_common, padding_common,
dilations_, dilations_,
dtype, dtype,
groups, groups,
GPUDNNDataLayout::kNCHW}; GPUDNNDataLayout::kNCHW};
paddle::operators::ConvArgs args2{&transformed_ddout_channel, ConvArgs args2{&transformed_ddout_channel,
&ddfilter, &ddfilter,
&transformed_x, &transformed_x,
strides, strides,
padding_common, padding_common,
dilations_, dilations_,
dtype, dtype,
groups, groups,
GPUDNNDataLayout::kNCHW}; GPUDNNDataLayout::kNCHW};
paddle::operators::ConvArgs args3{&transformed_dout, ConvArgs args3{&transformed_dout,
dfilter, dfilter,
&transformed_ddx_channel, &transformed_ddx_channel,
strides, strides,
padding_common, padding_common,
dilations_, dilations_,
dtype, dtype,
groups, groups,
GPUDNNDataLayout::kNCHW}; GPUDNNDataLayout::kNCHW};
paddle::operators::ConvArgs args4{&transformed_dout, ConvArgs args4{&transformed_dout,
&ddfilter, &ddfilter,
&transformed_dx_channel, &transformed_dx_channel,
strides, strides,
padding_common, padding_common,
dilations_, dilations_,
dtype, dtype,
groups, groups,
GPUDNNDataLayout::kNCHW}; GPUDNNDataLayout::kNCHW};
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result1; SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result1;
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result2; SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result2;
paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t> SearchResult<miopenConvBwdWeightsAlgorithm_t> filter_result;
filter_result; SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
#else #else
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result1; SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result1;
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result2; SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result2;
paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t> SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
filter_result; SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
#endif #endif
// ddo = conv(ddI, filter) + conv(I, ddfilter) // ddo = conv(ddI, filter) + conv(I, ddfilter)
...@@ -702,14 +694,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -702,14 +694,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle::platform::AllowTF32Cudnn(), paddle::platform::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search1 = using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1); workspace_size = search1::GetWorkspaceSize(args1);
bwd_result1.algo = bwd_result1.algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx); search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else #else
using search1 = using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result1 = search1::Find<T>(ctx, args1, false, deterministic, false); bwd_result1 = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo);
#endif #endif
...@@ -726,14 +716,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -726,14 +716,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle::platform::AllowTF32Cudnn(), paddle::platform::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search2 = using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
bwd_result2.algo = bwd_result2.algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx); search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else #else
using search2 = using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result2 = search2::Find<T>(ctx, args2, false, deterministic, false); bwd_result2 = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo));
...@@ -753,14 +741,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -753,14 +741,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle::platform::AllowTF32Cudnn(), paddle::platform::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search3 = using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_result.algo = filter_result.algo =
search3::Find<T>(args3, false, deterministic, workspace_size, ctx); search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
#else #else
using search3 = using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = search3::Find<T>(ctx, args3, false, deterministic, false); filter_result = search3::Find<T>(ctx, args3, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
...@@ -781,14 +767,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -781,14 +767,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle::platform::AllowTF32Cudnn(), paddle::platform::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search4 = using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
fwd_result.algo = fwd_result.algo =
search4::Find<T>(args4, false, deterministic, workspace_size, ctx); search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
#else #else
using search4 = using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search4::Find<T>(ctx, args4, false, deterministic, false); fwd_result = search4::Find<T>(ctx, args4, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo));
...@@ -796,22 +780,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -796,22 +780,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
} }
int i_n, i_c, i_d, i_h, i_w; int i_n, i_c, i_d, i_h, i_w;
paddle::operators::GetNCDHW(transformed_x.dims(), GetNCDHW(transformed_x.dims(),
GPUDNNDataLayout::kNCHW, GPUDNNDataLayout::kNCHW,
&i_n, &i_n,
&i_c, &i_c,
&i_d, &i_d,
&i_h, &i_h,
&i_w); &i_w);
int o_n, o_c, o_d, o_h, o_w; int o_n, o_c, o_d, o_h, o_w;
paddle::operators::GetNCDHW(transformed_dout.dims(), GetNCDHW(transformed_dout.dims(),
GPUDNNDataLayout::kNCHW, GPUDNNDataLayout::kNCHW,
&o_n, &o_n,
&o_c, &o_c,
&o_d, &o_d,
&o_h, &o_h,
&o_w); &o_w);
int group_offset_in = int group_offset_in =
transformed_x.numel() / transformed_x.dims()[0] / groups; transformed_x.numel() / transformed_x.dims()[0] / groups;
...@@ -819,8 +803,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -819,8 +803,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
transformed_dout.numel() / transformed_dout.dims()[0] / groups; transformed_dout.numel() / transformed_dout.dims()[0] / groups;
int group_offset_filter = filter.numel() / groups; int group_offset_filter = filter.numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f; ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f; ScalingParamType<T> beta = 0.0f;
auto wkspace_handle = ctx.cudnn_workspace_handle(); auto wkspace_handle = ctx.cudnn_workspace_handle();
......
...@@ -26,11 +26,11 @@ limitations under the License. */ ...@@ -26,11 +26,11 @@ limitations under the License. */
#include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h"
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" #include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else #else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif #endif
namespace phi { namespace phi {
...@@ -199,15 +199,15 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -199,15 +199,15 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
auto dtype = paddle::platform::CudnnDataType<T>::type; auto dtype = paddle::platform::CudnnDataType<T>::type;
// ------------------- cudnn descriptors --------------------- // ------------------- cudnn descriptors ---------------------
paddle::operators::ConvArgs args{&transformed_out, ConvArgs args{&transformed_out,
&filter, &filter,
&transformed_x, &transformed_x,
strides, strides,
padding_common, padding_common,
dilations_, dilations_,
dtype, dtype,
groups, groups,
data_layout}; data_layout};
args.handle = handle; args.handle = handle;
args.idesc.set(transformed_out, iwo_groups); args.idesc.set(transformed_out, iwo_groups);
args.wdesc.set(filter, layout_tensor, iwo_groups); args.wdesc.set(filter, layout_tensor, iwo_groups);
...@@ -220,16 +220,14 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -220,16 +220,14 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result; SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result;
using search = using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
bwd_result.algo = bwd_result.algo =
search::Find<T>(args, false, deterministic, workspace_size, ctx); search::Find<T>(args, false, deterministic, workspace_size, ctx);
#else #else
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result; SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
using search = using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result = search::Find<T>(ctx, args, false, deterministic, false); bwd_result = search::Find<T>(ctx, args, false, deterministic, false);
workspace_size = workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo));
...@@ -239,8 +237,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -239,8 +237,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
int x_offset = transformed_x.numel() / transformed_x.dims()[0] / groups; int x_offset = transformed_x.numel() / transformed_x.dims()[0] / groups;
int out_offset = transformed_out.numel() / transformed_out.dims()[0] / groups; int out_offset = transformed_out.numel() / transformed_out.dims()[0] / groups;
int filter_offset = filter.numel() / groups; int filter_offset = filter.numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f; ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f; ScalingParamType<T> beta = 0.0f;
auto workspace_handle = ctx.cudnn_workspace_handle(); auto workspace_handle = ctx.cudnn_workspace_handle();
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h" #include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else #else
#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif #endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册