未验证 提交 31f57f29 编写于 作者: Y Yiqun Liu 提交者: GitHub

Move the header file of conv cudnn and miopen to phi directory. (#47248)

上级 a5f556f0
......@@ -26,15 +26,14 @@
#endif
#include <cudnn.h>
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
#endif
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
......
......@@ -26,7 +26,6 @@
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
#ifdef PADDLE_WITH_HIP
......@@ -34,7 +33,6 @@
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
......
......@@ -16,10 +16,10 @@ limitations under the License. */
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
DECLARE_int64(cudnn_exhaustive_search_times);
......@@ -216,7 +216,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc,
cudnn_output_desc,
output_data,
kNUM_CUDNN_FWD_ALGS,
phi::kNUM_CUDNN_FWD_ALGS,
&find_count,
&find_result,
cudnn_workspace_ptr,
......@@ -337,7 +337,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
int best_algo_idx = 0;
size_t tmp_size = 0;
std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]);
new cudnnConvolutionFwdAlgoPerf_t[phi::kNUM_CUDNN_FWD_ALGS]);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
handle,
......@@ -345,7 +345,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_filter_desc,
cudnn_conv_desc,
cudnn_output_desc,
kNUM_CUDNN_FWD_ALGS,
phi::kNUM_CUDNN_FWD_ALGS,
&perf_count,
perf_results.get()));
algo = (perf_results.get())[best_algo_idx].algo;
......@@ -378,7 +378,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
[&]() -> SearchFuseResult<cudnnConvolutionFwdAlgo_t> {
int returned_algo_count;
SearchFuseResult<cudnnConvolutionFwdAlgo_t> fwd_result;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
std::array<cudnnConvolutionFwdAlgoPerf_t, phi::kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(
......@@ -391,7 +391,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc,
cudnn_output_desc,
output_data,
kNUM_CUDNN_FWD_ALGS,
phi::kNUM_CUDNN_FWD_ALGS,
&returned_algo_count,
fwd_perf_stat.data(),
cudnn_workspace,
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
namespace paddle {
namespace operators {
......@@ -206,7 +206,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
int best_algo_idx = 0;
size_t tmp_size = 0;
std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]);
new cudnnConvolutionFwdAlgoPerf_t[phi::kNUM_CUDNN_FWD_ALGS]);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
handle,
......@@ -214,7 +214,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
filter_desc[i],
conv_desc[i],
out_desc[i],
kNUM_CUDNN_FWD_ALGS,
phi::kNUM_CUDNN_FWD_ALGS,
&perf_count,
perf_results.get()));
algo[i] = (perf_results.get())[best_algo_idx].algo;
......
......@@ -14,52 +14,15 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/conv_base_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
namespace paddle {
namespace operators {
namespace phi {
using ConvArgs = ConvArgsBase<cudnnHandle_t, cudnnDataType_t>;
template <typename DeviceContext, typename T, size_t D>
static void RemovePaddingSlice(const phi::GPUContext& context,
const phi::DenseTensor* input,
phi::DenseTensor* out,
const std::vector<int>& starts,
const std::vector<int>& axes) {
auto& place = *context.eigen_device();
auto in_dims = input->dims();
auto new_out_dims = out->dims();
auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = new_out_dims[i];
}
for (size_t i = 0; i < axes.size(); ++i) {
int start = starts[i];
if (start < 0) {
start = (start + in_dims[axes[i]]);
}
start = std::max(start, 0);
offsets[axes[i]] = start;
}
auto in_t =
phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(*input);
auto out_t = phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, new_out_dims);
phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
place, out_t, in_t, offsets, extents);
}
static inline double ToMegaBytes(size_t bytes) {
return static_cast<double>(bytes) / (1 << 20);
}
......@@ -70,12 +33,12 @@ static inline bool UseFixedWorkspace() {
static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
if (!use_fixed_workspace) {
int device_id = platform::GetCurrentDeviceId();
int device_id = phi::backends::gpu::GetCurrentDeviceId();
int64_t allocated =
memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved =
memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = platform::GpuAvailableMemToAlloc();
paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved)
<< " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
......@@ -164,14 +127,13 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
cudnnConvolutionFwdAlgo_t algo) {
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
args.handle,
args.idesc.desc(),
args.wdesc.desc(),
args.cdesc.desc(),
args.odesc.desc(),
algo,
&workspace_size));
phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(args.handle,
args.idesc.desc(),
args.wdesc.desc(),
args.cdesc.desc(),
args.odesc.desc(),
algo,
&workspace_size));
return workspace_size;
}
......@@ -193,7 +155,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
int best_algo_idx = 0;
std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
phi::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
args.handle,
args.idesc.desc(),
args.wdesc.desc(),
......@@ -220,7 +182,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
<< result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm(
phi::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle,
args.idesc.desc(),
args.wdesc.desc(),
......@@ -233,7 +195,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
}
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm(
phi::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle,
args.idesc.desc(),
args.wdesc.desc(),
......@@ -261,7 +223,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
auto cudnn_find_func = [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
phi::dynload::cudnnFindConvolutionForwardAlgorithmEx(
args.handle,
args.idesc.desc(),
args.x->data<T>(),
......@@ -299,15 +261,14 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
size_t max_workspace_size = 0;
for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) {
size_t workspace_size = 0;
auto status =
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
args.handle,
args.idesc.desc(),
args.wdesc.desc(),
args.cdesc.desc(),
args.odesc.desc(),
static_cast<cudnnConvolutionFwdAlgo_t>(algo),
&workspace_size);
auto status = phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(
args.handle,
args.idesc.desc(),
args.wdesc.desc(),
args.cdesc.desc(),
args.odesc.desc(),
static_cast<cudnnConvolutionFwdAlgo_t>(algo),
&workspace_size);
if (status == CUDNN_STATUS_SUCCESS &&
workspace_size <= workspace_size_limit) {
max_workspace_size = std::max(workspace_size, max_workspace_size);
......@@ -339,7 +300,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
cudnnConvolutionBwdDataAlgo_t algo) {
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
args.handle,
args.wdesc.desc(),
args.odesc.desc(),
......@@ -369,7 +330,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
int best_algo_idx = 0;
std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
args.handle,
args.wdesc.desc(),
args.odesc.desc(),
......@@ -404,7 +365,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
<< result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle,
args.wdesc.desc(),
args.odesc.desc(),
......@@ -417,7 +378,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
}
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle,
args.wdesc.desc(),
args.odesc.desc(),
......@@ -445,7 +406,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
auto cudnn_find_func = [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
phi::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
args.handle,
args.wdesc.desc(),
args.w->data<T>(),
......@@ -484,7 +445,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) {
size_t workspace_size = 0;
auto status =
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
args.handle,
args.wdesc.desc(),
args.odesc.desc(),
......@@ -519,10 +480,10 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionBwdFilterAlgo_t algo) {
platform::CUDAGraphCaptureModeGuard guard;
paddle::platform::CUDAGraphCaptureModeGuard guard;
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
args.handle,
args.idesc.desc(),
args.odesc.desc(),
......@@ -552,7 +513,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
int best_algo_idx = 0;
std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
args.handle,
args.idesc.desc(),
args.odesc.desc(),
......@@ -575,7 +536,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
<< result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle,
args.idesc.desc(),
args.odesc.desc(),
......@@ -588,7 +549,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
}
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle,
args.idesc.desc(),
args.odesc.desc(),
......@@ -612,7 +573,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace());
auto workspace_handle = ctx.cudnn_workspace_handle();
if (platform::CudnnDataType<T>::type != CUDNN_DATA_HALF) {
if (paddle::platform::CudnnDataType<T>::type != CUDNN_DATA_HALF) {
size_t max_workspace_size =
GetMaxWorkspaceSize(args, workspace_size_limit);
VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
......@@ -620,7 +581,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
auto cudnn_find_func = [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
args.handle,
args.idesc.desc(),
args.x->data<T>(),
......@@ -649,7 +610,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
int max_algos = GetAlgorithmMaxCount(args.handle);
std::vector<PerfT> perf_results(max_algos);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
args.handle,
args.idesc.desc(),
args.odesc.desc(),
......@@ -676,7 +637,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
#if CUDNN_VERSION_MIN(7, 0, 1)
int max_algos = 0;
auto status =
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
handle, &max_algos);
if (status == gpuSuccess) {
VLOG(5) << "[BackwardFilter] max_algos: predefined="
......@@ -694,7 +655,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) {
size_t workspace_size = 0;
auto status =
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
args.handle,
args.idesc.desc(),
args.odesc.desc(),
......@@ -762,7 +723,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
bool enable_autotune = true) {
SearchResult<AlgoT> result;
bool use_autotune = false;
auto dtype = platform::CudnnDataType<T>::type;
auto dtype = paddle::platform::CudnnDataType<T>::type;
SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) {
......@@ -819,12 +780,13 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
return result;
}
static void SetConvMathType(const phi::GPUContext& ctx,
cudnnDataType_t dtype,
const platform::ConvolutionDescriptor& cdesc) {
static void SetConvMathType(
const phi::GPUContext& ctx,
cudnnDataType_t dtype,
const paddle::platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
VLOG(5) << "Enable Tensor Core for FLOAT16";
#if CUDA_VERSION >= 11000
......@@ -832,21 +794,20 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
} else if (ctx.GetComputeCapability() >= 80 &&
dtype == CUDNN_DATA_BFLOAT16) {
VLOG(5) << "Enable Tensor Core for BFLOAT16";
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif // CUDNN_VERSION_MIN(8, 1, 0)
} else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_FMA_MATH));
#endif // CUDA_VERSION >= 11000
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_DEFAULT_MATH));
}
#endif
}
};
} // namespace operators
} // namespace paddle
} // namespace phi
......@@ -20,21 +20,19 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
namespace paddle {
namespace operators {
namespace phi {
using Tensor = phi::DenseTensor;
using DataLayout = platform::DataLayout;
using framework::AlgorithmsCache;
using framework::ConvSearchCache;
using GPUDNNDataLayout = paddle::platform::DataLayout;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
using ScalingParamType =
typename paddle::platform::CudnnDataType<T>::ScalingParamType;
// As the container of searchAlgorithm::Find() result.
template <typename AlgoT>
......@@ -71,9 +69,9 @@ static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
template <typename HandleT, typename DataT>
struct ConvArgsBase {
HandleT handle;
platform::TensorDescriptor idesc, odesc;
platform::FilterDescriptor wdesc;
platform::ConvolutionDescriptor cdesc;
paddle::platform::TensorDescriptor idesc, odesc;
paddle::platform::FilterDescriptor wdesc;
paddle::platform::ConvolutionDescriptor cdesc;
const phi::DenseTensor *x, *w, *o;
DataT cudnn_dtype;
......@@ -88,7 +86,7 @@ struct ConvArgsBase {
int group;
// data foramt
DataLayout data_layout;
GPUDNNDataLayout data_layout;
ConvArgsBase(const phi::DenseTensor* x,
const phi::DenseTensor* w,
......@@ -98,7 +96,7 @@ struct ConvArgsBase {
const std::vector<int> d,
DataT dtype,
int g,
DataLayout layout)
GPUDNNDataLayout layout)
: x(x),
w(w),
o(o),
......@@ -131,16 +129,16 @@ struct ConvArgsBase {
}
};
static inline void GetNCDHW(const framework::DDim& dims,
const DataLayout& layout,
static inline void GetNCDHW(const phi::DDim& dims,
const GPUDNNDataLayout& layout,
int* N,
int* C,
int* D,
int* H,
int* W) {
*N = dims[0];
*C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
int i = layout == DataLayout::kNCHW ? 0 : 1;
*C = layout == GPUDNNDataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
int i = layout == GPUDNNDataLayout::kNCHW ? 0 : 1;
if (dims.size() == 5) {
*D = dims[2 - i];
*H = dims[3 - i];
......@@ -152,5 +150,38 @@ static inline void GetNCDHW(const framework::DDim& dims,
}
}
} // namespace operators
} // namespace paddle
template <typename DeviceContext, typename T, size_t D>
static void RemovePaddingSlice(const phi::GPUContext& context,
const phi::DenseTensor* input,
phi::DenseTensor* out,
const std::vector<int>& starts,
const std::vector<int>& axes) {
auto& place = *context.eigen_device();
auto in_dims = input->dims();
auto new_out_dims = out->dims();
auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = new_out_dims[i];
}
for (size_t i = 0; i < axes.size(); ++i) {
int start = starts[i];
if (start < 0) {
start = (start + in_dims[axes[i]]);
}
start = std::max(start, 0);
offsets[axes[i]] = start;
}
auto in_t =
phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(*input);
auto out_t = phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, new_out_dims);
phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
place, out_t, in_t, offsets, extents);
}
} // namespace phi
......@@ -18,15 +18,14 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
DECLARE_int64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
DECLARE_int64(cudnn_exhaustive_search_times);
namespace paddle {
namespace operators {
namespace phi {
#ifdef PADDLE_WITH_HIP
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 1;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 1;
......@@ -39,5 +38,4 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
#endif
} // namespace operators
} // namespace paddle
} // namespace phi
......@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
......@@ -257,55 +257,53 @@ void ConvCudnnGradGradKernel(
auto layout = paddle::platform::GetCudnnTensorFormat(
paddle::platform::DataLayout::kNCHW);
paddle::operators::ConvArgs args1{&transformed_ddX,
W,
&transformed_ddO_channel,
strides,
padding_common,
dilations,
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
paddle::operators::ConvArgs args2{&transformed_X,
ddW,
&transformed_ddO_channel,
strides,
padding_common,
dilations,
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
paddle::operators::ConvArgs args3{&transformed_ddX,
dW,
&transformed_dO_channel,
strides,
padding_common,
dilations,
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
paddle::operators::ConvArgs args4{&transformed_dX,
ddW,
&transformed_dO_channel,
strides,
padding_common,
dilations,
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
ConvArgs args1{&transformed_ddX,
W,
&transformed_ddO_channel,
strides,
padding_common,
dilations,
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
ConvArgs args2{&transformed_X,
ddW,
&transformed_ddO_channel,
strides,
padding_common,
dilations,
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
ConvArgs args3{&transformed_ddX,
dW,
&transformed_dO_channel,
strides,
padding_common,
dilations,
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
ConvArgs args4{&transformed_dX,
ddW,
&transformed_dO_channel,
strides,
padding_common,
dilations,
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
#ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result1;
paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result2;
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> data_result;
paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t>
filter_result;
SearchResult<miopenConvFwdAlgorithm_t> fwd_result1;
SearchResult<miopenConvFwdAlgorithm_t> fwd_result2;
SearchResult<miopenConvBwdDataAlgorithm_t> data_result;
SearchResult<miopenConvBwdWeightsAlgorithm_t> filter_result;
#else
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result1;
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result2;
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> data_result;
paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t>
filter_result;
SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result1;
SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result2;
SearchResult<cudnnConvolutionBwdDataAlgo_t> data_result;
SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
#endif
// ddo = conv(ddI, W) + conv(I, ddW)
......@@ -328,14 +326,12 @@ void ConvCudnnGradGradKernel(
c_group);
#ifdef PADDLE_WITH_HIP
using search1 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1);
fwd_result1.algo = search1::Find<T>(
args1, exhaustive_search, false, workspace_size, ctx);
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result1 = search1::Find<T>(ctx, args1, exhaustive_search, false);
workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo);
#endif
......@@ -355,15 +351,13 @@ void ConvCudnnGradGradKernel(
c_group);
#ifdef PADDLE_WITH_HIP
using search2 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
using search2 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2));
fwd_result2.algo = search2::Find<T>(
args2, exhaustive_search, false, workspace_size, ctx);
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result2 = search2::Find<T>(ctx, args2, exhaustive_search, false);
workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo));
......@@ -385,14 +379,12 @@ void ConvCudnnGradGradKernel(
c_group);
#ifdef PADDLE_WITH_HIP
using search3 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_result.algo = search3::Find<T>(
args3, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result =
search3::Find<T>(ctx, args3, exhaustive_search, deterministic);
workspace_size = std::max(
......@@ -415,14 +407,12 @@ void ConvCudnnGradGradKernel(
c_group);
#ifdef PADDLE_WITH_HIP
using search4 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
using search4 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_result.algo = search4::Find<T>(
args4, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_result =
search4::Find<T>(ctx, args4, exhaustive_search, deterministic);
workspace_size = std::max(
......@@ -447,8 +437,8 @@ void ConvCudnnGradGradKernel(
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = W->numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
......@@ -657,10 +647,10 @@ void ConvCudnnGradGradKernel(
axes[i] = i;
}
if (X->dims().size() == 4) {
paddle::operators::RemovePaddingSlice<Context, T, 4>(
RemovePaddingSlice<Context, T, 4>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
} else {
paddle::operators::RemovePaddingSlice<Context, T, 5>(
RemovePaddingSlice<Context, T, 5>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
}
}
......
......@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
......@@ -256,24 +256,24 @@ void ConvCudnnGradKernel(const Context& ctx,
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
paddle::operators::ConvArgs args1{&transformed_input_grad,
&transformed_filter_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype,
groups,
layout};
paddle::operators::ConvArgs args2{&transformed_input,
&transformed_filter_grad_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype,
groups,
layout};
ConvArgs args1{&transformed_input_grad,
&transformed_filter_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype,
groups,
layout};
ConvArgs args2{&transformed_input,
&transformed_filter_grad_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype,
groups,
layout};
auto handle = ctx.cudnn_handle();
// TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
......@@ -289,35 +289,35 @@ void ConvCudnnGradKernel(const Context& ctx,
int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == paddle::platform::DataLayout::kNHWC) {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(),
paddle::platform::DataLayout::kNHWC,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
GetNCDHW(transformed_output_grad_channel.dims(),
paddle::platform::DataLayout::kNHWC,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
} else {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(),
paddle::platform::DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
GetNCDHW(transformed_output_grad_channel.dims(),
paddle::platform::DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
......@@ -326,13 +326,11 @@ void ConvCudnnGradKernel(const Context& ctx,
// ------------------- cudnn backward algorithm ---------------------
#ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result;
paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t>
filter_result;
SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result;
SearchResult<miopenConvBwdWeightsAlgorithm_t> filter_result;
#else
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t>
filter_result;
SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
#endif
// input data workspace_size
size_t workspace_size_d = 0;
......@@ -364,15 +362,13 @@ void ConvCudnnGradKernel(const Context& ctx,
c_groups);
#ifdef PADDLE_WITH_HIP
using search1 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size_d =
std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
bwd_result.algo = search1::Find<T>(
args1, exhaustive_search, deterministic, workspace_size_d, ctx);
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic);
workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size);
#endif
......@@ -392,15 +388,13 @@ void ConvCudnnGradKernel(const Context& ctx,
paddle::platform::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search2 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size_w =
std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
filter_result.algo = search2::Find<T>(
args2, exhaustive_search, deterministic, workspace_size_w, ctx);
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result =
search2::Find<T>(ctx, args2, exhaustive_search, deterministic);
VLOG(3) << "filter algo: " << filter_result.algo << ", time "
......@@ -410,12 +404,12 @@ void ConvCudnnGradKernel(const Context& ctx,
}
// ------------------- cudnn conv backward data ---------------------
paddle::operators::ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> alpha = 1.0f;
#ifdef PADDLE_WITH_HIP
// MIOPEN ONLY support beta to be 0.0f
paddle::operators::ScalingParamType<T> beta = 0.0f;
ScalingParamType<T> beta = 0.0f;
#else
paddle::operators::ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
#endif
VLOG(4) << "Conv_grad: use_addto = " << use_addto;
......@@ -515,19 +509,17 @@ void ConvCudnnGradKernel(const Context& ctx,
ctx.template Alloc<T>(&transformed_input_grad_channel);
if (transformed_input_channel.dims().size() == 4) {
paddle::operators::RemovePaddingSlice<Context, T, 4>(
ctx,
&transformed_input_grad,
&transformed_input_grad_channel,
starts,
axes);
RemovePaddingSlice<Context, T, 4>(ctx,
&transformed_input_grad,
&transformed_input_grad_channel,
starts,
axes);
} else {
paddle::operators::RemovePaddingSlice<Context, T, 5>(
ctx,
&transformed_input_grad,
&transformed_input_grad_channel,
starts,
axes);
RemovePaddingSlice<Context, T, 5>(ctx,
&transformed_input_grad,
&transformed_input_grad_channel,
starts,
axes);
}
}
......@@ -538,7 +530,7 @@ void ConvCudnnGradKernel(const Context& ctx,
}
// filter_grad do not use inplace addto.
paddle::operators::ScalingParamType<T> beta_filter = 0.0f;
ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad.
......
......@@ -19,9 +19,9 @@
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
......@@ -205,15 +205,15 @@ void ConvCudnnKernel(const Context& ctx,
const T* filter_data = transformed_filter_channel.data<T>();
// ------------------- cudnn descriptors ---------------------
paddle::operators::ConvArgs args{&transformed_input,
&transformed_filter_channel,
&transformed_output,
strides,
padding_common,
dilations,
dtype,
groups,
compute_format};
ConvArgs args{&transformed_input,
&transformed_filter_channel,
&transformed_output,
strides,
padding_common,
dilations,
dtype,
groups,
compute_format};
auto handle = ctx.cudnn_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
......@@ -266,35 +266,35 @@ void ConvCudnnKernel(const Context& ctx,
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == paddle::platform::DataLayout::kNHWC) {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNHWC,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNHWC,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
} else {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
......@@ -304,15 +304,14 @@ void ConvCudnnKernel(const Context& ctx,
size_t workspace_size = 0; // final workspace to allocate.
// ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
using search = paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
using search = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search::GetWorkspaceSize(args);
fwd_result.algo = search::Find<T>(
args, exhaustive_search, deterministic, workspace_size, ctx);
#else
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search::Find<T>(ctx, args, exhaustive_search, deterministic);
workspace_size = fwd_result.workspace_size;
#endif
......@@ -328,8 +327,8 @@ void ConvCudnnKernel(const Context& ctx,
#endif
// ------------------- cudnn conv forward ---------------------
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
......
......@@ -14,48 +14,12 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/conv_base_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
namespace paddle {
namespace operators {
namespace phi {
using ConvArgs = ConvArgsBase<miopenHandle_t, miopenDataType_t>;
template <typename DeviceContext, typename T, size_t D>
static void RemovePaddingSlice(const phi::GPUContext& context,
const phi::DenseTensor* input,
phi::DenseTensor* out,
const std::vector<int>& starts,
const std::vector<int>& axes) {
auto& place = *context.eigen_device();
auto in_dims = input->dims();
auto new_out_dims = out->dims();
auto offsets = Eigen::array<int, D>();
auto extents = Eigen::array<int, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = new_out_dims[i];
}
for (size_t i = 0; i < axes.size(); ++i) {
int start = starts[i];
if (start < 0) {
start = (start + in_dims[axes[i]]);
}
start = std::max(start, 0);
offsets[axes[i]] = start;
}
auto in_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*input);
auto out_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, new_out_dims);
out_t.device(place) = in_t.slice(offsets, extents);
}
template <typename PerfT>
struct SearchAlgorithm {};
......@@ -78,7 +42,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
miopenConvAlgoPerf_t find_result;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenFindConvolutionForwardAlgorithm(
phi::dynload::miopenFindConvolutionForwardAlgorithm(
args.handle,
args.idesc.desc(),
args.x->data<T>(),
......@@ -104,7 +68,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
static size_t GetWorkspaceSize(const ConvArgs& args) {
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForwardGetWorkSpaceSize(
phi::dynload::miopenConvolutionForwardGetWorkSpaceSize(
args.handle,
args.wdesc.desc(),
args.idesc.desc(),
......@@ -134,7 +98,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
miopenConvAlgoPerf_t find_result;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenFindConvolutionBackwardDataAlgorithm(
phi::dynload::miopenFindConvolutionBackwardDataAlgorithm(
args.handle,
args.odesc.desc(),
args.o->data<T>(),
......@@ -160,7 +124,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
static size_t GetWorkspaceSize(const ConvArgs& args) {
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize(
phi::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize(
args.handle,
args.odesc.desc(),
args.wdesc.desc(),
......@@ -190,7 +154,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
miopenConvAlgoPerf_t find_result;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm(
phi::dynload::miopenFindConvolutionBackwardWeightsAlgorithm(
args.handle,
args.odesc.desc(),
args.o->data<T>(),
......@@ -216,7 +180,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
static size_t GetWorkspaceSize(const ConvArgs& args) {
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize(
phi::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize(
args.handle,
args.odesc.desc(),
args.idesc.desc(),
......@@ -227,5 +191,4 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
}
};
} // namespace operators
} // namespace paddle
} // namespace phi
......@@ -28,11 +28,11 @@ limitations under the License. */
#include "paddle/phi/kernels/transpose_kernel.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif
namespace phi {
......@@ -173,33 +173,31 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
auto dtype = paddle::platform::CudnnDataType<T>::type;
paddle::operators::ConvArgs args1{&transformed_dout,
&filter,
&x_transpose,
strides,
padding_common,
dilations_,
dtype,
groups,
layout};
paddle::operators::ConvArgs args2{&transformed_dout,
&filter,
&x_transpose,
strides,
padding_common,
dilations_,
dtype,
groups,
layout};
ConvArgs args1{&transformed_dout,
&filter,
&x_transpose,
strides,
padding_common,
dilations_,
dtype,
groups,
layout};
ConvArgs args2{&transformed_dout,
&filter,
&x_transpose,
strides,
padding_common,
dilations_,
dtype,
groups,
layout};
#ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t>
filter_result;
SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
SearchResult<miopenConvBwdWeightsAlgorithm_t> filter_result;
#else
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t>
filter_result;
SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
#endif
auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
......@@ -222,14 +220,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
paddle::platform::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search1 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1));
fwd_result.algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = std::max(
workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo));
......@@ -249,14 +245,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
paddle::platform::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search2 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
filter_result.algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo));
......@@ -269,8 +263,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
int dout_offset =
transformed_dout.numel() / transformed_dout.dims()[0] / groups;
int filter_offset = filter.numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
auto workspace_handle = ctx.cudnn_workspace_handle();
if (dx) {
// Because beta is zero, it is unnecessary to reset dx.
......@@ -631,55 +625,53 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
auto handle = ctx.cudnn_handle();
auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);
paddle::operators::ConvArgs args1{&transformed_ddout_channel,
&filter,
&transformed_ddx,
strides,
padding_common,
dilations_,
dtype,
groups,
GPUDNNDataLayout::kNCHW};
paddle::operators::ConvArgs args2{&transformed_ddout_channel,
&ddfilter,
&transformed_x,
strides,
padding_common,
dilations_,
dtype,
groups,
GPUDNNDataLayout::kNCHW};
paddle::operators::ConvArgs args3{&transformed_dout,
dfilter,
&transformed_ddx_channel,
strides,
padding_common,
dilations_,
dtype,
groups,
GPUDNNDataLayout::kNCHW};
paddle::operators::ConvArgs args4{&transformed_dout,
&ddfilter,
&transformed_dx_channel,
strides,
padding_common,
dilations_,
dtype,
groups,
GPUDNNDataLayout::kNCHW};
ConvArgs args1{&transformed_ddout_channel,
&filter,
&transformed_ddx,
strides,
padding_common,
dilations_,
dtype,
groups,
GPUDNNDataLayout::kNCHW};
ConvArgs args2{&transformed_ddout_channel,
&ddfilter,
&transformed_x,
strides,
padding_common,
dilations_,
dtype,
groups,
GPUDNNDataLayout::kNCHW};
ConvArgs args3{&transformed_dout,
dfilter,
&transformed_ddx_channel,
strides,
padding_common,
dilations_,
dtype,
groups,
GPUDNNDataLayout::kNCHW};
ConvArgs args4{&transformed_dout,
&ddfilter,
&transformed_dx_channel,
strides,
padding_common,
dilations_,
dtype,
groups,
GPUDNNDataLayout::kNCHW};
#ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result1;
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result2;
paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t>
filter_result;
paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result1;
SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result2;
SearchResult<miopenConvBwdWeightsAlgorithm_t> filter_result;
SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
#else
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result1;
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result2;
paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t>
filter_result;
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result1;
SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result2;
SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
#endif
// ddo = conv(ddI, filter) + conv(I, ddfilter)
......@@ -702,14 +694,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search1 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1);
bwd_result1.algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result1 = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo);
#endif
......@@ -726,14 +716,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search2 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
bwd_result2.algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result2 = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo));
......@@ -753,14 +741,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search3 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_result.algo =
search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
#else
using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = search3::Find<T>(ctx, args3, false, deterministic, false);
workspace_size = std::max(
workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
......@@ -781,14 +767,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search4 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
fwd_result.algo =
search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
#else
using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search4::Find<T>(ctx, args4, false, deterministic, false);
workspace_size = std::max(
workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo));
......@@ -796,22 +780,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
}
int i_n, i_c, i_d, i_h, i_w;
paddle::operators::GetNCDHW(transformed_x.dims(),
GPUDNNDataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
GetNCDHW(transformed_x.dims(),
GPUDNNDataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
int o_n, o_c, o_d, o_h, o_w;
paddle::operators::GetNCDHW(transformed_dout.dims(),
GPUDNNDataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
GetNCDHW(transformed_dout.dims(),
GPUDNNDataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
int group_offset_in =
transformed_x.numel() / transformed_x.dims()[0] / groups;
......@@ -819,8 +803,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
transformed_dout.numel() / transformed_dout.dims()[0] / groups;
int group_offset_filter = filter.numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
auto wkspace_handle = ctx.cudnn_workspace_handle();
......
......@@ -26,11 +26,11 @@ limitations under the License. */
#include "paddle/phi/kernels/transpose_kernel.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif
namespace phi {
......@@ -199,15 +199,15 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
auto dtype = paddle::platform::CudnnDataType<T>::type;
// ------------------- cudnn descriptors ---------------------
paddle::operators::ConvArgs args{&transformed_out,
&filter,
&transformed_x,
strides,
padding_common,
dilations_,
dtype,
groups,
data_layout};
ConvArgs args{&transformed_out,
&filter,
&transformed_x,
strides,
padding_common,
dilations_,
dtype,
groups,
data_layout};
args.handle = handle;
args.idesc.set(transformed_out, iwo_groups);
args.wdesc.set(filter, layout_tensor, iwo_groups);
......@@ -220,16 +220,14 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
c_groups);
#ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result;
using search =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result;
using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
bwd_result.algo =
search::Find<T>(args, false, deterministic, workspace_size, ctx);
#else
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result = search::Find<T>(ctx, args, false, deterministic, false);
workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo));
......@@ -239,8 +237,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
int x_offset = transformed_x.numel() / transformed_x.dims()[0] / groups;
int out_offset = transformed_out.numel() / transformed_out.dims()[0] / groups;
int filter_offset = filter.numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
auto workspace_handle = ctx.cudnn_workspace_handle();
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
......
......@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册