From 1623f1b4f3b23cac46c0b4e8ceacfda5697d9ec0 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Thu, 24 Nov 2022 12:03:10 +0800 Subject: [PATCH] [Phi Support CuDNN] Support ALL CuDNN (#47865) * support default use_gpudnn=True * fully support cudnn in phi * add header file * add white_list, verify accuracy * phi support all cudnn * opt affine_grad * try different arches of pretrained_model * try different arches of pretrained_model * add debug string * debug eager_method * add debug string, pass all local ctest * polish all debug code * delete use_cudnn relevant code autogen * fix depthwise_conv2d * Share all other members of Tensor except use_cudnn * polish codes according to review opinion * polish codes according to review opinion, fix bug * polish codes according to review opinion, opt performance * polish codes according to review opinion, fix pooling.py --- paddle/fluid/pybind/eager_method.cc | 42 +++++++++++++++++++ paddle/phi/api/lib/kernel_dispatch.cc | 21 ++++++---- paddle/phi/api/lib/kernel_dispatch.h | 13 +++++- paddle/phi/api/yaml/generator/api_base.py | 12 +----- paddle/phi/api/yaml/legacy_backward.yaml | 31 +++++--------- paddle/phi/api/yaml/legacy_ops.yaml | 15 ++----- paddle/phi/core/dense_tensor.cc | 3 +- paddle/phi/core/dense_tensor_impl.cc | 1 + paddle/phi/core/kernel_factory.cc | 11 +++-- paddle/phi/core/kernel_factory.h | 3 +- paddle/phi/core/tensor_meta.cc | 14 +++++-- paddle/phi/core/tensor_meta.h | 12 ++++-- python/paddle/fluid/dygraph/nn.py | 2 +- .../fluid/dygraph/varbase_patch_methods.py | 5 +++ python/paddle/fluid/layers/nn.py | 2 +- .../tests/unittests/test_egr_python_api.py | 15 +++++++ python/paddle/nn/functional/pooling.py | 12 ++---- python/paddle/nn/functional/vision.py | 3 +- 18 files changed, 134 insertions(+), 83 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index a3e7f43faed..0610a51d4cc 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -54,6 +54,7 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -1444,6 +1445,43 @@ static PyObject* tensor__copy_gradient_from(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor__use_cudnn(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE(self->tensor.defined() && self->tensor.is_dense_tensor(), + paddle::platform::errors::Fatal( + "function _use_cudnn is only effective for DenseTensor")); + + bool use_cudnn = pybind::CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); + + // Set the same use_cudnn attribute, return directly + phi::DenseTensor* dense_tensor = + static_cast(self->tensor.impl().get()); + phi::DenseTensorMeta* dense_tensor_meta = + phi::DenseTensorUtils::GetMutableMeta(dense_tensor); + if (use_cudnn == dense_tensor_meta->use_cudnn) { + return ToPyObject(self->tensor); + } + + // Share all other members of Tensor except use_cudnn + phi::DenseTensorMeta target_dense_meta = *dense_tensor_meta; + target_dense_meta.use_cudnn = use_cudnn; + phi::DenseTensor target_dense_tensor; + target_dense_tensor.ShareDataWith(*dense_tensor); + target_dense_tensor.set_meta(target_dense_meta); + // Construct returned tensor + paddle::experimental::Tensor target_tensor( + std::make_shared(target_dense_tensor), + self->tensor.name()); + 
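+  // NOTE: target_tensor shares its allocation with *dense_tensor through
+  // ShareDataWith() above; only the use_cudnn flag in its meta differs.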
target_tensor.set_autograd_meta(self->tensor.mutable_autograd_meta()); + VLOG(4) << "Tensor: " << target_tensor.name() + << " set use_cudnn = " << use_cudnn; + + return ToPyObject(target_tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_set_vocab(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -2010,6 +2048,10 @@ PyMethodDef variable_methods[] = { (PyCFunction)(void (*)(void))tensor__copy_gradient_from, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_tensor_use_cudnn", + (PyCFunction)(void (*)(void))tensor__use_cudnn, + METH_VARARGS | METH_KEYWORDS, + NULL}, /** the methods to adapt old dygraph, will be removed in the future **/ {"set_string_list", (PyCFunction)(void (*)(void))tensor_method_set_string_list, diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index ccf070c7249..941bc880b99 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -54,14 +54,11 @@ bool HasAllocation(const phi::TensorBase& t) { BackendSet GetTensorBackendSet(const phi::TensorBase& t) { if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) { - BackendSet backend_set(phi::TransToPhiBackend(t.place())); - switch (t.layout()) { - case DataLayout::ONEDNN: - backend_set = backend_set | BackendSet(Backend::ONEDNN); - break; - default: - // do nothing - break; + phi::Backend backend_key = phi::TransToPhiBackend(t.place()); + BackendSet backend_set(backend_key); + if (backend_key == Backend::GPU && phi::DenseTensor::classof(&t) && + static_cast(t).meta().use_cudnn) { + backend_set = backend_set | BackendSet(Backend::GPUDNN); } return backend_set; } @@ -126,7 +123,13 @@ Backend ParseBackend(const Place& place) { return phi::TransToPhiBackend(place); } Backend ParseBackend(const Tensor& tensor) { - return phi::TransToPhiBackend(tensor.place()); + Backend backend_key = phi::TransToPhiBackend(tensor.place()); + if (backend_key == Backend::GPU && + phi::DenseTensor::classof(tensor.impl().get()) && + static_cast(tensor.impl().get())->meta().use_cudnn) { + return Backend::GPUDNN; + } + return backend_key; } Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) { diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index 176713b71bb..bfe8eba2444 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -90,6 +90,7 @@ struct ArgsIterator { struct KernelKeyParser : ArgsIterator { KernelKeySet key_set; + bool disable_cudnn = false; // this dtype_set is used for cache multi-inputs dtype and used for // data_promote DataTypeSet dtype_set{DataType::UNDEFINED}; @@ -97,11 +98,19 @@ struct KernelKeyParser : ArgsIterator { // TODO(chenweihang): deal with multiple diff input Tensors // TODO(chenweihang): add global device guard method to set backend inline void AssignKernelKeySet(const phi::TensorBase& tensor) { - key_set.backend_set = - key_set.backend_set | detail::GetTensorBackendSet(tensor); + // assign Backend + BackendSet tensor_backend_set = detail::GetTensorBackendSet(tensor); + key_set.backend_set = key_set.backend_set | tensor_backend_set; + // tensor's attribute use_cudnn=False, explicitly disable cudnn kernel + if (tensor_backend_set == BackendSet(Backend::GPU) || disable_cudnn) { + disable_cudnn = true; + key_set.backend_set = key_set.backend_set - BackendSet(Backend::GPUDNN); + } + // assign DataLayout phi::DataLayout tensor_layout = tensor.layout(); key_set.layout = tensor_layout > 
key_set.layout ? tensor_layout : key_set.layout; + // assign DataType key_set.dtype = tensor.dtype(); dtype_set = dtype_set | DataTypeSet(key_set.dtype); auto promote_result = PromoteTypes(dtype_set); diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py index 696ad8736b9..3ad68e6d1d0 100644 --- a/paddle/phi/api/yaml/generator/api_base.py +++ b/paddle/phi/api/yaml/generator/api_base.py @@ -307,7 +307,6 @@ class BaseAPI: 'backend': None, 'layout': None, 'data_type': None, - 'use_gpudnn': 'false', 'dispatch': {}, } if 'backend' in kernel_config and len(kernel_config['backend']) > 0: @@ -318,10 +317,6 @@ class BaseAPI: kernel['data_type'] = kernel_config['data_type'] if 'param' in kernel_config: kernel['param'] = kernel_config['param'] - if 'use_gpudnn' in kernel_config: - kernel['use_gpudnn'] = kernel_config['use_gpudnn'] - if isinstance(kernel['use_gpudnn'], bool): - kernel['use_gpudnn'] = str(kernel['use_gpudnn']).lower() kernel_funcs = re.compile(r'([a-zA-Z0-9_]+)\s*({[^}]+})?').findall( kernel_config['func'] ) @@ -1124,15 +1119,10 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d for kernel_out in outputs_args: fallback_kernel_output_trans += f""" {code_indent} TransDataBackend({kernel_out}, kernel_backend, {kernel_out});""" - cudnn_args = ( - '' - if self.kernel['use_gpudnn'] == 'false' - else ', ' + self.kernel['use_gpudnn'] - ) return f""" {code_indent} VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; {code_indent} auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( -{code_indent} "{kernel_name}", {{kernel_backend, kernel_layout, kernel_data_type}}{cudnn_args}); +{code_indent} "{kernel_name}", {{kernel_backend, kernel_layout, kernel_data_type}}); {code_indent} const auto& kernel = kernel_result.kernel; {code_indent} VLOG(6) << "{kernel_name} kernel: " << kernel; {code_indent} auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? 
Backend::CPU : kernel_backend); diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index a61aa52cc82..dc542a9964f 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -67,8 +67,8 @@ func : addmm_grad - backward_op : affine_grid_grad - forward : affine_grid (Tensor input, IntArray outputShape, bool align_corners=true, bool use_cudnn=true) -> Tensor(output) - args : (Tensor output_grad, IntArray outputShape, bool use_cudnn=true, bool align_corners=true) + forward : affine_grid (Tensor input, IntArray outputShape, bool align_corners=true) -> Tensor(output) + args : (Tensor input, Tensor output_grad, IntArray outputShape, bool align_corners=true) output : Tensor(input_grad) infer_meta : func : AffineGridGradInferMeta @@ -76,7 +76,7 @@ kernel : func : affine_grid_grad param : [output_grad, outputShape, align_corners] - use_gpudnn: use_cudnn + no_need_buffer : input - backward_op : amax_grad forward: amax (Tensor x, int64_t[] axis={}, bool keepdim=false) -> Tensor(out) @@ -262,7 +262,6 @@ param : [input, filter] kernel : func : conv2d_grad - use_gpudnn : true backward : conv2d_grad_grad - backward_op : conv2d_grad_grad @@ -274,7 +273,6 @@ param: [input, filter, grad_out] kernel : func : conv2d_grad_grad - use_gpudnn : true optional : grad_input_grad, grad_filter_grad - backward_op : conv2d_transpose_double_grad @@ -285,7 +283,6 @@ func : Conv2dTransposeDoubleGradInferMeta kernel : func : conv2d_transpose_grad_grad - use_gpudnn : true - backward_op : conv2d_transpose_grad forward : conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) @@ -295,7 +292,6 @@ func : Conv2dTransposeGradInferMeta kernel : func : conv2d_transpose_grad - use_gpudnn : true backward : conv2d_transpose_double_grad - backward_op : conv3d_double_grad @@ -307,7 +303,6 @@ param: [input, filter, grad_out] kernel : func : conv3d_double_grad - use_gpudnn : true optional : grad_input_grad, grad_filter_grad - backward_op : conv3d_grad @@ -319,7 +314,6 @@ param : [input, filter] kernel : func : conv3d_grad - use_gpudnn : true backward : conv3d_double_grad - backward_op : conv3d_transpose_grad @@ -330,7 +324,6 @@ func : ConvTransposeGradInferMeta kernel : func : conv3d_transpose_grad - use_gpudnn : true - backward_op : crop_grad forward : crop_tensor (Tensor x, IntArray shape, IntArray offsets) -> Tensor(out) @@ -401,7 +394,6 @@ kernel : func : depthwise_conv2d_grad param : [input, filter, out_grad, strides, paddings, padding_algorithm, groups, dilations, data_format] - use_gpudnn : True backward : depthwise_conv2d_double_grad - backward_op : depthwise_conv2d_transpose_grad @@ -1210,8 +1202,8 @@ func : pixel_shuffle_grad - backward_op : pool2d_double_grad - forward : pool2d_grad(Tensor x, Tensor out, Tensor grad_out, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(grad_x) - args : (Tensor grad_x_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) + forward : pool2d_grad(Tensor x, Tensor out, Tensor grad_out, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, 
str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_x_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) output : Tensor(grad_out_grad) infer_meta : func : Pool2DInferMeta @@ -1219,11 +1211,11 @@ kernel : func : pool2d_double_grad param : [grad_x_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] - use_gpudnn : use_gpudnn + no_need_buffer : x - backward_op : pool2d_grad - forward : pool2d(Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) + forward : pool2d(Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta @@ -1231,12 +1223,11 @@ kernel : func : pool2d_grad param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] - use_gpudnn : use_gpudnn backward : pool2d_double_grad - backward_op : pool3d_grad - forward : pool3d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) + forward : pool3d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta @@ -1244,7 +1235,6 @@ kernel : func : pool3d_grad param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] - use_gpudnn : use_gpudnn - backward_op : pow_double_grad forward : pow_grad(Tensor x, Tensor grad_out, Scalar y) -> Tensor(grad_x) @@ -1601,7 +1591,6 @@ param : [out] kernel : func : softmax_grad - use_gpudnn : true - backward_op : spectral_norm_grad forward : spectral_norm (Tensor weight, Tensor u, Tensor v, int dim, int power_iters, float eps) -> Tensor(out) diff --git 
a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 7fb2c244105..5f7bc550083 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -97,7 +97,7 @@ backward : addmm_grad - op : affine_grid - args : (Tensor input, IntArray outputShape, bool align_corners=true, bool use_cudnn=true) + args : (Tensor input, IntArray outputShape, bool align_corners=true) output : Tensor infer_meta : func : AffineGridInferMeta @@ -106,7 +106,6 @@ func : affine_grid param : [input, outputShape, align_corners] data_type : input - use_gpudnn: use_cudnn backward : affine_grid_grad - op : all @@ -431,7 +430,6 @@ func : ConvInferMeta kernel : func : conv2d - use_gpudnn : true backward : conv2d_grad - op : conv2d_transpose @@ -441,7 +439,6 @@ func : Conv2dTransposeInferMeta kernel : func : conv2d_transpose - use_gpudnn : true backward : conv2d_transpose_grad - op : conv3d @@ -451,7 +448,6 @@ func : Conv3DInferMeta kernel : func : conv3d - use_gpudnn : true backward : conv3d_grad - op : conv3d_transpose @@ -461,7 +457,6 @@ func : ConvTransposeInferMeta kernel : func : conv3d_transpose - use_gpudnn : true backward : conv3d_transpose_grad - op : copy_to @@ -540,7 +535,6 @@ kernel : func : depthwise_conv2d param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format] - use_gpudnn : true backward : depthwise_conv2d_grad - op : depthwise_conv2d_transpose @@ -1636,7 +1630,7 @@ backward : pixel_shuffle_grad - op : pool2d - args : (Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) + args : (Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) output : Tensor(out) infer_meta : func : Pool2DInferMeta @@ -1644,11 +1638,10 @@ kernel : func : pool2d param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] - use_gpudnn : use_gpudnn backward : pool2d_grad - op : pool3d - args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) + args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) output : Tensor(out) infer_meta : func : PoolInferMeta @@ -1656,7 +1649,6 @@ kernel : func : pool3d param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] - use_gpudnn : use_gpudnn backward : pool3d_grad - op : pow @@ -2048,7 +2040,6 @@ func : SoftmaxInferMeta kernel : func : softmax - use_gpudnn : true inplace : (x -> out) backward : softmax_grad diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 3fbf3560aff..09ce2414150 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -200,9 +200,10 @@ void DenseTensor::set_meta(const DenseTensorMeta& meta) { meta_.layout = meta.layout; meta_.lod = meta.lod; meta_.offset = meta.offset; + meta_.use_cudnn = meta.use_cudnn; } -/* @jim19930609: This interface will be further modified util we finalized the +/* @jim19930609: 
This interface will be further modified until we finalized the design for Allocator - Allocation For now, we have to temporarily accommodate two independent use cases: 1. Designed behaviour: DenseTensor constructed with its underlying storage_ diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index c8998f65efb..3906282187d 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -357,6 +357,7 @@ DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { meta_.dtype = src.meta_.dtype; meta_.layout = src.meta_.layout; meta_.offset = src.meta_.offset; + meta_.use_cudnn = src.meta_.use_cudnn; storage_properties_ = std::move(CopyStorageProperties(src.storage_properties_)); #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 3370e9b8058..0d43d318902 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -106,17 +106,16 @@ bool KernelFactory::HasKernel(const std::string& kernel_name, } KernelResult KernelFactory::SelectKernelOrThrowError( - const std::string& kernel_name, - const KernelKey& kernel_key, - bool use_gpudnn) const { + const std::string& kernel_name, const KernelKey& const_kernel_key) const { auto iter = kernels_.find(kernel_name); PADDLE_ENFORCE_NE( iter, kernels_.end(), phi::errors::NotFound("The kernel `%s` is not registered.", kernel_name)); + KernelKey kernel_key = const_kernel_key; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (use_gpudnn && kernel_key.backend() == Backend::GPU) { + if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, kernel_key.layout(), kernel_key.dtype()}); if (kernel_iter == iter->second.end() && @@ -127,8 +126,8 @@ KernelResult KernelFactory::SelectKernelOrThrowError( if (kernel_iter != iter->second.end()) { return {kernel_iter->second, false}; } - VLOG(3) << "The cudnn kernel for [" << kernel_name - << "] is not registered."; + kernel_key = + KernelKey(Backend::GPU, kernel_key.layout(), kernel_key.dtype()); } #endif auto kernel_iter = iter->second.find(kernel_key); diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index 423c2b8f0a5..69baf243e68 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -274,8 +274,7 @@ class KernelFactory { bool HasCompatiblePhiKernel(const std::string& op_type) const; KernelResult SelectKernelOrThrowError(const std::string& kernel_name, - const KernelKey& kernel_key, - bool use_gpudnn = false) const; + const KernelKey& kernel_key) const; bool HasKernel(const std::string& kernel_name, const KernelKey& kernel_key) const; diff --git a/paddle/phi/core/tensor_meta.cc b/paddle/phi/core/tensor_meta.cc index da088025768..44b2dee358a 100644 --- a/paddle/phi/core/tensor_meta.cc +++ b/paddle/phi/core/tensor_meta.cc @@ -16,21 +16,29 @@ limitations under the License. 
*/ namespace phi { +DenseTensorMeta::DenseTensorMeta() { use_cudnn = true; } + DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims) - : dims(dims), dtype(dtype) {} + : dims(dims), dtype(dtype) { + use_cudnn = true; +} DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout, size_t offset) - : dims(dims), dtype(dtype), layout(layout), offset(offset) {} + : dims(dims), dtype(dtype), layout(layout), offset(offset) { + use_cudnn = true; +} DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout, const LoD& lod, size_t offset) - : dims(dims), dtype(dtype), layout(layout), lod(lod), offset(offset) {} + : dims(dims), dtype(dtype), layout(layout), lod(lod), offset(offset) { + use_cudnn = true; +} bool DenseTensorMeta::valid() const noexcept { bool valid{true}; diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index 71272235db1..789a4422e25 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -48,7 +48,7 @@ using LoD = std::vector>; struct DenseTensorMeta { using DataType = paddle::experimental::DataType; - DenseTensorMeta() = default; + DenseTensorMeta(); DenseTensorMeta(DataType dtype, const DDim& dims); DenseTensorMeta(DataType dtype, const DDim& dims, @@ -65,6 +65,9 @@ struct DenseTensorMeta { bool valid() const noexcept; bool is_scalar{false}; + /// \brief Determine whether using CuDNN speed-up library in the new dygraph. + /// It maybe also support MKLDNN library in the near future. + bool use_cudnn{true}; DDim dims; DataType dtype{DataType::UNDEFINED}; DataLayout layout{DataLayout::NCHW}; @@ -73,9 +76,10 @@ struct DenseTensorMeta { }; inline bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) { - return (lhs.is_scalar == rhs.is_scalar) && (lhs.dims == rhs.dims) && - (lhs.dtype == rhs.dtype) && (lhs.layout == rhs.layout) && - (lhs.lod == rhs.lod) && (lhs.offset == rhs.offset); + return (lhs.is_scalar == rhs.is_scalar) && lhs.use_cudnn == rhs.use_cudnn && + (lhs.dims == rhs.dims) && (lhs.dtype == rhs.dtype) && + (lhs.layout == rhs.layout) && (lhs.lod == rhs.lod) && + (lhs.offset == rhs.offset); } struct StringTensorMeta { diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 4c8b9d7f555..4cbe12698c5 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -672,6 +672,7 @@ class Pool2D(layers.Layer): def forward(self, input): if _non_static_mode(): if not self._use_mkldnn and in_dygraph_mode(): + input = input._use_cudnn(self._use_cudnn) return _C_ops.pool2d( input, self._pool_size, @@ -684,7 +685,6 @@ class Pool2D(layers.Layer): self._global_pooling, False, "EXPLICIT", - self._use_cudnn, ) attrs = ( diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 7c7aa964cf8..6fa46692c79 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -880,6 +880,10 @@ def monkey_patch_varbase(): def _clear_data(self): self.get_tensor()._clear() + @framework.dygraph_only + def _use_cudnn(self, use_cudnn=True): + return self._tensor_use_cudnn(use_cudnn) + @framework.dygraph_only def _uva(self, device_id=0): ''' @@ -1064,6 +1068,7 @@ def monkey_patch_varbase(): setattr(core.eager.Tensor, "_uva", _uva) setattr(core.eager.Tensor, "_clear_data", _clear_data) setattr(core.eager.Tensor, "__hash__", __hash__) + setattr(core.eager.Tensor, "_use_cudnn", _use_cudnn) else: 
setattr(core.VarBase, "__name__", "Tensor") setattr(core.VarBase, "grad", grad) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bd5b11e1364..71f5702a7cc 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2196,6 +2196,7 @@ def pool2d( pool_padding = update_padding(pool_padding, data_format) if in_dygraph_mode(): + input = input._use_cudnn(use_cudnn) return _C_ops.pool2d( input, pool_size, @@ -2208,7 +2209,6 @@ def pool2d( global_pooling, False, padding_algorithm, - use_cudnn, ) op_type = 'pool2d' helper = LayerHelper(op_type, **locals()) diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index eee9e8eac4b..4471d78936a 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -897,6 +897,21 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): x._clear() self.assertFalse(x._is_initialized()) + def test_use_cudnn(self): + np_x = np.random.random((3, 8, 8)) + with _test_eager_guard(): + self.assertTrue(in_dygraph_mode()) + x = paddle.to_tensor(np_x, dtype="float64") + y = x._use_cudnn(False) + np.testing.assert_array_equal(x.numpy(), y.numpy()) + y = x._use_cudnn(True) + np.testing.assert_array_equal(x.numpy(), y.numpy()) + + self.assertFalse(in_dygraph_mode()) + x = paddle.to_tensor(np_x, dtype="float64") + with self.assertRaises(AttributeError): + x = x._use_cudnn(False) + class EagerParamBaseUsageTestCase(unittest.TestCase): def test_print(self): diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 5e8f77a9810..f30be705207 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -258,7 +258,6 @@ def avg_pool1d( False, False, padding_algorithm, - True, ) return squeeze(output, [2]) @@ -407,7 +406,6 @@ def avg_pool2d( False, False, padding_algorithm, - True, ) else: output = _legacy_C_ops.pool2d( @@ -561,7 +559,6 @@ def avg_pool3d( False, False, padding_algorithm, - True, ) elif _in_legacy_dygraph(): pool_out = _legacy_C_ops.pool3d( @@ -718,7 +715,6 @@ def max_pool1d( False, False, padding_algorithm, - True, ) return squeeze(pool_out, [2]) @@ -1363,7 +1359,6 @@ def max_pool2d( False, False, padding_algorithm, - True, ) if _in_legacy_dygraph(): @@ -1554,7 +1549,6 @@ def max_pool3d( False, False, padding_algorithm, - True, ) if _in_legacy_dygraph(): @@ -1691,6 +1685,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): x = unsqueeze(x, [2]) if in_dygraph_mode(): + x = x._use_cudnn(False) pool_out = _C_ops.pool2d( x, pool_size, @@ -1703,7 +1698,6 @@ def adaptive_avg_pool1d(x, output_size, name=None): False, True, "EXPLICIT", - False, ) return squeeze(pool_out, [2]) if _in_legacy_dygraph(): @@ -1828,6 +1822,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): output_size = utils._convert_to_tensor_list(output_size) if in_dygraph_mode(): + x = x._use_cudnn(False) return _C_ops.pool2d( x, output_size, @@ -1840,7 +1835,6 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): False, True, "EXPLICIT", - False, ) if _in_legacy_dygraph(): @@ -1973,6 +1967,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): output_size[2] = in_w if in_dygraph_mode(): + x = x._use_cudnn(False) return _C_ops.pool3d( x, output_size, @@ -1985,7 +1980,6 @@ def adaptive_avg_pool3d(x, output_size, 
data_format='NCDHW', name=None): False, True, "EXPLICIT", - False, ) elif _in_legacy_dygraph(): return _legacy_C_ops.pool3d( diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index ebe1ec7e9bc..6d061ff6294 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -92,7 +92,8 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): if isinstance(out_shape, Variable) else out_shape ) - return _C_ops.affine_grid(theta, _out_shape, align_corners, use_cudnn) + theta = theta._use_cudnn(use_cudnn) + return _C_ops.affine_grid(theta, _out_shape, align_corners) elif in_dynamic_mode(): _out_shape = ( out_shape.numpy().tolist() -- GitLab
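For reference, a minimal usage sketch of the new `_use_cudnn` switch introduced by this patch, mirroring the `test_use_cudnn` case added to test_egr_python_api.py. It assumes eager (dygraph) mode is active, as in the test's `_test_eager_guard()` block; the tensor shape and values are illustrative only.

    import numpy as np
    import paddle

    # With this patch, a dense tensor defaults to use_cudnn=True in its
    # DenseTensorMeta, so kernel dispatch on GPU may pick the Backend::GPUDNN
    # kernel when one is registered.
    x = paddle.to_tensor(np.random.random((3, 8, 8)), dtype="float64")

    # _use_cudnn(False) returns a tensor that shares x's data and autograd
    # meta but carries use_cudnn=False, steering dispatch to the plain GPU
    # kernel for ops such as pool2d/pool3d/affine_grid.
    y = x._use_cudnn(False)
    assert np.array_equal(x.numpy(), y.numpy())  # same underlying data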