未验证 提交 1623f1b4 编写于 作者: H HongyuJia 提交者: GitHub

[Phi Support CuDNN] Support ALL CuDNN (#47865)

* support default use_gpudnn=True

* fully support cudnn in phi

* add header file

* add white_list, verify accuracy

* phi support all cudnn

* opt affine_grad

* try different arches of pretrained_model

* try different arches of pretrained_model

* add debug string

* debug eager_method

* add debug string, pass all local ctest

* polish all debug code

* delete use_cudnn relevant code autogen

* fix depthwise_conv2d

* Share all other members of Tensor except use_cudnn

* polish codes according to review opinion

* polish codes according to review opinion, fix bug

* polish codes according to review opinion, opt performance

* polish codes according to review opinion, fix pooling.py
上级 6bdf1261
...@@ -54,6 +54,7 @@ typedef SSIZE_T ssize_t; ...@@ -54,6 +54,7 @@ typedef SSIZE_T ssize_t;
#include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
...@@ -1444,6 +1445,43 @@ static PyObject* tensor__copy_gradient_from(TensorObject* self, ...@@ -1444,6 +1445,43 @@ static PyObject* tensor__copy_gradient_from(TensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
// Python-facing `_tensor_use_cudnn`: return a Tensor that shares the data
// (and autograd meta) of `self` but whose DenseTensorMeta::use_cudnn flag
// equals the requested boolean. Only valid for defined DenseTensors.
static PyObject* tensor__use_cudnn(TensorObject* self,
                                   PyObject* args,
                                   PyObject* kwargs) {
  EAGER_TRY
  PADDLE_ENFORCE(self->tensor.defined() && self->tensor.is_dense_tensor(),
                 paddle::platform::errors::Fatal(
                     "function _use_cudnn is only effective for DenseTensor"));
  const bool requested =
      pybind::CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0);
  auto* src_tensor =
      static_cast<phi::DenseTensor*>(self->tensor.impl().get());
  phi::DenseTensorMeta* src_meta =
      phi::DenseTensorUtils::GetMutableMeta(src_tensor);
  // Flag already has the requested value: hand back the original tensor.
  if (src_meta->use_cudnn == requested) {
    return ToPyObject(self->tensor);
  }
  // Build a sibling DenseTensor that shares the same allocation, then
  // override only the use_cudnn bit of its meta. set_meta runs after
  // ShareDataWith because ShareDataWith also copies the source meta
  // (including use_cudnn).
  phi::DenseTensorMeta new_meta = *src_meta;
  new_meta.use_cudnn = requested;
  phi::DenseTensor shared_tensor;
  shared_tensor.ShareDataWith(*src_tensor);
  shared_tensor.set_meta(new_meta);
  // Wrap into an API-level Tensor keeping the original name and autograd
  // meta, so gradient bookkeeping still points at the same node.
  paddle::experimental::Tensor result(
      std::make_shared<phi::DenseTensor>(shared_tensor),
      self->tensor.name());
  result.set_autograd_meta(self->tensor.mutable_autograd_meta());
  VLOG(4) << "Tensor: " << result.name()
          << " set use_cudnn = " << requested;
  return ToPyObject(result);
  EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_set_vocab(TensorObject* self, static PyObject* tensor_method_set_vocab(TensorObject* self,
PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
...@@ -2010,6 +2048,10 @@ PyMethodDef variable_methods[] = { ...@@ -2010,6 +2048,10 @@ PyMethodDef variable_methods[] = {
(PyCFunction)(void (*)(void))tensor__copy_gradient_from, (PyCFunction)(void (*)(void))tensor__copy_gradient_from,
METH_VARARGS | METH_KEYWORDS, METH_VARARGS | METH_KEYWORDS,
NULL}, NULL},
{"_tensor_use_cudnn",
(PyCFunction)(void (*)(void))tensor__use_cudnn,
METH_VARARGS | METH_KEYWORDS,
NULL},
/** the methods to adapt old dygraph, will be removed in the future **/ /** the methods to adapt old dygraph, will be removed in the future **/
{"set_string_list", {"set_string_list",
(PyCFunction)(void (*)(void))tensor_method_set_string_list, (PyCFunction)(void (*)(void))tensor_method_set_string_list,
......
...@@ -54,14 +54,11 @@ bool HasAllocation(const phi::TensorBase& t) { ...@@ -54,14 +54,11 @@ bool HasAllocation(const phi::TensorBase& t) {
BackendSet GetTensorBackendSet(const phi::TensorBase& t) { BackendSet GetTensorBackendSet(const phi::TensorBase& t) {
if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) { if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) {
BackendSet backend_set(phi::TransToPhiBackend(t.place())); phi::Backend backend_key = phi::TransToPhiBackend(t.place());
switch (t.layout()) { BackendSet backend_set(backend_key);
case DataLayout::ONEDNN: if (backend_key == Backend::GPU && phi::DenseTensor::classof(&t) &&
backend_set = backend_set | BackendSet(Backend::ONEDNN); static_cast<const phi::DenseTensor&>(t).meta().use_cudnn) {
break; backend_set = backend_set | BackendSet(Backend::GPUDNN);
default:
// do nothing
break;
} }
return backend_set; return backend_set;
} }
...@@ -126,7 +123,13 @@ Backend ParseBackend(const Place& place) { ...@@ -126,7 +123,13 @@ Backend ParseBackend(const Place& place) {
return phi::TransToPhiBackend(place); return phi::TransToPhiBackend(place);
} }
Backend ParseBackend(const Tensor& tensor) { Backend ParseBackend(const Tensor& tensor) {
return phi::TransToPhiBackend(tensor.place()); Backend backend_key = phi::TransToPhiBackend(tensor.place());
if (backend_key == Backend::GPU &&
phi::DenseTensor::classof(tensor.impl().get()) &&
static_cast<phi::DenseTensor*>(tensor.impl().get())->meta().use_cudnn) {
return Backend::GPUDNN;
}
return backend_key;
} }
Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) { Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) {
......
...@@ -90,6 +90,7 @@ struct ArgsIterator { ...@@ -90,6 +90,7 @@ struct ArgsIterator {
struct KernelKeyParser : ArgsIterator<KernelKeyParser> { struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
KernelKeySet key_set; KernelKeySet key_set;
bool disable_cudnn = false;
// this dtype_set is used for cache multi-inputs dtype and used for // this dtype_set is used for cache multi-inputs dtype and used for
// data_promote // data_promote
DataTypeSet dtype_set{DataType::UNDEFINED}; DataTypeSet dtype_set{DataType::UNDEFINED};
...@@ -97,11 +98,19 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> { ...@@ -97,11 +98,19 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
// TODO(chenweihang): deal with multiple diff input Tensors // TODO(chenweihang): deal with multiple diff input Tensors
// TODO(chenweihang): add global device guard method to set backend // TODO(chenweihang): add global device guard method to set backend
inline void AssignKernelKeySet(const phi::TensorBase& tensor) { inline void AssignKernelKeySet(const phi::TensorBase& tensor) {
key_set.backend_set = // assign Backend
key_set.backend_set | detail::GetTensorBackendSet(tensor); BackendSet tensor_backend_set = detail::GetTensorBackendSet(tensor);
key_set.backend_set = key_set.backend_set | tensor_backend_set;
// tensor's attribute use_cudnn=False, explicitly disable cudnn kernel
if (tensor_backend_set == BackendSet(Backend::GPU) || disable_cudnn) {
disable_cudnn = true;
key_set.backend_set = key_set.backend_set - BackendSet(Backend::GPUDNN);
}
// assign DataLayout
phi::DataLayout tensor_layout = tensor.layout(); phi::DataLayout tensor_layout = tensor.layout();
key_set.layout = key_set.layout =
tensor_layout > key_set.layout ? tensor_layout : key_set.layout; tensor_layout > key_set.layout ? tensor_layout : key_set.layout;
// assign DataType
key_set.dtype = tensor.dtype(); key_set.dtype = tensor.dtype();
dtype_set = dtype_set | DataTypeSet(key_set.dtype); dtype_set = dtype_set | DataTypeSet(key_set.dtype);
auto promote_result = PromoteTypes(dtype_set); auto promote_result = PromoteTypes(dtype_set);
......
...@@ -307,7 +307,6 @@ class BaseAPI: ...@@ -307,7 +307,6 @@ class BaseAPI:
'backend': None, 'backend': None,
'layout': None, 'layout': None,
'data_type': None, 'data_type': None,
'use_gpudnn': 'false',
'dispatch': {}, 'dispatch': {},
} }
if 'backend' in kernel_config and len(kernel_config['backend']) > 0: if 'backend' in kernel_config and len(kernel_config['backend']) > 0:
...@@ -318,10 +317,6 @@ class BaseAPI: ...@@ -318,10 +317,6 @@ class BaseAPI:
kernel['data_type'] = kernel_config['data_type'] kernel['data_type'] = kernel_config['data_type']
if 'param' in kernel_config: if 'param' in kernel_config:
kernel['param'] = kernel_config['param'] kernel['param'] = kernel_config['param']
if 'use_gpudnn' in kernel_config:
kernel['use_gpudnn'] = kernel_config['use_gpudnn']
if isinstance(kernel['use_gpudnn'], bool):
kernel['use_gpudnn'] = str(kernel['use_gpudnn']).lower()
kernel_funcs = re.compile(r'([a-zA-Z0-9_]+)\s*({[^}]+})?').findall( kernel_funcs = re.compile(r'([a-zA-Z0-9_]+)\s*({[^}]+})?').findall(
kernel_config['func'] kernel_config['func']
) )
...@@ -1124,15 +1119,10 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d ...@@ -1124,15 +1119,10 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
for kernel_out in outputs_args: for kernel_out in outputs_args:
fallback_kernel_output_trans += f""" fallback_kernel_output_trans += f"""
{code_indent} TransDataBackend({kernel_out}, kernel_backend, {kernel_out});""" {code_indent} TransDataBackend({kernel_out}, kernel_backend, {kernel_out});"""
cudnn_args = (
''
if self.kernel['use_gpudnn'] == 'false'
else ', ' + self.kernel['use_gpudnn']
)
return f""" return f"""
{code_indent} VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; {code_indent} VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
{code_indent} auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
{code_indent} "{kernel_name}", {{kernel_backend, kernel_layout, kernel_data_type}}{cudnn_args}); {code_indent} "{kernel_name}", {{kernel_backend, kernel_layout, kernel_data_type}});
{code_indent} const auto& kernel = kernel_result.kernel; {code_indent} const auto& kernel = kernel_result.kernel;
{code_indent} VLOG(6) << "{kernel_name} kernel: " << kernel; {code_indent} VLOG(6) << "{kernel_name} kernel: " << kernel;
{code_indent} auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend); {code_indent} auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);
......
...@@ -67,8 +67,8 @@ ...@@ -67,8 +67,8 @@
func : addmm_grad func : addmm_grad
- backward_op : affine_grid_grad - backward_op : affine_grid_grad
forward : affine_grid (Tensor input, IntArray outputShape, bool align_corners=true, bool use_cudnn=true) -> Tensor(output) forward : affine_grid (Tensor input, IntArray outputShape, bool align_corners=true) -> Tensor(output)
args : (Tensor output_grad, IntArray outputShape, bool use_cudnn=true, bool align_corners=true) args : (Tensor input, Tensor output_grad, IntArray outputShape, bool align_corners=true)
output : Tensor(input_grad) output : Tensor(input_grad)
infer_meta : infer_meta :
func : AffineGridGradInferMeta func : AffineGridGradInferMeta
...@@ -76,7 +76,7 @@ ...@@ -76,7 +76,7 @@
kernel : kernel :
func : affine_grid_grad func : affine_grid_grad
param : [output_grad, outputShape, align_corners] param : [output_grad, outputShape, align_corners]
use_gpudnn: use_cudnn no_need_buffer : input
- backward_op : amax_grad - backward_op : amax_grad
forward: amax (Tensor x, int64_t[] axis={}, bool keepdim=false) -> Tensor(out) forward: amax (Tensor x, int64_t[] axis={}, bool keepdim=false) -> Tensor(out)
...@@ -262,7 +262,6 @@ ...@@ -262,7 +262,6 @@
param : [input, filter] param : [input, filter]
kernel : kernel :
func : conv2d_grad func : conv2d_grad
use_gpudnn : true
backward : conv2d_grad_grad backward : conv2d_grad_grad
- backward_op : conv2d_grad_grad - backward_op : conv2d_grad_grad
...@@ -274,7 +273,6 @@ ...@@ -274,7 +273,6 @@
param: [input, filter, grad_out] param: [input, filter, grad_out]
kernel : kernel :
func : conv2d_grad_grad func : conv2d_grad_grad
use_gpudnn : true
optional : grad_input_grad, grad_filter_grad optional : grad_input_grad, grad_filter_grad
- backward_op : conv2d_transpose_double_grad - backward_op : conv2d_transpose_double_grad
...@@ -285,7 +283,6 @@ ...@@ -285,7 +283,6 @@
func : Conv2dTransposeDoubleGradInferMeta func : Conv2dTransposeDoubleGradInferMeta
kernel : kernel :
func : conv2d_transpose_grad_grad func : conv2d_transpose_grad_grad
use_gpudnn : true
- backward_op : conv2d_transpose_grad - backward_op : conv2d_transpose_grad
forward : conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) forward : conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out)
...@@ -295,7 +292,6 @@ ...@@ -295,7 +292,6 @@
func : Conv2dTransposeGradInferMeta func : Conv2dTransposeGradInferMeta
kernel : kernel :
func : conv2d_transpose_grad func : conv2d_transpose_grad
use_gpudnn : true
backward : conv2d_transpose_double_grad backward : conv2d_transpose_double_grad
- backward_op : conv3d_double_grad - backward_op : conv3d_double_grad
...@@ -307,7 +303,6 @@ ...@@ -307,7 +303,6 @@
param: [input, filter, grad_out] param: [input, filter, grad_out]
kernel : kernel :
func : conv3d_double_grad func : conv3d_double_grad
use_gpudnn : true
optional : grad_input_grad, grad_filter_grad optional : grad_input_grad, grad_filter_grad
- backward_op : conv3d_grad - backward_op : conv3d_grad
...@@ -319,7 +314,6 @@ ...@@ -319,7 +314,6 @@
param : [input, filter] param : [input, filter]
kernel : kernel :
func : conv3d_grad func : conv3d_grad
use_gpudnn : true
backward : conv3d_double_grad backward : conv3d_double_grad
- backward_op : conv3d_transpose_grad - backward_op : conv3d_transpose_grad
...@@ -330,7 +324,6 @@ ...@@ -330,7 +324,6 @@
func : ConvTransposeGradInferMeta func : ConvTransposeGradInferMeta
kernel : kernel :
func : conv3d_transpose_grad func : conv3d_transpose_grad
use_gpudnn : true
- backward_op : crop_grad - backward_op : crop_grad
forward : crop_tensor (Tensor x, IntArray shape, IntArray offsets) -> Tensor(out) forward : crop_tensor (Tensor x, IntArray shape, IntArray offsets) -> Tensor(out)
...@@ -401,7 +394,6 @@ ...@@ -401,7 +394,6 @@
kernel : kernel :
func : depthwise_conv2d_grad func : depthwise_conv2d_grad
param : [input, filter, out_grad, strides, paddings, padding_algorithm, groups, dilations, data_format] param : [input, filter, out_grad, strides, paddings, padding_algorithm, groups, dilations, data_format]
use_gpudnn : True
backward : depthwise_conv2d_double_grad backward : depthwise_conv2d_double_grad
- backward_op : depthwise_conv2d_transpose_grad - backward_op : depthwise_conv2d_transpose_grad
...@@ -1210,8 +1202,8 @@ ...@@ -1210,8 +1202,8 @@
func : pixel_shuffle_grad func : pixel_shuffle_grad
- backward_op : pool2d_double_grad - backward_op : pool2d_double_grad
forward : pool2d_grad(Tensor x, Tensor out, Tensor grad_out, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(grad_x) forward : pool2d_grad(Tensor x, Tensor out, Tensor grad_out, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(grad_x)
args : (Tensor grad_x_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) args : (Tensor x, Tensor grad_x_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm)
output : Tensor(grad_out_grad) output : Tensor(grad_out_grad)
infer_meta : infer_meta :
func : Pool2DInferMeta func : Pool2DInferMeta
...@@ -1219,11 +1211,11 @@ ...@@ -1219,11 +1211,11 @@
kernel : kernel :
func : pool2d_double_grad func : pool2d_double_grad
param : [grad_x_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] param : [grad_x_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm]
use_gpudnn : use_gpudnn no_need_buffer : x
- backward_op : pool2d_grad - backward_op : pool2d_grad
forward : pool2d(Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(out) forward : pool2d(Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out)
args : (Tensor x, Tensor out, Tensor out_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) args : (Tensor x, Tensor out, Tensor out_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm)
output : Tensor(x_grad) output : Tensor(x_grad)
infer_meta : infer_meta :
func : UnchangedInferMeta func : UnchangedInferMeta
...@@ -1231,12 +1223,11 @@ ...@@ -1231,12 +1223,11 @@
kernel : kernel :
func : pool2d_grad func : pool2d_grad
param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm]
use_gpudnn : use_gpudnn
backward : pool2d_double_grad backward : pool2d_double_grad
- backward_op : pool3d_grad - backward_op : pool3d_grad
forward : pool3d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(out) forward : pool3d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out)
args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm)
output : Tensor(x_grad) output : Tensor(x_grad)
infer_meta : infer_meta :
func : UnchangedInferMeta func : UnchangedInferMeta
...@@ -1244,7 +1235,6 @@ ...@@ -1244,7 +1235,6 @@
kernel : kernel :
func : pool3d_grad func : pool3d_grad
param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm]
use_gpudnn : use_gpudnn
- backward_op : pow_double_grad - backward_op : pow_double_grad
forward : pow_grad(Tensor x, Tensor grad_out, Scalar y) -> Tensor(grad_x) forward : pow_grad(Tensor x, Tensor grad_out, Scalar y) -> Tensor(grad_x)
...@@ -1601,7 +1591,6 @@ ...@@ -1601,7 +1591,6 @@
param : [out] param : [out]
kernel : kernel :
func : softmax_grad func : softmax_grad
use_gpudnn : true
- backward_op : spectral_norm_grad - backward_op : spectral_norm_grad
forward : spectral_norm (Tensor weight, Tensor u, Tensor v, int dim, int power_iters, float eps) -> Tensor(out) forward : spectral_norm (Tensor weight, Tensor u, Tensor v, int dim, int power_iters, float eps) -> Tensor(out)
......
...@@ -97,7 +97,7 @@ ...@@ -97,7 +97,7 @@
backward : addmm_grad backward : addmm_grad
- op : affine_grid - op : affine_grid
args : (Tensor input, IntArray outputShape, bool align_corners=true, bool use_cudnn=true) args : (Tensor input, IntArray outputShape, bool align_corners=true)
output : Tensor output : Tensor
infer_meta : infer_meta :
func : AffineGridInferMeta func : AffineGridInferMeta
...@@ -106,7 +106,6 @@ ...@@ -106,7 +106,6 @@
func : affine_grid func : affine_grid
param : [input, outputShape, align_corners] param : [input, outputShape, align_corners]
data_type : input data_type : input
use_gpudnn: use_cudnn
backward : affine_grid_grad backward : affine_grid_grad
- op : all - op : all
...@@ -431,7 +430,6 @@ ...@@ -431,7 +430,6 @@
func : ConvInferMeta func : ConvInferMeta
kernel : kernel :
func : conv2d func : conv2d
use_gpudnn : true
backward : conv2d_grad backward : conv2d_grad
- op : conv2d_transpose - op : conv2d_transpose
...@@ -441,7 +439,6 @@ ...@@ -441,7 +439,6 @@
func : Conv2dTransposeInferMeta func : Conv2dTransposeInferMeta
kernel : kernel :
func : conv2d_transpose func : conv2d_transpose
use_gpudnn : true
backward : conv2d_transpose_grad backward : conv2d_transpose_grad
- op : conv3d - op : conv3d
...@@ -451,7 +448,6 @@ ...@@ -451,7 +448,6 @@
func : Conv3DInferMeta func : Conv3DInferMeta
kernel : kernel :
func : conv3d func : conv3d
use_gpudnn : true
backward : conv3d_grad backward : conv3d_grad
- op : conv3d_transpose - op : conv3d_transpose
...@@ -461,7 +457,6 @@ ...@@ -461,7 +457,6 @@
func : ConvTransposeInferMeta func : ConvTransposeInferMeta
kernel : kernel :
func : conv3d_transpose func : conv3d_transpose
use_gpudnn : true
backward : conv3d_transpose_grad backward : conv3d_transpose_grad
- op : copy_to - op : copy_to
...@@ -540,7 +535,6 @@ ...@@ -540,7 +535,6 @@
kernel : kernel :
func : depthwise_conv2d func : depthwise_conv2d
param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format] param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format]
use_gpudnn : true
backward : depthwise_conv2d_grad backward : depthwise_conv2d_grad
- op : depthwise_conv2d_transpose - op : depthwise_conv2d_transpose
...@@ -1636,7 +1630,7 @@ ...@@ -1636,7 +1630,7 @@
backward : pixel_shuffle_grad backward : pixel_shuffle_grad
- op : pool2d - op : pool2d
args : (Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) args : (Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm)
output : Tensor(out) output : Tensor(out)
infer_meta : infer_meta :
func : Pool2DInferMeta func : Pool2DInferMeta
...@@ -1644,11 +1638,10 @@ ...@@ -1644,11 +1638,10 @@
kernel : kernel :
func : pool2d func : pool2d
param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm]
use_gpudnn : use_gpudnn
backward : pool2d_grad backward : pool2d_grad
- op : pool3d - op : pool3d
args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm)
output : Tensor(out) output : Tensor(out)
infer_meta : infer_meta :
func : PoolInferMeta func : PoolInferMeta
...@@ -1656,7 +1649,6 @@ ...@@ -1656,7 +1649,6 @@
kernel : kernel :
func : pool3d func : pool3d
param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm]
use_gpudnn : use_gpudnn
backward : pool3d_grad backward : pool3d_grad
- op : pow - op : pow
...@@ -2048,7 +2040,6 @@ ...@@ -2048,7 +2040,6 @@
func : SoftmaxInferMeta func : SoftmaxInferMeta
kernel : kernel :
func : softmax func : softmax
use_gpudnn : true
inplace : (x -> out) inplace : (x -> out)
backward : softmax_grad backward : softmax_grad
......
...@@ -200,9 +200,10 @@ void DenseTensor::set_meta(const DenseTensorMeta& meta) { ...@@ -200,9 +200,10 @@ void DenseTensor::set_meta(const DenseTensorMeta& meta) {
meta_.layout = meta.layout; meta_.layout = meta.layout;
meta_.lod = meta.lod; meta_.lod = meta.lod;
meta_.offset = meta.offset; meta_.offset = meta.offset;
meta_.use_cudnn = meta.use_cudnn;
} }
/* @jim19930609: This interface will be further modified util we finalized the /* @jim19930609: This interface will be further modified until we finalized the
design for Allocator - Allocation design for Allocator - Allocation
For now, we have to temporarily accommodate two independent use cases: For now, we have to temporarily accommodate two independent use cases:
1. Designed behaviour: DenseTensor constructed with its underlying storage_ 1. Designed behaviour: DenseTensor constructed with its underlying storage_
......
...@@ -357,6 +357,7 @@ DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { ...@@ -357,6 +357,7 @@ DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) {
meta_.dtype = src.meta_.dtype; meta_.dtype = src.meta_.dtype;
meta_.layout = src.meta_.layout; meta_.layout = src.meta_.layout;
meta_.offset = src.meta_.offset; meta_.offset = src.meta_.offset;
meta_.use_cudnn = src.meta_.use_cudnn;
storage_properties_ = storage_properties_ =
std::move(CopyStorageProperties(src.storage_properties_)); std::move(CopyStorageProperties(src.storage_properties_));
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
......
...@@ -106,17 +106,16 @@ bool KernelFactory::HasKernel(const std::string& kernel_name, ...@@ -106,17 +106,16 @@ bool KernelFactory::HasKernel(const std::string& kernel_name,
} }
KernelResult KernelFactory::SelectKernelOrThrowError( KernelResult KernelFactory::SelectKernelOrThrowError(
const std::string& kernel_name, const std::string& kernel_name, const KernelKey& const_kernel_key) const {
const KernelKey& kernel_key,
bool use_gpudnn) const {
auto iter = kernels_.find(kernel_name); auto iter = kernels_.find(kernel_name);
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
iter, iter,
kernels_.end(), kernels_.end(),
phi::errors::NotFound("The kernel `%s` is not registered.", kernel_name)); phi::errors::NotFound("The kernel `%s` is not registered.", kernel_name));
KernelKey kernel_key = const_kernel_key;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (use_gpudnn && kernel_key.backend() == Backend::GPU) { if (kernel_key.backend() == Backend::GPUDNN) {
auto kernel_iter = iter->second.find( auto kernel_iter = iter->second.find(
{Backend::GPUDNN, kernel_key.layout(), kernel_key.dtype()}); {Backend::GPUDNN, kernel_key.layout(), kernel_key.dtype()});
if (kernel_iter == iter->second.end() && if (kernel_iter == iter->second.end() &&
...@@ -127,8 +126,8 @@ KernelResult KernelFactory::SelectKernelOrThrowError( ...@@ -127,8 +126,8 @@ KernelResult KernelFactory::SelectKernelOrThrowError(
if (kernel_iter != iter->second.end()) { if (kernel_iter != iter->second.end()) {
return {kernel_iter->second, false}; return {kernel_iter->second, false};
} }
VLOG(3) << "The cudnn kernel for [" << kernel_name kernel_key =
<< "] is not registered."; KernelKey(Backend::GPU, kernel_key.layout(), kernel_key.dtype());
} }
#endif #endif
auto kernel_iter = iter->second.find(kernel_key); auto kernel_iter = iter->second.find(kernel_key);
......
...@@ -274,8 +274,7 @@ class KernelFactory { ...@@ -274,8 +274,7 @@ class KernelFactory {
bool HasCompatiblePhiKernel(const std::string& op_type) const; bool HasCompatiblePhiKernel(const std::string& op_type) const;
KernelResult SelectKernelOrThrowError(const std::string& kernel_name, KernelResult SelectKernelOrThrowError(const std::string& kernel_name,
const KernelKey& kernel_key, const KernelKey& kernel_key) const;
bool use_gpudnn = false) const;
bool HasKernel(const std::string& kernel_name, bool HasKernel(const std::string& kernel_name,
const KernelKey& kernel_key) const; const KernelKey& kernel_key) const;
......
...@@ -16,21 +16,29 @@ limitations under the License. */ ...@@ -16,21 +16,29 @@ limitations under the License. */
namespace phi { namespace phi {
DenseTensorMeta::DenseTensorMeta() { use_cudnn = true; }
DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims) DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims)
: dims(dims), dtype(dtype) {} : dims(dims), dtype(dtype) {
use_cudnn = true;
}
DenseTensorMeta::DenseTensorMeta(DataType dtype, DenseTensorMeta::DenseTensorMeta(DataType dtype,
const DDim& dims, const DDim& dims,
DataLayout layout, DataLayout layout,
size_t offset) size_t offset)
: dims(dims), dtype(dtype), layout(layout), offset(offset) {} : dims(dims), dtype(dtype), layout(layout), offset(offset) {
use_cudnn = true;
}
DenseTensorMeta::DenseTensorMeta(DataType dtype, DenseTensorMeta::DenseTensorMeta(DataType dtype,
const DDim& dims, const DDim& dims,
DataLayout layout, DataLayout layout,
const LoD& lod, const LoD& lod,
size_t offset) size_t offset)
: dims(dims), dtype(dtype), layout(layout), lod(lod), offset(offset) {} : dims(dims), dtype(dtype), layout(layout), lod(lod), offset(offset) {
use_cudnn = true;
}
bool DenseTensorMeta::valid() const noexcept { bool DenseTensorMeta::valid() const noexcept {
bool valid{true}; bool valid{true};
......
...@@ -48,7 +48,7 @@ using LoD = std::vector<std::vector<size_t>>; ...@@ -48,7 +48,7 @@ using LoD = std::vector<std::vector<size_t>>;
struct DenseTensorMeta { struct DenseTensorMeta {
using DataType = paddle::experimental::DataType; using DataType = paddle::experimental::DataType;
DenseTensorMeta() = default; DenseTensorMeta();
DenseTensorMeta(DataType dtype, const DDim& dims); DenseTensorMeta(DataType dtype, const DDim& dims);
DenseTensorMeta(DataType dtype, DenseTensorMeta(DataType dtype,
const DDim& dims, const DDim& dims,
...@@ -65,6 +65,9 @@ struct DenseTensorMeta { ...@@ -65,6 +65,9 @@ struct DenseTensorMeta {
bool valid() const noexcept; bool valid() const noexcept;
bool is_scalar{false}; bool is_scalar{false};
/// \brief Determine whether using CuDNN speed-up library in the new dygraph.
/// It maybe also support MKLDNN library in the near future.
bool use_cudnn{true};
DDim dims; DDim dims;
DataType dtype{DataType::UNDEFINED}; DataType dtype{DataType::UNDEFINED};
DataLayout layout{DataLayout::NCHW}; DataLayout layout{DataLayout::NCHW};
...@@ -73,9 +76,10 @@ struct DenseTensorMeta { ...@@ -73,9 +76,10 @@ struct DenseTensorMeta {
}; };
inline bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) { inline bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) {
return (lhs.is_scalar == rhs.is_scalar) && (lhs.dims == rhs.dims) && return (lhs.is_scalar == rhs.is_scalar) && lhs.use_cudnn == rhs.use_cudnn &&
(lhs.dtype == rhs.dtype) && (lhs.layout == rhs.layout) && (lhs.dims == rhs.dims) && (lhs.dtype == rhs.dtype) &&
(lhs.lod == rhs.lod) && (lhs.offset == rhs.offset); (lhs.layout == rhs.layout) && (lhs.lod == rhs.lod) &&
(lhs.offset == rhs.offset);
} }
struct StringTensorMeta { struct StringTensorMeta {
......
...@@ -672,6 +672,7 @@ class Pool2D(layers.Layer): ...@@ -672,6 +672,7 @@ class Pool2D(layers.Layer):
def forward(self, input): def forward(self, input):
if _non_static_mode(): if _non_static_mode():
if not self._use_mkldnn and in_dygraph_mode(): if not self._use_mkldnn and in_dygraph_mode():
input = input._use_cudnn(self._use_cudnn)
return _C_ops.pool2d( return _C_ops.pool2d(
input, input,
self._pool_size, self._pool_size,
...@@ -684,7 +685,6 @@ class Pool2D(layers.Layer): ...@@ -684,7 +685,6 @@ class Pool2D(layers.Layer):
self._global_pooling, self._global_pooling,
False, False,
"EXPLICIT", "EXPLICIT",
self._use_cudnn,
) )
attrs = ( attrs = (
......
...@@ -880,6 +880,10 @@ def monkey_patch_varbase(): ...@@ -880,6 +880,10 @@ def monkey_patch_varbase():
def _clear_data(self): def _clear_data(self):
self.get_tensor()._clear() self.get_tensor()._clear()
@framework.dygraph_only
def _use_cudnn(self, use_cudnn=True):
    # Toggle whether phi DenseTensor kernels may use the cuDNN backend for
    # this tensor, delegating to the C++ pybind method `_tensor_use_cudnn`.
    # Only valid in dygraph (eager) mode, enforced by the decorator.
    #
    # NOTE(review): based on the test at `test_use_cudnn`, this appears to
    # return a tensor holding the same data (x.numpy() == y.numpy()) with
    # only the use_cudnn flag changed — confirm against the C++ binding.
    return self._tensor_use_cudnn(use_cudnn)
@framework.dygraph_only @framework.dygraph_only
def _uva(self, device_id=0): def _uva(self, device_id=0):
''' '''
...@@ -1064,6 +1068,7 @@ def monkey_patch_varbase(): ...@@ -1064,6 +1068,7 @@ def monkey_patch_varbase():
setattr(core.eager.Tensor, "_uva", _uva) setattr(core.eager.Tensor, "_uva", _uva)
setattr(core.eager.Tensor, "_clear_data", _clear_data) setattr(core.eager.Tensor, "_clear_data", _clear_data)
setattr(core.eager.Tensor, "__hash__", __hash__) setattr(core.eager.Tensor, "__hash__", __hash__)
setattr(core.eager.Tensor, "_use_cudnn", _use_cudnn)
else: else:
setattr(core.VarBase, "__name__", "Tensor") setattr(core.VarBase, "__name__", "Tensor")
setattr(core.VarBase, "grad", grad) setattr(core.VarBase, "grad", grad)
......
...@@ -2196,6 +2196,7 @@ def pool2d( ...@@ -2196,6 +2196,7 @@ def pool2d(
pool_padding = update_padding(pool_padding, data_format) pool_padding = update_padding(pool_padding, data_format)
if in_dygraph_mode(): if in_dygraph_mode():
input = input._use_cudnn(use_cudnn)
return _C_ops.pool2d( return _C_ops.pool2d(
input, input,
pool_size, pool_size,
...@@ -2208,7 +2209,6 @@ def pool2d( ...@@ -2208,7 +2209,6 @@ def pool2d(
global_pooling, global_pooling,
False, False,
padding_algorithm, padding_algorithm,
use_cudnn,
) )
op_type = 'pool2d' op_type = 'pool2d'
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
......
...@@ -897,6 +897,21 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): ...@@ -897,6 +897,21 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
x._clear() x._clear()
self.assertFalse(x._is_initialized()) self.assertFalse(x._is_initialized())
def test_use_cudnn(self):
    # Verify the eager-mode-only `_use_cudnn` tensor method:
    #  - inside the eager guard it returns a tensor with identical data
    #    (only the cuDNN flag differs), for both False and True;
    #  - outside eager mode the method is absent and raises AttributeError.
    np_x = np.random.random((3, 8, 8))
    with _test_eager_guard():
        self.assertTrue(in_dygraph_mode())
        x = paddle.to_tensor(np_x, dtype="float64")
        # Flipping the flag must not change the tensor's contents.
        y = x._use_cudnn(False)
        np.testing.assert_array_equal(x.numpy(), y.numpy())
        y = x._use_cudnn(True)
        np.testing.assert_array_equal(x.numpy(), y.numpy())
    # Outside the guard we are back in legacy (non-eager) mode, where the
    # monkey-patched `_use_cudnn` is not installed on VarBase.
    self.assertFalse(in_dygraph_mode())
    x = paddle.to_tensor(np_x, dtype="float64")
    with self.assertRaises(AttributeError):
        x = x._use_cudnn(False)
class EagerParamBaseUsageTestCase(unittest.TestCase): class EagerParamBaseUsageTestCase(unittest.TestCase):
def test_print(self): def test_print(self):
......
...@@ -258,7 +258,6 @@ def avg_pool1d( ...@@ -258,7 +258,6 @@ def avg_pool1d(
False, False,
False, False,
padding_algorithm, padding_algorithm,
True,
) )
return squeeze(output, [2]) return squeeze(output, [2])
...@@ -407,7 +406,6 @@ def avg_pool2d( ...@@ -407,7 +406,6 @@ def avg_pool2d(
False, False,
False, False,
padding_algorithm, padding_algorithm,
True,
) )
else: else:
output = _legacy_C_ops.pool2d( output = _legacy_C_ops.pool2d(
...@@ -561,7 +559,6 @@ def avg_pool3d( ...@@ -561,7 +559,6 @@ def avg_pool3d(
False, False,
False, False,
padding_algorithm, padding_algorithm,
True,
) )
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
pool_out = _legacy_C_ops.pool3d( pool_out = _legacy_C_ops.pool3d(
...@@ -718,7 +715,6 @@ def max_pool1d( ...@@ -718,7 +715,6 @@ def max_pool1d(
False, False,
False, False,
padding_algorithm, padding_algorithm,
True,
) )
return squeeze(pool_out, [2]) return squeeze(pool_out, [2])
...@@ -1363,7 +1359,6 @@ def max_pool2d( ...@@ -1363,7 +1359,6 @@ def max_pool2d(
False, False,
False, False,
padding_algorithm, padding_algorithm,
True,
) )
if _in_legacy_dygraph(): if _in_legacy_dygraph():
...@@ -1554,7 +1549,6 @@ def max_pool3d( ...@@ -1554,7 +1549,6 @@ def max_pool3d(
False, False,
False, False,
padding_algorithm, padding_algorithm,
True,
) )
if _in_legacy_dygraph(): if _in_legacy_dygraph():
...@@ -1691,6 +1685,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): ...@@ -1691,6 +1685,7 @@ def adaptive_avg_pool1d(x, output_size, name=None):
x = unsqueeze(x, [2]) x = unsqueeze(x, [2])
if in_dygraph_mode(): if in_dygraph_mode():
x = x._use_cudnn(False)
pool_out = _C_ops.pool2d( pool_out = _C_ops.pool2d(
x, x,
pool_size, pool_size,
...@@ -1703,7 +1698,6 @@ def adaptive_avg_pool1d(x, output_size, name=None): ...@@ -1703,7 +1698,6 @@ def adaptive_avg_pool1d(x, output_size, name=None):
False, False,
True, True,
"EXPLICIT", "EXPLICIT",
False,
) )
return squeeze(pool_out, [2]) return squeeze(pool_out, [2])
if _in_legacy_dygraph(): if _in_legacy_dygraph():
...@@ -1828,6 +1822,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): ...@@ -1828,6 +1822,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
output_size = utils._convert_to_tensor_list(output_size) output_size = utils._convert_to_tensor_list(output_size)
if in_dygraph_mode(): if in_dygraph_mode():
x = x._use_cudnn(False)
return _C_ops.pool2d( return _C_ops.pool2d(
x, x,
output_size, output_size,
...@@ -1840,7 +1835,6 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): ...@@ -1840,7 +1835,6 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
False, False,
True, True,
"EXPLICIT", "EXPLICIT",
False,
) )
if _in_legacy_dygraph(): if _in_legacy_dygraph():
...@@ -1973,6 +1967,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): ...@@ -1973,6 +1967,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
output_size[2] = in_w output_size[2] = in_w
if in_dygraph_mode(): if in_dygraph_mode():
x = x._use_cudnn(False)
return _C_ops.pool3d( return _C_ops.pool3d(
x, x,
output_size, output_size,
...@@ -1985,7 +1980,6 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): ...@@ -1985,7 +1980,6 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
False, False,
True, True,
"EXPLICIT", "EXPLICIT",
False,
) )
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
return _legacy_C_ops.pool3d( return _legacy_C_ops.pool3d(
......
...@@ -92,7 +92,8 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): ...@@ -92,7 +92,8 @@ def affine_grid(theta, out_shape, align_corners=True, name=None):
if isinstance(out_shape, Variable) if isinstance(out_shape, Variable)
else out_shape else out_shape
) )
return _C_ops.affine_grid(theta, _out_shape, align_corners, use_cudnn) theta = theta._use_cudnn(use_cudnn)
return _C_ops.affine_grid(theta, _out_shape, align_corners)
elif in_dynamic_mode(): elif in_dynamic_mode():
_out_shape = ( _out_shape = (
out_shape.numpy().tolist() out_shape.numpy().tolist()
......
Markdown is supported
0% loaded.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register to comment.