Commit b77f9fbf authored by zchen0211

deconv2d cudnn

Parent a349bee6
@@ -12,7 +12,6 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "glog/logging.h"
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memory.h"
@@ -69,13 +68,6 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
 
-    int input_channels = input->dims()[1];    // M
-    int input_height = input->dims()[2];      // H
-    int input_width = input->dims()[3];       // W
-    int output_channels = output->dims()[1];  // C
-    int output_height = output->dims()[2];    // O_H
-    int output_width = output->dims()[3];     // O_W
-
     // ------------------- cudnn conv workspace ---------------------
     void* cudnn_workspace = nullptr;
     size_t workspace_size_in_bytes;  // final workspace to allocate.
@@ -118,7 +110,6 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
   }
 };
 
-/*
 template <typename T>
 class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
  public:
@@ -130,7 +121,6 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
     auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-
     const T* input_data = input->data<T>();
     const T* output_grad_data = output_grad->data<T>();
     const T* filter_data = filter->data<T>();
@@ -138,47 +128,33 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
 
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
-    ScopedTensorDescriptor output_grad_desc;
-    ScopedTensorDescriptor input_grad_desc;
+    ScopedTensorDescriptor output_desc;
     ScopedFilterDescriptor filter_desc;
-    ScopedFilterDescriptor filter_grad_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
 
+    // Input: (N, M, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()), groups);
-    cudnnTensorDescriptor_t cudnn_output_grad_desc =
-        output_grad_desc.descriptor<T>(
-            layout, framework::vectorize2int(output_grad->dims()), groups);
+        layout, framework::vectorize2int(input->dims()));
+    // Output: (N, C, O_H, O_W)
+    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+        layout, framework::vectorize2int(output_grad->dims()));
+    // Filter (M, C, K_H, K_W)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
-    cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;
+        layout, framework::vectorize2int(filter->dims()));
 
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
 
-    int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
-    int output_grad_channels = filter->dims()[0];
-    int output_grad_height = output_grad->dims()[2];
-    int output_grad_width = output_grad->dims()[3];
-
-    int group_offset_in = input_channels / groups * input_height * input_width;
-    int group_offset_out =
-        output_grad_channels / groups * output_grad_height * output_grad_width;
-    int group_offset_filter = filter->numel() / groups;
-
     // ------------------- cudnn backward algorithm ---------------------
-    cudnnConvolutionBwdDataAlgo_t data_algo;
+    cudnnConvolutionFwdAlgo_t data_algo;
     cudnnConvolutionBwdFilterAlgo_t filter_algo;
-    size_t workspace_size_in_bytes = 0, tmp_size = 0;
+    size_t bwd_filter_ws_size, fwd_ws_size;
+    size_t workspace_size_in_bytes = 0;
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
     if (user_workspace_size > 0) {
       workspace_size_limit = user_workspace_size * 1024 * 1024;
@@ -186,42 +162,35 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     auto handle = ctx.cuda_device_context().cudnn_handle();
     if (input_grad) {
-      cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(input_grad->dims()), groups);
-      PADDLE_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-              handle, cudnn_filter_desc,
-              // dyDesc: Handle to the previously initialized input differential
-              // tensor descriptor.
-              cudnn_output_grad_desc, cudnn_conv_desc,
-              // dxDesc: Handle to the previously initialized output tensor
-              // descriptor.
-              cudnn_input_grad_desc,
-              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &data_algo));
-      PADDLE_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-              handle, cudnn_filter_desc, cudnn_output_grad_desc,
-              cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size));
-      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
+      // choose backward algorithm for data
+      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+          cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+          workspace_size_limit, &data_algo));
+      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+          cudnn_input_desc, data_algo, &fwd_ws_size));
+      workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
     }
 
     if (filter_grad) {
-      cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(filter_grad->dims()), groups);
+      // choose backward algorithm for filter
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
+              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
              cudnn_filter_desc,
              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit, &filter_algo));
+
+      // get workspace for backwards filter algorithm
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
-              cudnn_filter_desc, filter_algo, &tmp_size));
-      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
+              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
+              cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
+      workspace_size_in_bytes =
+          std::max(workspace_size_in_bytes, bwd_filter_ws_size);
     }
 
     // ------------------- cudnn conv workspace ---------------------
     // Already on GPU
     void* cudnn_workspace = nullptr;
@@ -235,35 +204,30 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
       auto t = framework::EigenVector<T>::Flatten(*input_grad);
       t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
           t.constant(static_cast<T>(0));
-      for (int i = 0; i < groups; i++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-            handle, &alpha, cudnn_filter_desc,
-            filter_data + i * group_offset_filter, cudnn_output_grad_desc,
-            output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_input_grad_desc, input_grad_data + i * group_offset_in));
-      }
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+          handle, &alpha, cudnn_output_desc, output_grad_data,
+          cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
+          cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+          input_grad_data));
     }
 
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       auto t = framework::EigenVector<T>::Flatten(*filter_grad);
       t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
           t.constant(static_cast<T>(0));
-      for (int i = 0; i < groups; i++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
-            cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
-            cudnn_conv_desc, filter_algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_filter_grad_desc,
-            filter_grad_data + i * group_offset_filter));
-      }
+      // Gradient with respect to the filter
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+          handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
+          input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
+          workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
    }
 
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
   }
 };
-*/
 
 }  // namespace operators
 }  // namespace paddle
@@ -272,5 +236,5 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(conv2dtranspose_cudnn,
                        ops::CudnnConvTransposeOpKernel<float>);
-// REGISTER_OP_GPU_KERNEL(conv2dtranspose_cudnn_grad,
-//                        ops::CudnnConvTransposeGradOpKernel<float>);
+REGISTER_OP_GPU_KERNEL(conv2dtranspose_cudnn_grad,
+                       ops::CudnnConvTransposeGradOpKernel<float>);
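
A note on the input-gradient change above: a transposed convolution is, by construction, the adjoint of an ordinary convolution, so the gradient of its output with respect to its input is a plain forward convolution of the output gradient with the same filter. That is why the rewritten kernel picks a `cudnnConvolutionFwdAlgo_t` and calls `cudnnConvolutionForward` with `output_grad` in the data slot instead of `cudnnConvolutionBackwardData`. A minimal NumPy/SciPy sketch of that identity (single channel, stride 1, zero padding; SciPy and all variable names here are illustrative, not part of this patch):

```python
import numpy as np
from scipy.signal import convolve2d, correlate2d

rng = np.random.default_rng(0)
x = rng.standard_normal((6, 6))   # conv-transpose input, (H, W)
w = rng.standard_normal((3, 3))   # filter, (K_H, K_W)

# Forward pass of conv2d_transpose (stride 1, no padding) is a "full"
# convolution -- the adjoint of a "valid" cross-correlation.
y = convolve2d(x, w, mode='full')          # output, (O_H, O_W) = (8, 8)
g = rng.standard_normal(y.shape)           # upstream gradient dL/dy

# Gradient w.r.t. x: an ordinary forward convolution (valid
# cross-correlation) of g with the same filter -- the role played by
# cudnnConvolutionForward in the kernel above.
x_grad = correlate2d(g, w, mode='valid')   # back to (6, 6)

# Adjoint identity <conv_transpose(x), g> == <x, conv(g)> confirms it.
assert np.isclose(np.vdot(y, g), np.vdot(x, x_grad))
```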
@@ -56,27 +56,9 @@ class TestConv2dTransposeOp(OpTest):
         self.outputs = {'Output': output}
 
     def test_check_output(self):
-        print 'check output here'
+        print 'check output here for', self.op_type
         self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(
-            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
-
-    def test_check_grad_no_filter(self):
-        self.check_grad(
-            ['Input'],
-            'Output',
-            max_relative_error=0.05,
-            no_grad_set=set(['Filter']))
-
-    def test_check_grad_no_input(self):
-        self.check_grad(
-            ['Filter'],
-            'Output',
-            max_relative_error=0.05,
-            no_grad_set=set(['Input']))
-
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -88,15 +70,29 @@ class TestConv2dTransposeOp(OpTest):
     def init_op_type(self):
         self.op_type = "conv2dtranspose"
 
+    def test_check_grad_no_input(self):
+        self.check_grad(
+            ['Filter'],
+            'Output',
+            max_relative_error=0.05,
+            no_grad_set=set(['Input']))
+
+    def test_check_grad_no_filter(self):
+        self.check_grad(
+            ['Input'],
+            'Output',
+            max_relative_error=0.05,
+            no_grad_set=set(['Filter']))
+
+    def test_check_grad(self):
+        self.check_grad(
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
 
-"""
-class TestCudnn(TestConv2dOp):
-    def init_group(self):
-        self.groups = 1
 
+class TestCudnn(TestConv2dTransposeOp):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
-"""
+        self.op_type = "conv2dtranspose_cudnn"
+
 
 if __name__ == '__main__':
     unittest.main()
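
The gradient checks re-enabled above exercise both gradient paths of `conv2dtranspose_cudnn` (TestCudnn overrides only `init_op_type`, so it inherits every check). For the filter path, the kernel reuses `cudnnConvolutionBackwardFilter` with `output_grad` supplied as the data tensor and `input` as the differential. A self-contained sketch of that computation with a finite-difference spot check (again single channel, stride 1, zero padding; SciPy and the names are illustrative only):

```python
import numpy as np
from scipy.signal import convolve2d, correlate2d

rng = np.random.default_rng(1)
x = rng.standard_normal((6, 6))            # conv-transpose input
w = rng.standard_normal((3, 3))            # filter
y = convolve2d(x, w, mode='full')          # conv-transpose output
g = rng.standard_normal(y.shape)           # upstream gradient dL/dy

# Filter gradient: valid cross-correlation of g with x -- the same
# computation cudnnConvolutionBackwardFilter performs in the kernel,
# with the roles of input and output_grad swapped.
w_grad = correlate2d(g, x, mode='valid')   # (K_H, K_W) = (3, 3)

# Finite-difference spot check on one filter tap.
eps = 1e-6
w_pert = w.copy()
w_pert[1, 1] += eps
numeric = np.vdot(convolve2d(x, w_pert, mode='full') - y, g) / eps
assert np.isclose(numeric, w_grad[1, 1], atol=1e-4)
```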