diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index ef722e984c996f4267f5428d6eb6cb77d33468fb..9a3ad8eb642052a848ce5dc895005cf8c597dc7c 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -22,4 +22,6 @@ ENDIF()
 cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
     system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
-nv_test(cudnn_helper SRCS cudnn_helper.cc)
+
+nv_library(cudnn_helper SRCS cudnn_helper.cc DEPS dynload_cuda)
+nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h
index 91047236a281a53837862b5b918fdafac6ed8c70..6a43f49cfc3fbfd786017889fd00232446961c0b 100644
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#ifndef PADDLE_ONLY_CPU
 #include <cudnn.h>
+#include "glog/logging.h"
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/macros.h"
@@ -93,11 +95,11 @@ class ScopedTensorDescriptor {
     // the format is not used now, but it maybe useful feature
     std::vector<int> strides(dims.size());
     strides[dims.size() - 1] = 1;
-    for (int i = dims.size() - 1; i >= 0; i++) {
-      strides[i] = dims[i + 1] * strides[i];
+    for (int i = dims.size() - 2; i >= 0; i--) {
+      strides[i] = dims[i + 1] * strides[i + 1];
     }
-    PADDLE_ENFORCE(cudnnSetTensorNdDescriptor(desc_, type, dims.size(),
-                                              dims.data(), strides.data()));
+    PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
+        desc_, type, dims.size(), dims.data(), strides.data()));
     return desc_;
   }
 
@@ -126,8 +128,8 @@ class ScopedFilterDescriptor {
                                             const cudnnDataType_t type,
                                             const std::vector<int>& kernel) {
     // filter layout: output input spatial_dim_y spatial_dim_x
-    PADDLE_ENFORCE(cudnnSetFilterNdDescriptor(desc_, type, format,
-                                              kernel.size(), kernel.data()));
+    PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
+        desc_, type, format, kernel.size(), kernel.data()));
     return desc_;
   }
 
@@ -157,9 +159,21 @@ class ScopedConvolutionDescriptor {
       const std::vector<int>& strides, const std::vector<int>& dilations) {
     PADDLE_ENFORCE_EQ(pads.size(), strides.size());
     PADDLE_ENFORCE_EQ(pads.size(), dilations.size());
-    PADDLE_ENFORCE(cudnnSetConvolutionNdDescriptor(
+
+#if CUDNN_VERSION < 6000
+    // cuDNN v5 does not support dilated convolution: the argument is called
+    // upscale instead of dilations, and it must be one.
+    for (size_t i = 0; i < dilations.size(); ++i) {
+      PADDLE_ENFORCE_EQ(
+          dilations[i], 1,
+          "Dilations conv is not supported in this cuDNN version");
+    }
+#endif
+
+    PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
         desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
         CUDNN_CROSS_CORRELATION, type));
+    return desc_;
   }
 
   template <typename T>
@@ -184,26 +198,18 @@ class ScopedPoolingDescriptor {
   }
 
   inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode,
-                                             cudnnDataType_t type,
                                              const std::vector<int>& kernel,
                                              const std::vector<int>& pads,
                                              const std::vector<int>& strides) {
     PADDLE_ENFORCE_EQ(kernel.size(), pads.size());
     PADDLE_ENFORCE_EQ(kernel.size(), strides.size());
-    PADDLE_ENFORCE(cudnnSetPoolingNdDescriptor(
+    PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor(
         desc_, (mode == PoolingMode::kMaximum
                     ? CUDNN_POOLING_MAX
                     : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
         CUDNN_PROPAGATE_NAN,  // Always propagate nans.
         kernel.size(), kernel.data(), pads.data(), strides.data()));
-  }
-
-  template <typename T>
-  inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode,
-                                             const std::vector<int>& kernel,
-                                             const std::vector<int>& pads,
-                                             const std::vector<int>& strides) {
-    return descriptor(mode, CudnnDataType<T>::type, kernel, pads, strides);
+    return desc_;
   }
 
  private:
@@ -213,3 +219,4 @@ class ScopedPoolingDescriptor {
 
 }  // namespace platform
 }  // namespace paddle
+#endif
diff --git a/paddle/platform/cudnn_helper_test.cc b/paddle/platform/cudnn_helper_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..729f2f8a10fcb1dac5c9510cb1cf0345dc8f542f
--- /dev/null
+++ b/paddle/platform/cudnn_helper_test.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/cudnn_helper.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+TEST(CudnnHelper, ScopedTensorDescriptor) {
+  using paddle::platform::ScopedTensorDescriptor;
+  using paddle::platform::DataLayout;
+
+  ScopedTensorDescriptor tensor_desc;
+  std::vector<int> shape = {2, 4, 6, 6};
+  auto desc = tensor_desc.descriptor<float>(DataLayout::kNCHW, shape);
+  // Read the descriptor back, checking the cuDNN status of the query.
+  cudnnDataType_t type;
+  int nd;
+  std::vector<int> dims(4);
+  std::vector<int> strides(4);
+  PADDLE_ENFORCE(paddle::platform::dynload::cudnnGetTensorNdDescriptor(
+      desc, 4, &type, &nd, dims.data(), strides.data()));
+
+  EXPECT_EQ(nd, 4);
+  for (size_t i = 0; i < dims.size(); ++i) {
+    EXPECT_EQ(dims[i], shape[i]);
+  }
+  EXPECT_EQ(strides[3], 1);
+  EXPECT_EQ(strides[2], 6);
+  EXPECT_EQ(strides[1], 36);
+  EXPECT_EQ(strides[0], 144);
+}
+
+TEST(CudnnHelper, ScopedFilterDescriptor) {
+  using paddle::platform::ScopedFilterDescriptor;
+  using paddle::platform::DataLayout;
+
+  ScopedFilterDescriptor filter_desc;
+  std::vector<int> shape = {2, 3, 3};
+  auto desc = filter_desc.descriptor<float>(DataLayout::kNCHW, shape);
+  // Read the descriptor back, checking the cuDNN status of the query.
+  cudnnDataType_t type;
+  int nd;
+  cudnnTensorFormat_t format;
+  std::vector<int> kernel(3);
+  PADDLE_ENFORCE(paddle::platform::dynload::cudnnGetFilterNdDescriptor(
+      desc, 3, &type, &format, &nd, kernel.data()));
+
+  EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format);
+  EXPECT_EQ(nd, 3);
+  for (size_t i = 0; i < shape.size(); ++i) {
+    EXPECT_EQ(kernel[i], shape[i]);
+  }
+}
+
+TEST(CudnnHelper, ScopedConvolutionDescriptor) {
+  using paddle::platform::ScopedConvolutionDescriptor;
+
+  ScopedConvolutionDescriptor conv_desc;
+  std::vector<int> src_pads = {2, 2, 2};
+  std::vector<int> src_strides = {1, 1, 1};
+  std::vector<int> src_dilations = {1, 1, 1};
+  auto desc = conv_desc.descriptor<float>(src_pads, src_strides, src_dilations);
+  // Read the descriptor back, checking the cuDNN status of the query.
+  cudnnDataType_t type;
+  cudnnConvolutionMode_t mode;
+  int nd;
+  std::vector<int> pads(3);
+  std::vector<int> strides(3);
+  std::vector<int> dilations(3);
+  PADDLE_ENFORCE(paddle::platform::dynload::cudnnGetConvolutionNdDescriptor(
+      desc, 3, &nd, pads.data(), strides.data(), dilations.data(), &mode,
+      &type));
+
+  EXPECT_EQ(nd, 3);
+  for (size_t i = 0; i < src_pads.size(); ++i) {
+    EXPECT_EQ(pads[i], src_pads[i]);
+    EXPECT_EQ(strides[i], src_strides[i]);
+    EXPECT_EQ(dilations[i], src_dilations[i]);
+  }
+  EXPECT_EQ(mode, CUDNN_CROSS_CORRELATION);
+}
+
+TEST(CudnnHelper, ScopedPoolingDescriptor) {
+  using paddle::platform::ScopedPoolingDescriptor;
+  using paddle::platform::PoolingMode;
+
+  ScopedPoolingDescriptor pool_desc;
+  std::vector<int> src_kernel = {2, 2, 5};
+  std::vector<int> src_pads = {1, 1, 2};
+  std::vector<int> src_strides = {2, 2, 3};
+  auto desc = pool_desc.descriptor(PoolingMode::kMaximum, src_kernel, src_pads,
+                                   src_strides);
+  // nan_t starts at the opposite value so the read-back check is meaningful.
+  cudnnPoolingMode_t mode;
+  cudnnNanPropagation_t nan_t = CUDNN_NOT_PROPAGATE_NAN;
+  int nd;
+  std::vector<int> kernel(3);
+  std::vector<int> pads(3);
+  std::vector<int> strides(3);
+  PADDLE_ENFORCE(paddle::platform::dynload::cudnnGetPoolingNdDescriptor(
+      desc, 3, &mode, &nan_t, &nd, kernel.data(), pads.data(), strides.data()));
+  EXPECT_EQ(nan_t, CUDNN_PROPAGATE_NAN);
+  EXPECT_EQ(nd, 3);
+  for (size_t i = 0; i < src_pads.size(); ++i) {
+    EXPECT_EQ(kernel[i], src_kernel[i]);
+    EXPECT_EQ(pads[i], src_pads[i]);
+    EXPECT_EQ(strides[i], src_strides[i]);
+  }
+  EXPECT_EQ(mode, CUDNN_POOLING_MAX);
+}
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index d205ead84598e04eea523be32139959a02e0dd83..ceb66f84b6b01892cbaf61c79a47ae60d2589164 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1,2 +1,2 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
-nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc)
+nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc DEPS dynamic_loader)
diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h
index ef0dd85b083dc2335dd5c70d3dc5f59eda25daeb..0120625b7c14448f1b8deb88c24a3ee06eaf4f01 100644
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
@@ -62,19 +62,27 @@ extern void* cudnn_dso_handle;
 #define CUDNN_DNN_ROUTINE_EACH(__macro)             \
   __macro(cudnnSetTensor4dDescriptor);              \
   __macro(cudnnSetTensor4dDescriptorEx);            \
+  __macro(cudnnSetTensorNdDescriptor);              \
+  __macro(cudnnGetTensorNdDescriptor);              \
   __macro(cudnnGetConvolutionNdForwardOutputDim);   \
   __macro(cudnnGetConvolutionForwardAlgorithm);     \
   __macro(cudnnCreateTensorDescriptor);             \
   __macro(cudnnDestroyTensorDescriptor);            \
   __macro(cudnnCreateFilterDescriptor);             \
   __macro(cudnnSetFilter4dDescriptor);              \
+  __macro(cudnnSetFilterNdDescriptor);              \
+  __macro(cudnnGetFilterNdDescriptor);              \
   __macro(cudnnSetPooling2dDescriptor);             \
+  __macro(cudnnSetPoolingNdDescriptor);             \
+  __macro(cudnnGetPoolingNdDescriptor);             \
   __macro(cudnnDestroyFilterDescriptor);            \
   __macro(cudnnCreateConvolutionDescriptor);        \
   __macro(cudnnCreatePoolingDescriptor);            \
   __macro(cudnnDestroyPoolingDescriptor);           \
   __macro(cudnnSetConvolution2dDescriptor);         \
   __macro(cudnnDestroyConvolutionDescriptor);       \
+  __macro(cudnnSetConvolutionNdDescriptor);         \
+  __macro(cudnnGetConvolutionNdDescriptor);         \
   __macro(cudnnCreate);                             \
   __macro(cudnnDestroy);                            \
   __macro(cudnnSetStream);                          \