Commit 7da1db05 authored by dangqingqing

update cuda kernel.

Parent da7b9a5e
@@ -25,11 +25,11 @@ __global__ void batchNormInference(real* output,
                                    size_t channel,
                                    size_t height,
                                    size_t width) {
-  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid = threadIdx.x;
   const int num = channel * height * width;
-  const int batch = blockIdx.y;
+  const int batch = blockIdx.x;
   for (int i = tid; i < num; i += blockDim.x) {
-    const int c = (i / (height * width)) % channel;
+    const int c = i / (height * width);
     const int id = batch * num + i;
     real val = input[id] - estimatedMean[c];
     val /= sqrt(estimatedVar[c] + epsilon);
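A note on the index change: the loop guarantees i < num with num = channel * height * width, so the quotient i / (height * width) is already bounded by channel and the old % channel was redundant. A tiny host-side check of that equivalence (sizes are hypothetical, not from the commit):

#include <assert.h>

int main(void) {
  const int channel = 3, height = 2, width = 2;
  const int num = channel * height * width; /* 12 elements per sample */
  for (int i = 0; i < num; ++i) {
    /* e.g. i = 7 -> c = 7 / 4 = 1 (second channel) */
    const int cNew = i / (height * width);
    const int cOld = (i / (height * width)) % channel;
    assert(cNew == cOld); /* identical whenever i < num */
  }
  return 0;
}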
@@ -50,19 +50,17 @@ void hl_batch_norm_cuda_inference(const real* input,
                                   size_t channel,
                                   size_t height,
                                   size_t width) {
-  dim3 block(256, 1);
-  dim3 grid(1, batchSize);
-  batchNormInference<<<grid, block, 0, STREAM_DEFAULT>>>(output,
-                                                         input,
-                                                         scale,
-                                                         bias,
-                                                         estimatedMean,
-                                                         estimatedVar,
-                                                         epsilon,
-                                                         batchSize,
-                                                         channel,
-                                                         height,
-                                                         width);
+  batchNormInference<<<batchSize, 256, 0, STREAM_DEFAULT>>>(output,
+                                                            input,
+                                                            scale,
+                                                            bias,
+                                                            estimatedMean,
+                                                            estimatedVar,
+                                                            epsilon,
+                                                            batchSize,
+                                                            channel,
+                                                            height,
+                                                            width);
   CHECK_SYNC("hl_batch_norm_cuda_inference failed!");
 }
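The launcher change replaces the dim3 grid(1, batchSize) / dim3 block(256, 1) configuration with a flat <<<batchSize, 256>>> launch: one block per sample indexed by blockIdx.x, with 256 threads striding over that sample's channel * height * width elements. A self-contained sketch of the same launch pattern, using a stand-in kernel body and hypothetical sizes (scaleSample and plain float/default stream replace Paddle's real and STREAM_DEFAULT):

#include <cstdio>
#include <cuda_runtime.h>

// One block per sample (blockIdx.x in [0, batchSize)), 256 threads
// striding over the channel * height * width elements of that sample,
// mirroring the indexing in the patched batchNormInference kernel.
__global__ void scaleSample(float* data,
                            size_t channel,
                            size_t height,
                            size_t width) {
  const int tid = threadIdx.x;
  const int num = channel * height * width;
  const int batch = blockIdx.x;
  for (int i = tid; i < num; i += blockDim.x) {
    const int c = i / (height * width);  // channel of this element
    data[batch * num + i] *= (c + 1);    // stand-in for the BN arithmetic
  }
}

int main() {
  const size_t batchSize = 4, channel = 3, height = 8, width = 8;
  const size_t total = batchSize * channel * height * width;
  float* d = nullptr;
  cudaMalloc(&d, total * sizeof(float));
  cudaMemset(d, 0, total * sizeof(float));
  // Same launch shape as the patched hl_batch_norm_cuda_inference.
  scaleSample<<<batchSize, 256>>>(d, channel, height, width);
  cudaDeviceSynchronize();
  printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaFree(d);
  return 0;
}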
@@ -80,9 +80,21 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                       savedInvVar);
   } else {
     // used movingMean and movingVar in testing
-    if (batchSize > 1024) {
-      // there is a bug in cudnn library when the batch size
-      // is larger than 1024.
+    if (batchSize <= 1024) {
+      hl_batch_norm_forward_inference(ioDesc_,
+                                      input,
+                                      ioDesc_,
+                                      output,
+                                      bnParamDesc_,
+                                      gamma,
+                                      beta,
+                                      movingMean,
+                                      movingVar,
+                                      EPS);
+    } else {
+      // There is a limitation in cudnn library.
+      // When the batch size is larger than 1024 in cuDNN v5.1,
+      // the cudnnBatchNormalizationForwardInference will fail.
       hl_batch_norm_cuda_inference(input,
                                    output,
                                    gamma,
@@ -94,17 +106,6 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                    channels_,
                                    imageH_,
                                    imageW_);
-    } else {
-      hl_batch_norm_forward_inference(ioDesc_,
-                                      input,
-                                      ioDesc_,
-                                      output,
-                                      bnParamDesc_,
-                                      gamma,
-                                      beta,
-                                      movingMean,
-                                      movingVar,
-                                      EPS);
     }
   }
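Reading across the two hunks above, the patched inference path reduces to a single dispatch on batch size. A condensed sketch of that control flow (the middle arguments folded out of the diff are inferred from the launcher signature shown earlier; this is an abbreviation, not the verbatim source):

// used movingMean and movingVar in testing
if (batchSize <= 1024) {
  // cuDNN v5.1 handles this case; use the library routine.
  hl_batch_norm_forward_inference(ioDesc_, input, ioDesc_, output,
                                  bnParamDesc_, gamma, beta,
                                  movingMean, movingVar, EPS);
} else {
  // cudnnBatchNormalizationForwardInference fails for batch sizes
  // above 1024 in cuDNN v5.1, so fall back to the custom kernel.
  hl_batch_norm_cuda_inference(input, output, gamma, beta,
                               movingMean, movingVar, EPS,
                               batchSize, channels_, imageH_, imageW_);
}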
...