Update the CUDA batch normalization inference kernel.

7da1db05 · dangqingqing · da7b9a5e · 7da1db05 · 7da1db05
Show whitespace changes
Inline | Side-by-side

Showing 2 changed files with 29 additions and 30 deletions

paddle/cuda/src/hl_batch_norm.cu paddle/cuda/src/hl_batch_norm.cu +14 -16

paddle/gserver/layers/CudnnBatchNormLayer.cpp paddle/gserver/layers/CudnnBatchNormLayer.cpp +15 -14

No files found.
--- a/paddle/cuda/src/hl_batch_norm.cu
+++ b/paddle/cuda/src/hl_batch_norm.cu
@@ -25,11 +25,11 @@ __global__ void batchNormInference(real* output,
                                   size_t channel,
                                   size_t height,
                                   size_t width) {
-  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid = threadIdx.x;
  const int num = channel * height * width;
-  const int batch = blockIdx.y;
+  const int batch = blockIdx.x;
  for (int i = tid; i < num; i += blockDim.x) {
-    const int c = (i / (height * width)) % channel;
+    const int c = i / (height * width);
    const int id = batch * num + i;
    real val = input[id] - estimatedMean[c];
    val /= sqrt(estimatedVar[c] + epsilon);
@@ -50,9 +50,7 @@ void hl_batch_norm_cuda_inference(const real* input,
                                  size_t channel,
                                  size_t height,
                                  size_t width) {
-  dim3 block(256, 1);
+  batchNormInference<<<batchSize, 256, 0, STREAM_DEFAULT>>>(output,
-  dim3 grid(1, batchSize);
-  batchNormInference<<<grid, block, 0, STREAM_DEFAULT>>>(output,
                                                            input,
                                                            scale,
                                                            bias,

--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -80,31 +80,32 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                   savedInvVar);
  } else {
    // used movingMean and movingVar in testing
-    if (batchSize > 1024) {
+    if (batchSize <= 1024) {
-      // there is a bug in cudnn library when the batch size
+      hl_batch_norm_forward_inference(ioDesc_,
-      // is larger than 1024.
+                                      input,
-      hl_batch_norm_cuda_inference(input,
+                                      ioDesc_,
                                      output,
+                                      bnParamDesc_,
                                      gamma,
                                      beta,
                                      movingMean,
                                      movingVar,
-                                   EPS,
+                                      EPS);
-                                   batchSize,
-                                   channels_,
-                                   imageH_,
-                                   imageW_);
    } else {
-      hl_batch_norm_forward_inference(ioDesc_,
+      // There is a limitation in cudnn library.
-                                      input,
+      // When the batch size is larger than 1024 in cuDNN v5.1,
-                                      ioDesc_,
+      // the cudnnBatchNormalizationForwardInference will fail.
+      hl_batch_norm_cuda_inference(input,
                                   output,
-                                      bnParamDesc_,
                                   gamma,
                                   beta,
                                   movingMean,
                                   movingVar,
-                                      EPS);
+                                   EPS,
+                                   batchSize,
+                                   channels_,
+                                   imageH_,
+                                   imageW_);
    }
  }