diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index 15799919fa137ca0e9021978a100c5fd25d23b75..fc003b7d6377d199c3859aa8b257de07478992af 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -266,25 +266,21 @@ template <int blockSize>
 __global__ void KeMatrixClassificationError(real* in_A,
                                             int* in_B,
                                             real* out_C,
-                                            int dimM,
                                             int dimN) {
   __shared__ real max_s[blockSize];
   __shared__ int max_l[blockSize];
-  int cnt = (dimN + blockSize -1) / blockSize;
-  int tid = threadIdx.x;
-  int lmt = tid;
-  int index = 0;
-  real t;
+  const int tid = threadIdx.x;
+  const int rowId = blockIdx.x;
 
   max_s[tid] = -1e30f;
-  for (int ii = 0; ii < cnt && lmt < dimN; ii++) {
-    index = blockIdx.y*dimN + lmt;
-    t = in_A[index];
-    if (max_s[tid] < t) {
-      max_s[tid] = t;
-      max_l[tid] = lmt;
+  in_A += rowId * dimN;
+  real tmp;
+  for (int colId = tid; colId < dimN; colId += blockSize) {
+    tmp = in_A[colId];
+    if (max_s[tid] < tmp) {
+      max_s[tid] = tmp;
+      max_l[tid] = colId;
     }
-    lmt += blockSize;
   }
   __syncthreads();
 
@@ -300,7 +296,7 @@ __global__ void KeMatrixClassificationError(real* in_A,
   __syncthreads();
 
   if (tid == 0) {
-    out_C[blockIdx.y] = (max_l[0] == in_B[blockIdx.y] ? 0 : 1.0f);
+    out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f);
   }
 }
 
@@ -313,12 +309,9 @@ void hl_matrix_classification_error(real* A_d,
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);
 
-  int blocksX = 1;
-  int blocksY = dimM;
-  dim3 threads(1024, 1);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixClassificationError<1024><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (A_d, B_d, C_d, dimM, dimN);
+  // each sample is calculated by one block
+  KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>>
+      (A_d, B_d, C_d, dimN);
 
   CHECK_SYNC("hl_matrix_classification_error");
 }
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 7caade444b827f132daa5606c4829c79e3da98bc..fe8eacc2efbc5446e63ed40f914b62c70189010e 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1697,7 +1697,6 @@ TEST(Matrix, cosSimDerivate) {
   }
 }
 
-
 void testParamReluForward(int height, int width, int w_height,
                           int w_width) {
   MatrixPtr output = CpuMatrix::create(height, width, false, false);
@@ -1736,7 +1735,6 @@ TEST(Matrix, paramReluForward) {
   }
 }
 
-
 void testParamReluBackwardW(int height, int width, int w_height,
                             int w_width) {
   MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
@@ -1775,7 +1773,6 @@ TEST(Matrix, paramReluBackwardW) {
   }
 }
 
-
 void testParamReluBackwardDiff(int height, int width, int w_height,
                                int w_width) {
   MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
@@ -1819,6 +1816,36 @@ TEST(Matrix, paramReluBackwardDiff) {
   }
 }
 
+void testClassificationError(int numSamples, int dim) {
+  MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
+  MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
+  IVectorPtr cpuLabel = std::make_shared<CpuIVector>(numSamples);
+  IVectorPtr gpuLabel = std::make_shared<GpuIVector>(numSamples);
+
+  cpuOutput->randomizeUniform();
+  cpuLabel->rand(dim);
+  gpuOutput->copyFrom(*cpuOutput);
+  gpuLabel->copyFrom(*cpuLabel);
+
+  cpuError->classificationError(cpuOutput, cpuLabel);
+  gpuError->classificationError(gpuOutput, gpuLabel);
+
+  MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, 1);
+  check->copyFrom(*gpuError);
+  MatrixCheckEqual(*cpuError, *check);
+}
+
+TEST(Matrix, classificationError) {
+  for (auto numSamples : {1, 10, 100, 1000, 70000}) {
+    for (auto dim : {1, 10, 100, 1000}) {
+      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
+      testClassificationError(numSamples, dim);
+    }
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
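
Note on the kernel rewrite above: after this patch, each sample (row) is
handled by one 1024-thread block. Threads stride across the row's columns
keeping a per-thread running max, a shared-memory reduction (elided between
the two hunks) selects the row-wide argmax, and thread 0 writes 0 or 1
depending on whether that argmax matches the label. The standalone sketch
below illustrates the full flow; it is a reconstruction under stated
assumptions: the `real` typedef, the kernel name
KeClassificationErrorSketch, the max_l initialization, and the
tree-reduction loop are not part of this diff.

    #include <cuda_runtime.h>

    typedef float real;  // assumption: Paddle's `real` typedef, float build

    // blockSize must be a power of two (the launch below uses 1024).
    template <int blockSize>
    __global__ void KeClassificationErrorSketch(real* in_A,
                                                int* in_B,
                                                real* out_C,
                                                int dimN) {
      __shared__ real max_s[blockSize];
      __shared__ int max_l[blockSize];
      const int tid = threadIdx.x;
      const int rowId = blockIdx.x;  // one block per sample (row)

      // Phase 1: each thread scans the row with stride blockSize and
      // keeps the largest value (and its column index) it has seen.
      max_s[tid] = -1e30f;
      max_l[tid] = -1;  // added for safety when dimN < blockSize
      in_A += rowId * dimN;
      for (int colId = tid; colId < dimN; colId += blockSize) {
        real tmp = in_A[colId];
        if (max_s[tid] < tmp) {
          max_s[tid] = tmp;
          max_l[tid] = colId;
        }
      }
      __syncthreads();

      // Phase 2: tree reduction in shared memory down to the row-wide
      // argmax (assumed shape of the code the diff elides between hunks).
      for (int stride = blockSize / 2; stride > 0; stride >>= 1) {
        if (tid < stride && max_s[tid + stride] > max_s[tid]) {
          max_s[tid] = max_s[tid + stride];
          max_l[tid] = max_l[tid + stride];
        }
        __syncthreads();
      }

      // Phase 3: thread 0 emits 0 for a correct prediction, 1 otherwise.
      if (tid == 0) {
        out_C[rowId] = (max_l[0] == in_B[rowId] ? 0.0f : 1.0f);
      }
    }

    // Example launch, mirroring the patched hl_matrix_classification_error:
    //   KeClassificationErrorSketch<1024><<<dimM, 1024>>>(A_d, B_d, C_d, dimN);

Launching as <<< dimM, 1024 >>> (one block per row) replaces the old
1 x dimM two-dimensional grid, which is also why the now-unused dimM
parameter drops out of the kernel signature in the first hunk.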