diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 06ee3b3654b576ec57dc437582e37ed0cea328ee..c5787630abbe105af64888692b1106bd21f4c1e8 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -240,62 +240,6 @@ extern void hl_avgpool_backward(const int frameCnt,
                                 real* backGrad,
                                 const int outStride);
 
-/**
- * @brief   Cross-map-respose normalize forward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   in          input data.
- * @param[in]   scale       buffer.
- * @param[out]  out         output data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   sizeX       size.
- * @param[in]   alpha       scale.
- * @param[in]   beta        scale.
- *
- */
-extern void hl_CMRNorm_forward(size_t frameCnt,
-                               const real* in,
-                               real* scale,
-                               real* out,
-                               size_t channels,
-                               size_t height,
-                               size_t width,
-                               size_t sizeX,
-                               real alpha,
-                               real beta);
-
-/**
- * @brief   Cross-map-respose normalize backward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inV         input data.
- * @param[in]   scale       buffer.
- * @param[out]  outV        output value.
- * @param[out]  outDiff     output grad.
- * @param[out]  inDiff      input grad.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   sizeX       size.
- * @param[in]   alpha       scale.
- * @param[in]   beta        scale.
- *
- */
-extern void hl_CMRNorm_backward(size_t frameCnt,
-                                const real* inV,
-                                const real* scale,
-                                const real* outV,
-                                const real* outDiff,
-                                real* inDiff,
-                                size_t channels,
-                                size_t height,
-                                size_t width,
-                                size_t sizeX,
-                                real alpha,
-                                real beta);
-
 /**
  * @brief   Bilinear interpolation forward.
  *
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 52c978735279ed804c44f0e93472355637e8b98d..039551c6cc69525e71c8c311f78fb6dec07d7fed 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -117,30 +117,6 @@ inline void hl_avgpool_backward(const int frameCnt,
                                 real* backGrad,
                                 const int outStride) {}
 
-inline void hl_CMRNorm_forward(size_t frameCnt,
-                               const real* in,
-                               real* scale,
-                               real* out,
-                               size_t channels,
-                               size_t height,
-                               size_t width,
-                               size_t sizeX,
-                               real alpha,
-                               real beta) {}
-
-inline void hl_CMRNorm_backward(size_t frameCnt,
-                                const real* inV,
-                                const real* scale,
-                                const real* outV,
-                                const real* outDiff,
-                                real* inDiff,
-                                size_t channels,
-                                size_t height,
-                                size_t width,
-                                size_t sizeX,
-                                real alpha,
-                                real beta) {}
-
 inline void hl_bilinear_forward(const real* inData,
                                 const size_t inImgH,
                                 const size_t inImgW,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index 1516accaae17fbeff4f4e48584940ec3e9873897..b94f4d8fe4a251750c527d4b686fcc8f452d4606 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -381,126 +381,6 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad,
   CHECK_SYNC("hl_avgpool_backward failed");
 }
 
-__global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
-                                   real* scale, size_t channels,
-                                   size_t height, size_t width, size_t size,
-                                   real alpha) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < imageSize) {
-    const int w = idx % width;
-    const int h = (idx / width) % height;
-    const int n = idx / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-
-    in += offset;
-    scale += offset;
-    const int step = height * width;
-    const int pre_pad = (size - 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-
-    real accum = 0;
-    int index = 0;
-    while (index < channels + post_pad) {
-      if (index < channels) {
-        accum += in[index * step] * in[index * step];
-      }
-      if (index >= size) {
-        accum -= in[(index - size) * step] * in[(index - size) * step];
-      }
-      if (index >= post_pad) {
-        scale[(index - post_pad) * step] = 1. + accum * alpha;
-      }
-      ++index;
-    }
-  }
-}
-
-__global__ void KeCMRNormOutput(size_t inputSize, const real* in,
-                                const real* scale, real negative_beta,
-                                real* out) {
-  const int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < inputSize) {
-    out[index] = in[index] * pow(scale[index], negative_beta);
-  }
-}
-
-void hl_CMRNorm_forward(size_t frameCnt, const real* in, real* scale,
-                        real* out, size_t channels,
-                        size_t height, size_t width, size_t sizeX,
-                        real alpha, real beta) {
-  size_t imageSize = frameCnt * height * width;
-  int blockSize = 1024;
-  int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (imageSize, in, scale, channels, height, width, sizeX, alpha);
-
-  size_t inputSize = frameCnt * height * width *channels;
-  blockSize = 1024;
-  gridSize = (inputSize + 1024 - 1) / 1024;
-  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-           (inputSize, in, scale, beta, out);
-  CHECK_SYNC("hl_CMRNorm_forward");
-}
-
-__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
-                              const real* top_data, const real* scale,
-                              const real* top_diff, size_t channels,
-                              size_t height, size_t width, size_t size,
-                              real negative_beta, real cache_ratio,
-                              real* bottom_diff ) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < imageSize) {
-    const int w = idx % width;
-    const int h = (idx / width) % height;
-    const int n = idx / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    bottom_data += offset;
-    top_data += offset;
-    scale += offset;
-    top_diff += offset;
-    bottom_diff += offset;
-
-    const int step = height * width;
-    const int pre_pad = size - (size + 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-
-    int index = 0;
-    real accum = 0;
-    while (index < channels + post_pad) {
-      if (index < channels) {
-        accum += top_diff[index * step] * top_data[index * step] /
-          scale[index * step];
-      }
-      if (index >= size) {
-        accum -= top_diff[(index - size) * step] *
-          top_data[(index - size) * step] / scale[(index - size) * step];
-      }
-      if (index >= post_pad) {
-        bottom_diff[(index - post_pad) * step] +=
-          top_diff[(index - post_pad) * step] *
-          pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio *
-          bottom_data[(index - post_pad) * step] * accum;
-      }
-      ++index;
-    }
-  }
-}
-
-void hl_CMRNorm_backward(size_t frameCnt, const real* inV,
-                         const real* scale,
-                         const real* outV, const real* outDiff,
-                         real *inDiff, size_t channels,
-                         size_t height, size_t width, size_t sizeX,
-                         real alpha, real beta) {
-  size_t imageSize = frameCnt * height * width;
-  int blockSize = 1024;
-  int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (imageSize, inV, outV, scale, outDiff, channels,
-      height, width, sizeX, alpha, beta, inDiff);
-  CHECK_SYNC("hl_CMRNorm_backward");
-}
-
 __global__ void KeBilinearInterpFw(const real* in,
                                    const size_t inImgH,
                                    const size_t inImgW,
diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp
index e69c40699305466261b5febf0588c94597e04d25..4ff3b805fbb061cbb2630330c1b9b4f6c2b354d6 100644
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -110,34 +110,5 @@ void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
                    Tensor(denoms_->getData(), dims_)},
                   {Tensor(preOutGrad->getData(), dims_)},
                   {});
-#if 0
-  if (useGpu_) {
-    CrossMapNormalGrad<DEVICE_TYPE_GPU> crossGrad;
-    crossGrad(dynamic_cast<GpuMatrix&>(*preOutGrad),
-              dynamic_cast<GpuMatrix&>(*preOutV),
-              dynamic_cast<GpuMatrix&>(*localGrad),
-              dynamic_cast<GpuMatrix&>(*localOutV),
-              dynamic_cast<GpuMatrix&>(*denoms_),
-              channels_,
-              imgSizeH_,
-              imgSizeW_,
-              size_,
-              scale_,
-              pow_);
-  } else {
-    CrossMapNormalGrad<DEVICE_TYPE_CPU> crossGrad;
-    crossGrad(dynamic_cast<CpuMatrix&>(*preOutGrad),
-              dynamic_cast<CpuMatrix&>(*preOutV),
-              dynamic_cast<CpuMatrix&>(*localGrad),
-              dynamic_cast<CpuMatrix&>(*localOutV),
-              dynamic_cast<CpuMatrix&>(*denoms_),
-              channels_,
-              imgSizeH_,
-              imgSizeW_,
-              size_,
-              scale_,
-              pow_);
-  }
-#endif
 }
 }  // namespace paddle
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 2cde11dd479dc0d150c5d7ce5c0c5c1cbf40e449..a36c31d32b7ca9d737a2b9df1934555cb490ee00 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1265,69 +1265,6 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                       outGrad.getStride());
 }
 
-void GpuMatrix::crossMapNormalFwd(Matrix& input,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  Matrix& denoms,
-                                  size_t channels,
-                                  size_t sizeX,
-                                  float scale,
-                                  float pow) {
-  size_t num = input.getHeight();
-  size_t height = imgSizeH;
-  size_t width = imgSizeW;
-
-  CHECK(height * width * channels == input.getWidth());
-  CHECK(denoms.getHeight() == input.getHeight() &&
-        denoms.getWidth() == input.getWidth() && input.getHeight() == height_ &&
-        input.getWidth() == width_);
-  hl_CMRNorm_forward(num,
-                     input.getData(),
-                     denoms.getData(),
-                     data_,
-                     channels,
-                     height,
-                     width,
-                     sizeX,
-                     scale,
-                     -pow);
-}
-
-void GpuMatrix::crossMapNormalBwd(Matrix& localGrad,
-                                  Matrix& denoms,
-                                  Matrix& preOutV,
-                                  Matrix& localOutV,
-                                  size_t channels,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t sizeX,
-                                  float scale,
-                                  float pow) {
-  size_t num = preOutV.getHeight();
-  size_t height = imgSizeH;
-  size_t width = imgSizeW;
-
-  CHECK(width * height * channels == preOutV.getWidth());
-  CHECK(denoms.getHeight() == preOutV.getHeight() &&
-        denoms.getWidth() == preOutV.getWidth() &&
-        preOutV.getHeight() == height_ && preOutV.getWidth() == width_);
-  CHECK(denoms.getHeight() == localGrad.getHeight() &&
-        denoms.getWidth() == localGrad.getWidth());
-
-  hl_CMRNorm_backward(num,
-                      preOutV.getData(),
-                      denoms.getData(),
-                      localOutV.getData(),
-                      localGrad.getData(),
-                      data_,
-                      channels,
-                      height,
-                      width,
-                      sizeX,
-                      -pow,
-                      2.0f * pow * scale);
-}
-
 void GpuMatrix::maxSequenceForward(Matrix& input,
                                    const IVector& sequence,
                                    IVector& index) {
@@ -2219,119 +2156,6 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
   }
 }
 
-void CpuMatrix::crossMapNormalFwd(Matrix& input,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  Matrix& denoms,
-                                  size_t channels,
-                                  size_t sizeX,
-                                  float scale,
-                                  float pow) {
-  CHECK(isContiguous());
-  CHECK(input.isContiguous());
-  CHECK(denoms.isContiguous());
-  CHECK_EQ(getHeight(), input.getHeight());
-  CHECK_EQ(getWidth(), input.getWidth());
-  CHECK_EQ(getHeight(), denoms.getHeight());
-  CHECK_EQ(getWidth(), denoms.getWidth());
-
-  size_t numSample = input.getHeight();
-  size_t numCols = input.getWidth();
-  size_t height = imgSizeH;
-  size_t width = imgSizeW;
-  CHECK(height * width * channels == numCols);
-
-  // TODO(hedaoyuan) After commit TensorExpress code,
-  // Reconstruction this code to remove the temporary memory.
-  CpuMatrix tmp(channels, height * width);
-  CpuMatrix tmp2(tmp.getData(), 1, channels * height * width);
-  denoms.zero();
-  const int start = -((int)sizeX - 1) / 2;
-  const int end = (int)sizeX + start;
-  for (size_t i = 0; i < numSample; i++) {
-    input.subMatrix(i, 1)->square2(tmp2);
-    CpuMatrix subDen(
-        denoms.subMatrix(i, 1)->getData(), channels, height * width);
-    for (int c = 0; c < (int)channels; c++) {
-      for (int s = start; s < end; s++) {
-        if (c + s >= 0 && c + s < (int)channels) {
-          subDen.subMatrix(c, 1)->add(*tmp.subMatrix(c + s, 1));
-        }
-      }
-    }
-  }
-
-  denoms.add(scale, (real)1);
-  this->pow2(denoms, -pow);
-  this->dotMul(input);
-}
-
-void CpuMatrix::crossMapNormalBwd(Matrix& localGrad,
-                                  Matrix& denoms,
-                                  Matrix& preOutV,
-                                  Matrix& localOutV,
-                                  size_t channels,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t sizeX,
-                                  float scale,
-                                  float pow) {
-  CHECK(isContiguous());
-  CHECK(localGrad.isContiguous());
-  CHECK(denoms.isContiguous());
-  CHECK(preOutV.isContiguous());
-  CHECK(localOutV.isContiguous());
-  CHECK_EQ(getHeight(), localGrad.getHeight());
-  CHECK_EQ(getWidth(), localGrad.getWidth());
-  CHECK_EQ(getHeight(), denoms.getHeight());
-  CHECK_EQ(getWidth(), denoms.getWidth());
-  CHECK_EQ(getHeight(), preOutV.getHeight());
-  CHECK_EQ(getWidth(), preOutV.getWidth());
-  CHECK_EQ(getHeight(), localOutV.getHeight());
-  CHECK_EQ(getWidth(), localOutV.getWidth());
-
-  size_t numSample = getHeight();
-  size_t numCols = getWidth();
-  size_t height = imgSizeH;
-  size_t width = imgSizeW;
-  CHECK(height * width * channels == numCols);
-
-  // TODO(hedaoyuan) After commit TensorExpress code,
-  // Reconstruction this code to remove the temporary memory.
-  CpuMatrix tmp(1, height * width);
-
-  const int start = -((int)sizeX) / 2;
-  const int end = (int)sizeX + start;
-  const real ratio = -(real)2 * scale * pow;
-  for (size_t i = 0; i < numSample; i++) {
-    CpuMatrix inputDiff(
-        this->subMatrix(i, 1)->getData(), channels, height * width);
-    CpuMatrix outDiff(
-        localGrad.subMatrix(i, 1)->getData(), channels, height * width);
-    CpuMatrix input(
-        preOutV.subMatrix(i, 1)->getData(), channels, height * width);
-    CpuMatrix output(
-        localOutV.subMatrix(i, 1)->getData(), channels, height * width);
-    CpuMatrix subDen(
-        denoms.subMatrix(i, 1)->getData(), channels, height * width);
-
-    for (int c = 0; c < (int)channels; c++) {
-      tmp.pow2(*subDen.subMatrix(c, 1), -pow);
-      inputDiff.subMatrix(c, 1)
-          ->addDotMul(tmp, *outDiff.subMatrix(c, 1), (real)1, (real)1);
-      for (int s = start; s < end; s++) {
-        if (c + s >= 0 && c + s < (int)channels) {
-          tmp.dotMul(*outDiff.subMatrix(c + s, 1), *output.subMatrix(c + s, 1));
-          tmp.mulScalar(ratio);
-          tmp.dotDiv(tmp, *subDen.subMatrix(c + s, 1));
-          tmp.dotMul(*input.subMatrix(c, 1));
-          inputDiff.subMatrix(c, 1)->add(tmp);
-        }
-      }
-    }
-  }
-}
-
 /**
  * Input: one or more sequences. Each sequence contains some instances.
  * Output: output size is the number of input sequences (NOT input instances).
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 5685cb7bcbbb6b90687790953d676e3792f36f36..62bc1b16fc7b6eedd2c6ac163b1d1d1c8de02ab5 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -952,31 +952,6 @@ public:
     LOG(FATAL) << "Not implemeted";
   }
 
-  /// normalize-operation.
-  virtual void crossMapNormalFwd(Matrix& input,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 Matrix& denoms,
-                                 size_t channels,
-                                 size_t sizeX,
-                                 float scale,
-                                 float pow) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void crossMapNormalBwd(Matrix& localGrad,
-                                 Matrix& denoms,
-                                 Matrix& preOutV,
-                                 Matrix& localOutV,
-                                 size_t channels,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t size,
-                                 float scale,
-                                 float pow) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
   /**
    * Input: one or more sequences. Each sequence contains some instances.
    *
@@ -1459,26 +1434,6 @@ public:
                        size_t paddingH,
                        size_t paddingW);
 
-  void crossMapNormalFwd(Matrix& input,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         Matrix& denoms,
-                         size_t channels,
-                         size_t sizeX,
-                         float scale,
-                         float pow);
-
-  void crossMapNormalBwd(Matrix& localGrad,
-                         Matrix& denoms,
-                         Matrix& preOutV,
-                         Matrix& localOutV,
-                         size_t channels,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t sizeX,
-                         float scale,
-                         float pow);
-
   void maxSequenceForward(Matrix& input,
                           const IVector& sequence,
                           IVector& index);
@@ -1685,26 +1640,6 @@ public:
                        size_t paddingH,
                        size_t paddingW);
 
-  void crossMapNormalFwd(Matrix& input,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         Matrix& denoms,
-                         size_t channels,
-                         size_t sizeX,
-                         float scale,
-                         float pow);
-
-  void crossMapNormalBwd(Matrix& localGrad,
-                         Matrix& denoms,
-                         Matrix& preOutV,
-                         Matrix& localOutV,
-                         size_t channels,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t sizeX,
-                         float scale,
-                         float pow);
-
   void maxSequenceForward(Matrix& input,
                           const IVector& sequence,
                           IVector& index);
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index da7a585484e7ee1f6579732206d3cafa109bd3cb..c89b7ff490232f5876ceb5e928c9352c431203ec 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1385,33 +1385,6 @@ void testCrossMapNormalBwd(
              Tensor(denomsGpu.getData(), dims)},
             {Tensor(inputsGradGpu.getData(), dims)},
             {});
-#if 0
-  CrossMapNormalGrad<DEVICE_TYPE_CPU> cpuCross;
-  cpuCross(inputsGrad,
-           inputsValue,
-           outputsGrad,
-           outputsValue,
-           denoms,
-           channels,
-           imgSizeH,
-           imgSizeW,
-           sizeX,
-           scale,
-           pow);
-
-  CrossMapNormalGrad<DEVICE_TYPE_GPU> gpuCross;
-  gpuCross(inputsGradGpu,
-           inputsValueGpu,
-           outputsGradGpu,
-           outputsValueGpu,
-           denomsGpu,
-           channels,
-           imgSizeH,
-           imgSizeW,
-           sizeX,
-           scale,
-           pow);
-#endif
 
   TensorCheckErr(inputsGrad, inputsGradGpu);
 }