diff --git a/paddle/gserver/layers/NormalizeLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
similarity index 64%
rename from paddle/gserver/layers/NormalizeLayer.cpp
rename to paddle/gserver/layers/CrossChannelNormLayer.cpp
index 22df8adb4eecf912eeba87ff89d27c0fbc61ae14..ced719999154bc3957d9e171220e7b3f703a8336 100644
--- a/paddle/gserver/layers/NormalizeLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -13,53 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Layer.h"
+#include "NormLayer.h"
 #include "paddle/math/BaseMatrix.h"
 #include "paddle/math/Matrix.h"
 
 namespace paddle {
 
-/**
- * This layer applys normalize across the channels of each sample to a
- * conv layer's output and scale the output by a group of trainable factors
- * which dimensions equal to the channel's number.
- * - Input: One and only one input layer are accepted. The input layer must be
- *          be a data output layer.
- * - Output: The normalized data of the input data.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-class NormalizeLayer : public Layer {
-public:
-  explicit NormalizeLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
-
-protected:
-  size_t channels_;
-  std::unique_ptr<Weight> scale_;
-  MatrixPtr scaleDiff_;
-  MatrixPtr normBuffer_;
-  MatrixPtr dataBuffer_;
-  MatrixPtr channelBuffer_;
-  MatrixPtr spatialBuffer_;
-  MatrixPtr sampleBuffer_;
-};
-
-REGISTER_LAYER(normalize, NormalizeLayer);
-
-bool NormalizeLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK(parameters_[0]);
-  channels_ = config_.num_filters();
-  scale_.reset(new Weight(channels_, 1, parameters_[0]));
-  return true;
-}
-
-void NormalizeLayer::forward(PassType passType) {
+void CrossChannelNormLayer::forward(PassType passType) {
   Layer::forward(passType);
   auto in = getInput(0);
   MatrixPtr inV = getInputValue(0);
@@ -74,16 +34,12 @@ void NormalizeLayer::forward(PassType passType) {
 
   Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
   Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
-  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
-  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
   Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
   normBuffer_->zeroMem();
   spatialBuffer_->zeroMem();
-  sampleBuffer_->zeroMem();
   dataBuffer_->zeroMem();
   // add eps to avoid overflow
   normBuffer_->addScalar(*normBuffer_, 1e-6);
-  channelBuffer_->resetOne();
 
   inV->square2(*dataBuffer_);
   for (size_t i = 0; i < batchSize; i++) {
     spatialBuffer_->zeroMem();
@@ -102,18 +58,14 @@ void NormalizeLayer::forward(PassType passType) {
     spatialBuffer_->sumCols(*dataTmp, 1, 1);
     spatialBuffer_->sqrt2(*spatialBuffer_);
     normTmp->copyFrom(*spatialBuffer_);
-    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
-    sampleBuffer_->dotDiv(*inTmp, *sampleBuffer_);
-    outTmp->copyFrom(*sampleBuffer_);
-
+    outTmp->copyFrom(*inTmp);
+    outTmp->divRowVector(*spatialBuffer_);
     // scale the layer.
-    spatialBuffer_->resetOne();
-    sampleBuffer_->mul(*scale_->getW(), *spatialBuffer_, 1., 0.);
-    outTmp->dotMul(*outTmp, *sampleBuffer_);
+    outTmp->mulColVector(*scale_->getW());
   }
 }
 
-void NormalizeLayer::backward(const UpdateCallback& callback) {
+void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
   MatrixPtr inG = getInputGrad(0);
   MatrixPtr inV = getInputValue(0);
   MatrixPtr outG = getOutputGrad();
@@ -124,9 +76,10 @@ void NormalizeLayer::backward(const UpdateCallback& callback) {
   size_t dataDim = inG->getWidth();
   size_t spatialDim = dataDim / channels_;
 
-  bool syncFlag = hl_get_sync_flag();
   dataBuffer_->dotMul(*outG, *outV);
   Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
   scaleDiff_->zeroMem();
   for (size_t i = 0; i < batchSize; i++) {
     spatialBuffer_->zeroMem();
@@ -154,28 +107,20 @@ void NormalizeLayer::backward(const UpdateCallback& callback) {
     sampleBuffer_->dotMul(*inValueTmp, *outGradTmp);
     spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
     // scale the grad
-    channelBuffer_->resetOne();
-    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
-
-    inGradTmp->dotMul(*inValueTmp, *sampleBuffer_);
+    inGradTmp->copyFrom(*inValueTmp);
+    inGradTmp->mulRowVector(*spatialBuffer_);
     // divide by square of norm
     spatialBuffer_->dotMul(*normTmp, *normTmp);
-    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
-    inGradTmp->dotDiv(*inGradTmp, *sampleBuffer_);
+    inGradTmp->divRowVector(*spatialBuffer_);
     // subtract
     inGradTmp->add(*outGradTmp, -1, 1);
     // divide by norm
-    sampleBuffer_->mul(*channelBuffer_, *normTmp, 1., 0.);
-    inGradTmp->dotDiv(*inGradTmp, *sampleBuffer_);
+    inGradTmp->divRowVector(*normTmp);
     // scale the diff
-    spatialBuffer_->resetOne();
-    sampleBuffer_->mul(*scale_->getW(), *spatialBuffer_, 1., 0.);
-    inGradTmp->dotMul(*inGradTmp, *sampleBuffer_);
+    inGradTmp->mulColVector(*scale_->getW());
   }
   // updata scale
   if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
-  hl_set_sync_flag(false);
-  hl_set_sync_flag(syncFlag);
   scale_->getParameterPtr()->incUpdate(callback);
 }
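Reviewer note, not part of the patch: as I read the rewritten forward/backward code, with x one sample reshaped to channels x spatialDim, gamma the per-channel scale, g the gradient w.r.t. the output, c a channel index, and j a spatial position, the layer computes

$$y_{c,j} = \gamma_c \, \frac{x_{c,j}}{n_j}, \qquad n_j = \sqrt{\sum_{c'} x_{c',j}^2},$$

and the input gradient as implemented is

$$\frac{\partial L}{\partial x_{c,j}} = \frac{\gamma_c}{n_j} \Big( g_{c,j} \;-\; \frac{x_{c,j}}{n_j^2} \sum_{c'} x_{c',j} \, g_{c',j} \Big),$$

which mirrors the copyFrom / mulRowVector / divRowVector / add / divRowVector / mulColVector sequence above. The 1e-6 added to normBuffer_ is intended as an epsilon guard against division by zero.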
diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp
index 3db0af2515ee9f64aa6c0b0a441e88562d9e398e..e094078bfe86e30c06e1b80ebc04c8213fe9abcf 100644
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@@ -26,6 +26,8 @@ Layer* NormLayer::create(const LayerConfig& config) {
     return new ResponseNormLayer(config);
   } else if (norm == "cmrnorm-projection") {
     return new CMRProjectionNormLayer(config);
+  } else if (norm == "cross-channel-norm") {
+    return new CrossChannelNormLayer(config);
   } else {
     LOG(FATAL) << "Unknown norm type: " << norm;
     return nullptr;
@@ -54,4 +56,14 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
   return true;
 }
 
+bool CrossChannelNormLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK(parameters_[0]);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));
+  return true;
+}
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index e77faaa322570933b3ea2de877b7859857306432..59ba226dfe5f5f96ce0d5a97c05851c60cace287 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -65,4 +65,35 @@ public:
   }
 };
 
+/**
+ * This layer applies normalization across the channels of each sample to a
+ * conv layer's output, and scales the output by a group of trainable factors
+ * whose dimension equals the number of channels.
+ * - Input: One and only one input layer is accepted. The input layer must
+ *          be a data output layer.
+ * - Output: The normalized data of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+class CrossChannelNormLayer : public NormLayer {
+public:
+  explicit CrossChannelNormLayer(const LayerConfig& config)
+      : NormLayer(config) {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback);
+
+protected:
+  size_t channels_;
+  std::unique_ptr<Weight> scale_;
+  MatrixPtr scaleDiff_;
+  MatrixPtr normBuffer_;
+  MatrixPtr dataBuffer_;
+  MatrixPtr channelBuffer_;
+  MatrixPtr spatialBuffer_;
+  MatrixPtr sampleBuffer_;
+};
+
 }  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index a7d3eaeaf98cb017a4ca9e81e1f58bfd17335eb0..7afaf87189256f27b72f41f10c3f6efe742eb9e4 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1623,17 +1623,22 @@ TEST(Layer, PadLayer) {
   }
 }
 
-TEST(Layer, NormalizeLayer) {
+TEST(Layer, CrossChannelNormLayer) {
   TestConfig config;
-  config.layerConfig.set_type("normalize");
+  config.layerConfig.set_type("norm");
   config.layerConfig.set_size(100);
-  config.layerConfig.set_num_filters(10);
-
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type("cross-channel-norm");
+  norm->set_channels(10);
+  norm->set_size(100);
+  norm->set_scale(0);
+  norm->set_pow(0);
+  norm->set_blocked(0);
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
-  config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "normalize", 10, false, useGpu, false, 5);
+    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
   }
 }
 
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 0a0d92d1ae65f5b6020eb71fe2a6db5a3c625d9c..de48b6fac9c7d8125a552022c52353ef6bcef995 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1453,6 +1453,24 @@ void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
               true_type() /* bAsRowVector */, false_type());
 }
 
+template<class T>
+void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset,
+              false_type(), true_type() /* bAsColVector */);
+}
+
+template<class T>
+void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset,
+              false_type(), true_type() /* bAsColVector */);
+}
+
 template<>
 template <class Agg>
 int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 8691c87ac3b88499a9676d59af533e0f4713dfc3..6ed48c8d88ee698689de6f7a7f470b97a094ea5b 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -545,6 +545,9 @@ public:
   void mulRowVector(BaseMatrixT& b);
   void divRowVector(BaseMatrixT& b);
 
+  void mulColVector(BaseMatrixT& b);
+  void divColVector(BaseMatrixT& b);
+
   void addP2P(BaseMatrixT& b);
 
   /**
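Side note for reviewers, not part of the patch: the semantics I expect from the new column-vector ops, sketched with numpy broadcasting (the shapes mirror the channels_ x spatialDim buffers the layer uses):

```python
import numpy as np

# a: the matrix updated in place; b: a column vector with one entry per row.
a = np.arange(6, dtype=np.float32).reshape(3, 2)       # channels x spatialDim
b = np.array([[1.0], [2.0], [4.0]], dtype=np.float32)  # channels x 1

# mulColVector(b): row i of a is multiplied by b[i, 0]
# divColVector(b): row i of a is divided by b[i, 0]
print(a * b)
print(a / b)
# mulRowVector/divRowVector are the existing transposed counterparts:
# they broadcast a 1 x width row vector down every row instead.
```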
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index 21918b86e1ad98766ceaf09dea3020d6e8592191..22ce39701fca7b650fc03794cb0701e0987d2dae 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -110,6 +110,8 @@ TEST(BaseMatrix, BaseMatrix) {
   compare(&BaseMatrix::addRowVector);
   compare(&BaseMatrix::mulRowVector);
   compare(&BaseMatrix::divRowVector);
+  compare(&BaseMatrix::mulColVector);
+  compare(&BaseMatrix::divColVector);
   compare(&BaseMatrix::addP2P);
   compare(&BaseMatrix::invSqrt);
 }
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index c52039219575936414fb17a67f84bd1422035b98..3e6a73dcf868c92426e211f6cae415e2afcf1b8e 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1156,9 +1156,11 @@ def parse_image(image, input_layer_name, image_conf):
 
 def parse_norm(norm, input_layer_name, norm_conf):
     norm_conf.norm_type = norm.norm_type
-    config_assert(norm.norm_type in ['rnorm', 'cmrnorm-projection'],
-                  "norm-type %s is not in [rnorm, 'cmrnorm-projection']" %
-                  norm.norm_type)
+    config_assert(
+        norm.norm_type in
+        ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
+        "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]"
+        % norm.norm_type)
     norm_conf.channels = norm.channels
     norm_conf.size = norm.size
     norm_conf.scale = norm.scale
@@ -1619,16 +1621,6 @@ class PriorBoxLayer(LayerBase):
         self.config.size = size
 
 
-@config_layer('normalize')
-class NormalizeLayer(LayerBase):
-    def __init__(self, name, inputs, size, num_filters, **xargs):
-        super(NormalizeLayer, self).__init__(name, 'normalize', 0, inputs,
-                                             **xargs)
-        self.config.size = size
-        self.config.num_filters = num_filters
-        self.create_input_parameter(0, num_filters, [num_filters, 1])
-
-
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self, name, size, height=None, width=None, device=None):
@@ -1831,6 +1823,9 @@ class NormLayer(LayerBase):
                    norm_conf)
         self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
                            norm_conf.channels, False)
+        if norm_conf.norm_type == "cross-channel-norm":
+            self.create_input_parameter(0, norm_conf.channels,
+                                        [norm_conf.channels, 1])
 
 
 @config_layer('pool')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 1541b532d950a22a5d2e9928626d1b7a047c1fe1..b6a94264765b7f41d1d033ead5bc891569a93974 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -111,7 +111,7 @@ __all__ = [
     'out_prod_layer',
     'print_layer',
     'priorbox_layer',
-    'normalize_layer',
+    'cross_channel_norm_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -185,7 +185,6 @@ class LayerType(object):
 
     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
-    NORMALIZE_LAYER = "normalize"
 
     CTC_LAYER = "ctc"
     WARP_CTC_LAYER = "warp_ctc"
@@ -1000,8 +999,8 @@ def priorbox_layer(input,
         size=size)
 
 
-@wrap_name_default("normalize")
-def normalize_layer(input, name=None, param_attr=None):
+@wrap_name_default("cross_channel_norm")
+def cross_channel_norm_layer(input, name=None, param_attr=None):
     """
     Normalize a layer's output. This layer is necessary for ssd.
     This layer applys normalize across the channels of each sample to
@@ -1017,13 +1016,22 @@ def normalize_layer(input, name=None, param_attr=None):
     """
     Layer(
         name=name,
-        type=LayerType.NORMALIZE_LAYER,
-        inputs=[Input(input.name, **param_attr.attr)],
-        size=input.size,
-        num_filters=input.num_filters)
+        type=LayerType.NORM_LAYER,
+        inputs=[
+            Input(
+                input.name,
+                norm=Norm(
+                    norm_type="cross-channel-norm",
+                    channels=input.num_filters,
+                    size=input.size,
+                    scale=0,
+                    pow=0,
+                    blocked=0),
+                **param_attr.attr)
+        ])
     return LayerOutput(
         name,
-        LayerType.NORMALIZE_LAYER,
+        LayerType.NORM_LAYER,
         parents=input,
         num_filters=input.num_filters,
         size=input.size)
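Reviewer sketch, not part of the patch: a minimal config showing how the renamed helper would be wired after this change. The layer names, image size, and the SSD-style scale init of 20 are illustrative assumptions, not taken from the patch:

```python
from paddle.trainer_config_helpers import *

# A small conv stack whose output has 10 filters, matching the
# 10-channel case exercised in test_LayerGrad above.
data = data_layer(name='image', size=3 * 32 * 32)
conv = img_conv_layer(
    input=data,
    num_channels=3,
    num_filters=10,
    filter_size=3,
    act=ReluActivation())

# L2-normalize conv's output across channels at each spatial position,
# then rescale each channel by a trainable factor (one weight per channel).
# initial_mean=20 follows the SSD paper's suggested scale init (assumption).
norm = cross_channel_norm_layer(
    input=conv,
    param_attr=ParamAttr(initial_mean=20.0, initial_std=0.0))
```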