Commit eb43d93a, authored by gaoyuan

Change Normalize layer to CrossChannelNorm layer

Parent eea0097d
@@ -13,53 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "Layer.h"
+#include "NormLayer.h"
 #include "paddle/math/BaseMatrix.h"
 #include "paddle/math/Matrix.h"

 namespace paddle {

-/**
- * This layer applys normalize across the channels of each sample to a
- * conv layer's output and scale the output by a group of trainable factors
- * which dimensions equal to the channel's number.
- * - Input: One and only one input layer are accepted. The input layer must be
- *          be a data output layer.
- * - Output: The normalized data of the input data.
- * Reference:
- *   Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *   Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-class NormalizeLayer : public Layer {
-public:
-  explicit NormalizeLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
-
-protected:
-  size_t channels_;
-  std::unique_ptr<Weight> scale_;
-  MatrixPtr scaleDiff_;
-  MatrixPtr normBuffer_;
-  MatrixPtr dataBuffer_;
-  MatrixPtr channelBuffer_;
-  MatrixPtr spatialBuffer_;
-  MatrixPtr sampleBuffer_;
-};
-
-REGISTER_LAYER(normalize, NormalizeLayer);
-
-bool NormalizeLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK(parameters_[0]);
-  channels_ = config_.num_filters();
-  scale_.reset(new Weight(channels_, 1, parameters_[0]));
-  return true;
-}
-
-void NormalizeLayer::forward(PassType passType) {
+void CrossChannelNormLayer::forward(PassType passType) {
   Layer::forward(passType);
   auto in = getInput(0);
   MatrixPtr inV = getInputValue(0);
@@ -74,16 +34,12 @@ void NormalizeLayer::forward(PassType passType) {
   Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
   Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
-  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
-  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
   Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
   normBuffer_->zeroMem();
   spatialBuffer_->zeroMem();
-  sampleBuffer_->zeroMem();
   dataBuffer_->zeroMem();
   // add eps to avoid overflow
   normBuffer_->addScalar(*normBuffer_, 1e-6);
-  channelBuffer_->resetOne();
   inV->square2(*dataBuffer_);
   for (size_t i = 0; i < batchSize; i++) {
     spatialBuffer_->zeroMem();
@@ -102,18 +58,14 @@ void NormalizeLayer::forward(PassType passType) {
     spatialBuffer_->sumCols(*dataTmp, 1, 1);
     spatialBuffer_->sqrt2(*spatialBuffer_);
     normTmp->copyFrom(*spatialBuffer_);
-    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
-    sampleBuffer_->dotDiv(*inTmp, *sampleBuffer_);
-    outTmp->copyFrom(*sampleBuffer_);
+    outTmp->copyFrom(*inTmp);
+    outTmp->divRowVector(*spatialBuffer_);
     // scale the layer.
-    spatialBuffer_->resetOne();
-    sampleBuffer_->mul(*scale_->getW(), *spatialBuffer_, 1., 0.);
-    outTmp->dotMul(*outTmp, *sampleBuffer_);
+    outTmp->mulColVector(*scale_->getW());
   }
 }

-void NormalizeLayer::backward(const UpdateCallback& callback) {
+void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
   MatrixPtr inG = getInputGrad(0);
   MatrixPtr inV = getInputValue(0);
   MatrixPtr outG = getOutputGrad();
@@ -124,9 +76,10 @@ void NormalizeLayer::backward(const UpdateCallback& callback) {
   size_t dataDim = inG->getWidth();
   size_t spatialDim = dataDim / channels_;

-  bool syncFlag = hl_get_sync_flag();
   dataBuffer_->dotMul(*outG, *outV);
   Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
   scaleDiff_->zeroMem();
   for (size_t i = 0; i < batchSize; i++) {
     spatialBuffer_->zeroMem();
@@ -154,28 +107,20 @@ void NormalizeLayer::backward(const UpdateCallback& callback) {
     sampleBuffer_->dotMul(*inValueTmp, *outGradTmp);
     spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
     // scale the grad
-    channelBuffer_->resetOne();
-    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
-    inGradTmp->dotMul(*inValueTmp, *sampleBuffer_);
+    inGradTmp->copyFrom(*inValueTmp);
+    inGradTmp->mulRowVector(*spatialBuffer_);
     // divide by square of norm
     spatialBuffer_->dotMul(*normTmp, *normTmp);
-    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
-    inGradTmp->dotDiv(*inGradTmp, *sampleBuffer_);
+    inGradTmp->divRowVector(*spatialBuffer_);
     // subtract
     inGradTmp->add(*outGradTmp, -1, 1);
     // divide by norm
-    sampleBuffer_->mul(*channelBuffer_, *normTmp, 1., 0.);
-    inGradTmp->dotDiv(*inGradTmp, *sampleBuffer_);
+    inGradTmp->divRowVector(*normTmp);
     // scale the diff
-    spatialBuffer_->resetOne();
-    sampleBuffer_->mul(*scale_->getW(), *spatialBuffer_, 1., 0.);
-    inGradTmp->dotMul(*inGradTmp, *sampleBuffer_);
+    inGradTmp->mulColVector(*scale_->getW());
   }
   // updata scale
   if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
-  hl_set_sync_flag(false);
-  hl_set_sync_flag(syncFlag);
   scale_->getParameterPtr()->incUpdate(callback);
 }
...
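For readers tracing the buffer arithmetic above: per sample, with x the input reshaped to channels x spatialDim, gamma the trainable per-channel scale, and eps = 1e-6 added in forward() to avoid division by zero, the rewritten forward and backward passes compute the following (a restatement of the code, not part of the commit):

$$y_{c,s} = \gamma_c \, \frac{x_{c,s}}{n_s}, \qquad n_s = \sqrt{\sum_{c'} x_{c',s}^2 + \epsilon}$$

$$\frac{\partial L}{\partial x_{c,s}} = \frac{\gamma_c}{n_s}\left(\frac{\partial L}{\partial y_{c,s}} - \frac{x_{c,s}}{n_s^{2}}\sum_{c'} x_{c',s}\,\frac{\partial L}{\partial y_{c',s}}\right)$$

The new divRowVector/mulRowVector calls broadcast the per-position quantities n_s and the channel sums across channels, and mulColVector broadcasts gamma across spatial positions, replacing the old channelBuffer_/sampleBuffer_ outer products.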
@@ -26,6 +26,8 @@ Layer* NormLayer::create(const LayerConfig& config) {
     return new ResponseNormLayer(config);
   } else if (norm == "cmrnorm-projection") {
     return new CMRProjectionNormLayer(config);
+  } else if (norm == "cross-channel-norm") {
+    return new CrossChannelNormLayer(config);
   } else {
     LOG(FATAL) << "Unknown norm type: " << norm;
     return nullptr;
@@ -54,4 +56,14 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
   return true;
 }

+bool CrossChannelNormLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK(parameters_[0]);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));
+  return true;
+}
+
 }  // namespace paddle
@@ -65,4 +65,35 @@ public:
   }
 };

+/**
+ * This layer applies normalization across the channels of each sample to a
+ * conv layer's output, and scales the output by a group of trainable factors
+ * whose dimension equals the number of channels.
+ * - Input: One and only one input layer is accepted. The input layer must be
+ *          a data output layer.
+ * - Output: The normalized data of the input data.
+ * Reference:
+ *   Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *   Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+class CrossChannelNormLayer : public NormLayer {
+public:
+  explicit CrossChannelNormLayer(const LayerConfig& config)
+      : NormLayer(config) {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback);
+
+protected:
+  size_t channels_;
+  std::unique_ptr<Weight> scale_;
+  MatrixPtr scaleDiff_;
+  MatrixPtr normBuffer_;
+  MatrixPtr dataBuffer_;
+  MatrixPtr channelBuffer_;
+  MatrixPtr spatialBuffer_;
+  MatrixPtr sampleBuffer_;
+};
+
 }  // namespace paddle
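As a companion to the header comment above, here is a minimal NumPy sketch of what CrossChannelNormLayer computes for one sample; the function and variable names (x, gamma, grad_y) are illustrative and are not part of the Paddle API:

```python
import numpy as np

def cross_channel_norm_forward(x, gamma, eps=1e-6):
    """x: (channels, spatial) slice of one sample; gamma: (channels, 1) trainable scale."""
    # Per-position L2 norm over the channel axis, with eps to avoid division by zero.
    norm = np.sqrt((x * x).sum(axis=0, keepdims=True) + eps)
    return gamma * x / norm, norm

def cross_channel_norm_backward(x, gamma, grad_y, norm):
    """Input gradient, mirroring the divRowVector/mulColVector steps in backward()."""
    s = (x * grad_y).sum(axis=0, keepdims=True)  # sum over channels of x * dL/dy
    return gamma * (grad_y - x * s / (norm * norm)) / norm
```

A quick finite-difference check of cross_channel_norm_backward against cross_channel_norm_forward is an easy way to convince yourself that the gradient coded in C++ above is consistent.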
@@ -1623,17 +1623,22 @@ TEST(Layer, PadLayer) {
   }
 }

-TEST(Layer, NormalizeLayer) {
+TEST(Layer, CrossChannelNormLayer) {
   TestConfig config;
-  config.layerConfig.set_type("normalize");
+  config.layerConfig.set_type("norm");
   config.layerConfig.set_size(100);
-  config.layerConfig.set_num_filters(10);
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type("cross-channel-norm");
+  norm->set_channels(10);
+  norm->set_size(100);
+  norm->set_scale(0);
+  norm->set_pow(0);
+  norm->set_blocked(0);
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
-  config.layerConfig.add_inputs();

   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "normalize", 10, false, useGpu, false, 5);
+    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
   }
 }
...
@@ -1453,6 +1453,24 @@ void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
               true_type() /* bAsRowVector */, false_type());
 }

+template<class T>
+void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset,
+              false_type(), true_type() /* bAsColVector */);
+}
+
+template<class T>
+void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset,
+              false_type(), true_type() /* bAsColVector */);
+}
+
 template<>
 template <class Agg>
 int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
...
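The two new helpers mirror the existing mulRowVector/divRowVector, but broadcast a (height x 1) column vector across every column instead of a (1 x width) row vector across every row. A hypothetical NumPy analogue (not Paddle code) of the in-place semantics:

```python
import numpy as np

a = np.arange(6.0).reshape(2, 3)  # the matrix being updated in place
b = np.array([[10.0], [20.0]])    # column vector: one value per row (e.g. per channel)

a_mul = a * b  # mulColVector: row i of a is scaled by b[i, 0]
a_div = a / b  # divColVector: row i of a is divided by b[i, 0]
```

In CrossChannelNormLayer these column-vector helpers are what apply the per-channel scale to a (channels x spatialDim) sample matrix.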
@@ -545,6 +545,9 @@ public:
   void mulRowVector(BaseMatrixT& b);
   void divRowVector(BaseMatrixT& b);

+  void mulColVector(BaseMatrixT& b);
+  void divColVector(BaseMatrixT& b);
+
   void addP2P(BaseMatrixT& b);

   /**
...
@@ -110,6 +110,8 @@ TEST(BaseMatrix, BaseMatrix) {
   compare(&BaseMatrix::addRowVector);
   compare(&BaseMatrix::mulRowVector);
   compare(&BaseMatrix::divRowVector);
+  compare(&BaseMatrix::mulColVector);
+  compare(&BaseMatrix::divColVector);
   compare(&BaseMatrix::addP2P);
   compare(&BaseMatrix::invSqrt);
 }
...
@@ -1156,9 +1156,11 @@ def parse_image(image, input_layer_name, image_conf):

 def parse_norm(norm, input_layer_name, norm_conf):
     norm_conf.norm_type = norm.norm_type
-    config_assert(norm.norm_type in ['rnorm', 'cmrnorm-projection'],
-                  "norm-type %s is not in [rnorm, 'cmrnorm-projection']" %
-                  norm.norm_type)
+    config_assert(
+        norm.norm_type in
+        ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
+        "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]"
+        % norm.norm_type)
     norm_conf.channels = norm.channels
     norm_conf.size = norm.size
     norm_conf.scale = norm.scale
@@ -1619,16 +1621,6 @@ class PriorBoxLayer(LayerBase):
         self.config.size = size


-@config_layer('normalize')
-class NormalizeLayer(LayerBase):
-    def __init__(self, name, inputs, size, num_filters, **xargs):
-        super(NormalizeLayer, self).__init__(name, 'normalize', 0, inputs,
-                                             **xargs)
-        self.config.size = size
-        self.config.num_filters = num_filters
-        self.create_input_parameter(0, num_filters, [num_filters, 1])
-
-
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self, name, size, height=None, width=None, device=None):
@@ -1831,6 +1823,9 @@ class NormLayer(LayerBase):
                    norm_conf)
         self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
                            norm_conf.channels, False)
+        if norm_conf.norm_type == "cross-channel-norm":
+            self.create_input_parameter(0, norm_conf.channels,
+                                        [norm_conf.channels, 1])


 @config_layer('pool')
...
@@ -111,7 +111,7 @@ __all__ = [
     'out_prod_layer',
     'print_layer',
     'priorbox_layer',
-    'normalize_layer',
+    'cross_channel_norm_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -185,7 +185,6 @@ class LayerType(object):
     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
-    NORMALIZE_LAYER = "normalize"

     CTC_LAYER = "ctc"
     WARP_CTC_LAYER = "warp_ctc"
@@ -1000,8 +999,8 @@ def priorbox_layer(input,
         size=size)


-@wrap_name_default("normalize")
-def normalize_layer(input, name=None, param_attr=None):
+@wrap_name_default("cross_channel_norm")
+def cross_channel_norm_layer(input, name=None, param_attr=None):
     """
     Normalize a layer's output. This layer is necessary for ssd.
     This layer applys normalize across the channels of each sample to
@@ -1017,13 +1016,22 @@ def normalize_layer(input, name=None, param_attr=None):
     """
     Layer(
         name=name,
-        type=LayerType.NORMALIZE_LAYER,
-        inputs=[Input(input.name, **param_attr.attr)],
-        size=input.size,
-        num_filters=input.num_filters)
+        type=LayerType.NORM_LAYER,
+        inputs=[
+            Input(
+                input.name,
+                norm=Norm(
+                    norm_type="cross-channel-norm",
+                    channels=input.num_filters,
+                    size=input.size,
+                    scale=0,
+                    pow=0,
+                    blocked=0),
+                **param_attr.attr)
+        ])
     return LayerOutput(
         name,
-        LayerType.NORMALIZE_LAYER,
+        LayerType.NORM_LAYER,
         parents=input,
         num_filters=input.num_filters,
         size=input.size)
...
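For completeness, a hypothetical usage sketch of the renamed Python API. The data and convolution settings are illustrative only, and the parameter initialization is just one reasonable choice, not something mandated by this commit:

```python
from paddle.trainer_config_helpers import *

# A toy 3-channel 32x32 input fed through a small convolution.
data = data_layer(name="image", size=3 * 32 * 32)
conv = img_conv_layer(
    input=data,
    num_channels=3,
    filter_size=3,
    num_filters=10,
    stride=1,
    padding=1,
    act=ReluActivation())

# Normalizes conv's output across its 10 channels and scales each channel by a
# trainable factor; on the C++ side this is handled by CrossChannelNormLayer.
norm = cross_channel_norm_layer(
    input=conv,
    param_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
```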