From 82091035514c0ddeae2c18ff5f523a2647d59948 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 22 Dec 2017 13:43:25 +0800
Subject: [PATCH] follow comments and refine code

---
 paddle/gserver/layers/MKLPackedGemm.h         |  95 ---------
 .../layers/MKLPackedRecurrentLayer.cpp        | 191 ++----------------
 .../gserver/layers/MKLPackedRecurrentLayer.h  |  87 ++------
 paddle/gserver/layers/MKLPackedWeight.h       | 100 +++++++++
 4 files changed, 125 insertions(+), 348 deletions(-)
 delete mode 100644 paddle/gserver/layers/MKLPackedGemm.h
 create mode 100644 paddle/gserver/layers/MKLPackedWeight.h

diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h
deleted file mode 100644
index 91e2515e323..00000000000
--- a/paddle/gserver/layers/MKLPackedGemm.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/math/MathFunctions.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-class MKLPackedGemm {
-protected:
-  real* weightPacked_;
-  real* weightTPacked_;
-  size_t weightHeight_;
-  size_t weightWidth_;
-
-public:
-  explicit MKLPackedGemm(MatrixPtr weight) {
-    weightHeight_ = weight->getHeight();
-    weightWidth_ = weight->getWidth();
-    weightPacked_ =
-        cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_);
-    weightTPacked_ =
-        cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_);
-    cblas_sgemm_pack(CblasRowMajor,
-                     CblasBMatrix,
-                     CblasNoTrans,
-                     1,
-                     weightWidth_,
-                     weightHeight_,
-                     1.0,
-                     weight->getData(),
-                     weightWidth_,
-                     weightPacked_);
-    cblas_sgemm_pack(CblasRowMajor,
-                     CblasBMatrix,
-                     CblasTrans,
-                     1,
-                     weightWidth_,
-                     weightHeight_,
-                     1.0,
-                     weight->getData(),
-                     weightWidth_,
-                     weightTPacked_);
-  }
-  void compute(MatrixPtr batch2, MatrixPtr batch1, bool transW = false) {
-    if (transW) {
-      cblas_sgemm_compute(CblasRowMajor,
-                          CblasNoTrans,
-                          CblasPacked,
-                          batch2->getHeight(),
-                          weightWidth_,
-                          weightHeight_,
-                          batch1->getData(),
-                          weightHeight_,
-                          weightTPacked_,
-                          weightWidth_,
-                          1,
-                          batch2->getData(),
-                          weightWidth_);
-    } else {
-      cblas_sgemm_compute(CblasRowMajor,
-                          CblasNoTrans,
-                          CblasPacked,
-                          batch2->getHeight(),
-                          weightWidth_,
-                          weightHeight_,
-                          batch1->getData(),
-                          weightHeight_,
-                          weightPacked_,
-                          weightWidth_,
-                          1,
-                          batch2->getData(),
-                          weightWidth_);
-    }
-  }
-  ~MKLPackedGemm() {
-    cblas_sgemm_free(weightPacked_);
-    cblas_sgemm_free(weightTPacked_);
-  }
-};
-
-} // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp
index 6f455af91ed..bd3c4ceb5e0 100644
--- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp
@@ -20,188 +20,21 @@ REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer);
 
 bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap,
                                    const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-  CHECK_EQ(getSize() * getSize(), parameters_[0]->getSize());
-  weight_.reset(new Weight(getSize(), getSize(), parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, getSize(), biasParameter_));
+  if (!RecurrentLayer::init(layerMap, parameterMap)) return false;
+  packed_weight_.reset(new MKLPackedWeight(weight_->getW()));
+  packed_weight_->pack();
+  if (needGradient_) {
+    packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true));
+    packed_weightT_->pack();
   }
-  reversed_ = config_.reversed();
-
-  sgemm_packed_.reset(new MKLPackedGemm(weight_->getW()));
-
   return true;
 }
 
-void MKLPackedRecurrentLayer::resetState() {
-  CHECK(!reversed_) << "state is not allowed for reversed recurrent layer";
-  Matrix::resizeOrCreate(
-      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
-  prevOutput_->zeroMem();
-}
-
-void MKLPackedRecurrentLayer::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 1) << "one matrix is expected for RNN state";
-  prevOutput_->copyFrom(*(state->value[0]));
-}
-
-LayerStatePtr MKLPackedRecurrentLayer::getState() {
-  LayerStatePtr res = std::make_shared<LayerState>();
-  res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
-  res->value[0]->copyFrom(*prevOutput_);
-  return res;
-}
-
-void MKLPackedRecurrentLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("RecurrentFwTimer", getName().c_str());
-  Layer::forward(passType);
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-  resetOutput(batchSize, getSize());
-  CHECK_EQ(getSize(), input.value->getWidth());
-  const int* starts = input.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  output_.value->assign(*input.value);
-  if (bias_) {
-    output_.value->addBias(*bias_->getW(), 1);
-  }
-  if (!FLAGS_rnn_use_batch) {
-    forwardSequence(batchSize, numSequences, starts);
-  } else {
-    forwardBatch(batchSize, numSequences, starts);
-  }
-}
-
-void MKLPackedRecurrentLayer::forwardSequence(int batchSize,
-                                              size_t numSequences,
-                                              const int* starts) {
-  REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str());
-
-  frameOutput_.reserve(batchSize);
-  for (int i = frameOutput_.size(); i < batchSize; ++i) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               getSize(),
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              getSize(),
-                              /* trans= */ false,
-                              useGpu_);
-    frameOutput_.push_back(arg);
-  }
-
-  for (int i = 0; i < batchSize; ++i) {
-    frameOutput_[i].value->setData(output_.value->getData() + i * getSize());
-  }
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    forwardOneSequence(starts[i], starts[i + 1] - starts[i]);
-  }
-}
-
-void MKLPackedRecurrentLayer::forwardOneSequence(int start, int length) {
-  if (!reversed_) {
-    if (prevOutput_) {
-      frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1);
-    }
-    activation_->forward(frameOutput_[start]).check();
-
-    for (int i = 1; i < length; ++i) {
-      frameOutput_[start + i].value->mul(
-          *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]).check();
-    }
-    if (prevOutput_) {
-      prevOutput_->assign(*frameOutput_[start + length - 1].value);
-    }
-  } else {
-    activation_->forward(frameOutput_[start + length - 1]).check();
-    for (int i = length - 2; i >= 0; --i) {
-      frameOutput_[start + i].value->mul(
-          *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]).check();
-    }
-  }
-}
-
 void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("RecurrentBwTimer", getName().c_str());
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  size_t numSequences = input.getNumSequences();
-
-  if (!FLAGS_rnn_use_batch) {
-    backwardSequence(batchSize, numSequences, starts);
-  } else {
-    backwardBatch(batchSize, numSequences, starts);
-  }
-
-  if (input.grad) {
-    input.grad->add(*output_.grad);
-  }
-
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*output_.grad, 1);
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-  sgemm_packed_.reset(new MKLPackedGemm(weight_->getW()));
-}
-
-void MKLPackedRecurrentLayer::backwardSequence(int batchSize,
-                                               size_t numSequences,
-                                               const int* starts) {
-  REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str());
-  for (int i = 0; i < batchSize; ++i) {
-    frameOutput_[i].grad->setData(output_.grad->getData() + i * getSize());
-  }
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    backwardOneSequence(starts[i], starts[i + 1] - starts[i]);
-  }
-}
-
-void MKLPackedRecurrentLayer::backwardOneSequence(int start, int length) {
-  MatrixPtr weightT = weight_->getW()->getTranspose();
-  if (!reversed_) {
-    for (int i = length - 1; i > 0; --i) {
-      activation_->backward(frameOutput_[start + i]).check();
-      frameOutput_[start + i - 1].grad->mul(
-          *frameOutput_[start + i].grad, *weightT, 1, 1);
-    }
-    activation_->backward(frameOutput_[start]).check();
-    if (weight_->getWGrad()) {
-      weight_->getWGrad()->mul(
-          *output_.value->subMatrix(start, length - 1)->getTranspose(),
-          *output_.grad->subMatrix(start + 1, length - 1),
-          1,
-          1);
-    }
-  } else {
-    for (int i = 0; i < length - 1; ++i) {
-      activation_->backward(frameOutput_[start + i]).check();
-      frameOutput_[start + i + 1].grad->mul(
-          *frameOutput_[start + i].grad, *weightT, 1, 1);
-    }
-    activation_->backward(frameOutput_[start + length - 1]).check();
-    if (weight_->getWGrad()) {
-      weight_->getWGrad()->mul(
-          *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
-          *output_.grad->subMatrix(start, length - 1),
-          1,
-          1);
-    }
+  RecurrentLayer::backward(callback);
+  packed_weight_->pack();
+  if (needGradient_) {
+    packed_weightT_->pack();
   }
 }
 
@@ -227,7 +60,7 @@ void MKLPackedRecurrentLayer::forwardBatch(int batchSize,
             batchValue_->getBatchValue(n - 1, batch2->getHeight());
 
         // batch2->mul(*batch1, *weight_->getW(), 1, 1);
-        sgemm_packed_->compute(batch2, batch1);
+        packed_weight_->compute(batch2, batch1);
       }
 
 #pragma omp parallel for collapse(2)
@@ -272,7 +105,7 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize,
       if (n != 0) {
         batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight());
         // batch1->mul(*batch2, *weightT, 1, 1);
-        sgemm_packed_->compute(batch1, batch2, true);
+        packed_weightT_->compute(batch1, batch2);
       }
 
       if (backwardByBatch && weight_->getWGrad()) {
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
index b8727e0ff30..ba6487b11e0 100644
--- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
@@ -16,7 +16,8 @@ limitations under the License. */
 
 #include <gflags/gflags.h>
 #include "Layer.h"
-#include "MKLPackedGemm.h"
+#include "MKLPackedWeight.h"
+#include "RecurrentLayer.h"
 #include "SequenceToBatch.h"
 #include "paddle/utils/Stat.h"
 
@@ -45,90 +46,28 @@ namespace paddle {
  * them by rnn_use_batch flag.
  */
 
-class MKLPackedRecurrentLayer : public Layer {
+class MKLPackedRecurrentLayer : public RecurrentLayer {
 public:
-  explicit MKLPackedRecurrentLayer(const LayerConfig& config) : Layer(config) {}
+  explicit MKLPackedRecurrentLayer(const LayerConfig& config)
+      : RecurrentLayer(config) {}
 
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType) override;
-
   void backward(const UpdateCallback& callback) override;
 
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
 protected:
-  /**
-   * @brief If user do not set --rnn_use_batch=true, it will
-   * compute rnn forward one sequence by one sequence in default.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn forward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void forwardOneSequence(int start, int length);
-  /**
-   * @brief Compute rnn backward one sequence by onesequence.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn backward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void backwardOneSequence(int start, int length);
+  void forwardBatch(int batchSize,
+                    size_t numSequences,
+                    const int* starts) override;
 
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch. It will convert batch shape to sequence after finishing forward.
-   * The batch info can refer to SequenceToBatch class.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardBatch(int batchSize, size_t numSequences, const int* starts);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardBatch(int batchSize, size_t numSequences, const int* starts);
+  void backwardBatch(int batchSize,
+                     size_t numSequences,
+                     const int* starts) override;
 
 protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
-  /// frameOutput_[i] is used to hold the i-th sample of output_
-  std::vector<Argument> frameOutput_;
-  MatrixPtr prevOutput_;
-  /// Whether compute rnn by reverse.
-  bool reversed_;
-  /// If compute batch by batch, batchValue_ will be used to save the
-  /// reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// If compute batch by batch, batchGrad_ will be used to save the
-  /// gradient with respect to reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-
-  std::unique_ptr<MKLPackedGemm> sgemm_packed_;
+  std::unique_ptr<MKLPackedWeight> packed_weight_;
+  std::unique_ptr<MKLPackedWeight> packed_weightT_;
 };
 
 } // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h
new file mode 100644
index 00000000000..a8dcfd561b6
--- /dev/null
+++ b/paddle/gserver/layers/MKLPackedWeight.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/math/MathFunctions.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/Weight.h"
+
+namespace paddle {
+
+class MKLPackedWeight {
+protected:
+  real *weight_;
+  real *packedWeight_;
+  size_t height_;
+  size_t width_;
+  bool transW_;
+
+public:
+  MKLPackedWeight(MatrixPtr weight, bool transW = false) {
+    packedWeight_ = nullptr;
+    weight_ = weight->getData();
+    height_ = weight->getHeight();
+    width_ = weight->getWidth();
+    transW_ = transW;
+  }
+
+  ~MKLPackedWeight() { free_(); }
+
+  void pack() { pack_(weight_); }
+
+  void compute(MatrixPtr dst, MatrixPtr src) {
+    cblas_sgemm_compute(CblasRowMajor,
+                        CblasNoTrans,
+                        CblasPacked,
+                        src->getHeight(),
+                        transW_ ? height_ : width_,
+                        transW_ ? width_ : height_,
+                        src->getData(),
+                        src->getWidth(),
+                        packedWeight_,
+                        width_,
+                        1.0,
+                        dst->getData(),
+                        dst->getWidth());
+  }
+
+  void compute(size_t M, real *A, size_t lda, real *C, size_t ldc) {
+    cblas_sgemm_compute(CblasRowMajor,
+                        CblasNoTrans,
+                        CblasPacked,
+                        M,
+                        width_,
+                        height_,
+                        A,
+                        lda,
+                        packedWeight_,
+                        width_,
+                        1.0,
+                        C,
+                        ldc);
+  }
+
+protected:
+  void pack_(real *src) {
+    if (!packedWeight_) {
+      packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
+    }
+    cblas_sgemm_pack(CblasRowMajor,
+                     CblasBMatrix,
+                     transW_ ? CblasTrans : CblasNoTrans,
+                     1,
+                     transW_ ? height_ : width_,
+                     transW_ ? width_ : height_,
+                     1.0,
+                     src,
+                     width_,
+                     packedWeight_);
+  }
+
+  void free_() {
+    if (packedWeight_) {
+      cblas_sgemm_free(packedWeight_);
+    }
+  }
+};
+
+} // namespace paddle
--
GitLab