Commit 7a1a5863, authored by wangmeng28

Update variable names and docs for factorization machine layer

Parent e5135e8b
......@@ -32,12 +32,10 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap,
/* initialize the latentVectors_ */
CHECK_EQ(inputLayers_.size(), 1UL);
size_t height = inputLayers_[0]->getSize();
CHECK_EQ(parameters_[0]->getSize(), height * factorSize_);
latentVectors_ =
std::unique_ptr<Weight>(new Weight(height, factorSize_, parameters_[0]));
v2_ = Matrix::create(height, factorSize_, false, useGpu_);
size_t inputSize = inputLayers_[0]->getSize();
CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
latentVectors_ = std::unique_ptr<Weight>(
new Weight(inputSize, factorSize_, parameters_[0]));
return true;
}
......@@ -48,48 +46,49 @@ void FactorizationMachineLayer::forward(PassType passType) {
const MatrixPtr& inputV = getInputValue(0);
size_t batchSize = inputV->getHeight();
size_t size = getSize();
reserveOutput(batchSize, size);
size_t outputSize = getSize();
size_t inputSize = inputLayers_[0]->getSize();
reserveOutput(batchSize, outputSize);
MatrixPtr outV = getOutputValue();
Matrix::resizeOrCreate(tmpMul_, batchSize, factorSize_, false, useGpu_);
Matrix::resizeOrCreate(
latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
Matrix::resizeOrCreate(
inputMulFactor_, batchSize, factorSize_, false, useGpu_);
Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
tmpMul_->mul(*inputV, *latentVectors_->getW());
tmpMul_->square2(*tmpOut_);
REGISTER_TIMER_INFO("InputMulFactorTimer", getName().c_str());
inputMulFactor_->mul(*inputV, *latentVectors_->getW());
inputMulFactor_->square2(*tmpOut_);
outV->sumRows(*tmpOut_, 0.5, 0);
x2_ = inputV->clone(0, 0, useGpu_);
if (dynamic_cast<CpuSparseMatrix*>(x2_.get())) {
x2_->copyFrom(*inputV);
(dynamic_cast<CpuSparseMatrix*>(x2_.get()))->square2();
inputSquare_ = inputV->clone(0, 0, useGpu_);
if (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get())) {
inputSquare_->copyFrom(*inputV);
(dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
} else {
inputV->square2(*x2_);
inputV->square2(*inputSquare_);
}
latentVectors_->getW()->square2(*v2_);
tmpOut_->mul(*x2_, *v2_);
latentVectors_->getW()->square2(*latentVectorsSquare_);
tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
outV->sumRows(*tmpOut_, -0.5, 1.0);
/* activation */ {
REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
REGISTER_TIMER_INFO("FmAtvTimer", getName().c_str());
forwardActivation();
}
}
void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
/* Do derivation */ {
REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
backwardActivation();
}
/* Do derivation */ { backwardActivation(); }
const MatrixPtr& inputV = getInputValue(0);
const MatrixPtr& oGrad = getOutputGrad();
MatrixPtr tmpSum =
Matrix::create(1, latentVectors_->getW()->getHeight(), false, useGpu_);
MatrixPtr tmpSum_T = Matrix::create(tmpSum->getRowBuf(0),
Matrix::resizeOrCreate(
tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
latentVectors_->getW()->getHeight(),
1,
false,
......@@ -97,30 +96,35 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
/* Calculate the gradients of the latentVectors_ matrix */
if (latentVectors_->getWGrad()) {
MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_);
MatrixPtr tmpInput = inputV->clone(0, 0, useGpu_);
if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
CpuSparseMatrix* inputV_s = dynamic_cast<CpuSparseMatrix*>(inputV.get());
CpuSparseMatrix* x2_s = dynamic_cast<CpuSparseMatrix*>(x2_.get());
CpuSparseMatrix* tmpIn_s = dynamic_cast<CpuSparseMatrix*>(tmpIn.get());
tmpIn_s->copyFrom(*inputV_s);
tmpIn_s->rowScale(0, *inputV_s, *oGrad);
latentVectors_->getWGrad()->mul(*tmpIn_s->getTranspose(), *tmpMul_, 1, 1);
tmpIn_s->rowScale(0, *x2_s, *oGrad);
MatrixPtr ones = Matrix::create(1, inputV->getHeight(), false, useGpu_);
ones->zeroMem();
ones->add(-1);
tmpSum->mul(*ones, *tmpIn_s, 1, 0);
CpuSparseMatrix* sparseInputV =
dynamic_cast<CpuSparseMatrix*>(inputV.get());
CpuSparseMatrix* sparseInputSquare =
dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
CpuSparseMatrix* sparseTmpInput =
dynamic_cast<CpuSparseMatrix*>(tmpInput.get());
sparseTmpInput->copyFrom(*sparseInputV);
sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
latentVectors_->getWGrad()->mul(
*sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
negOnes_->zeroMem();
negOnes_->add(-1);
tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
} else {
tmpIn->rowScale(0, *inputV, *oGrad);
latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
tmpIn->rowScale(0, *x2_, *oGrad);
tmpInput->rowScale(0, *inputV, *oGrad);
latentVectors_->getWGrad()->mul(
*tmpInput->getTranspose(), *inputMulFactor_, 1, 1);
tmpInput->rowScale(0, *inputSquare_, *oGrad);
tmpSum->sumCols(*tmpIn, -1, 0);
tmpSum_->sumCols(*tmpInput, -1, 0);
}
latentVectors_->getWGrad()->addRowScale(
0, *latentVectors_->getW(), *tmpSum_T);
0, *latentVectors_->getW(), *tmpSumTrans);
/* Increasing the number of gradient updates */
latentVectors_->getParameterPtr()->incUpdate(callback);
......@@ -129,10 +133,10 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
/* Calculate the input layer's gradient */
MatrixPtr inGrad = getInputGrad(0);
if (inGrad != NULL) {
MatrixPtr latentVectors_T = latentVectors_->getW()->getTranspose();
inGrad->mul(*tmpMul_, *latentVectors_T, 1, 1);
tmpSum_T->sumRows(*v2_, -1, 0);
inGrad->addColScale(0, *inputV, *tmpSum);
inGrad->mul(
*inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
inGrad->addColScale(0, *inputV, *tmpSum_);
inGrad->rowScale(0, *inGrad, *oGrad);
}
}
......
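For reference, the forward pass above implements the standard factorization machine reformulation of the pairwise interaction term, which lets the layer compute it from inputMulFactor_ (input times latent vectors), inputSquare_ and latentVectorsSquare_ in O(n*k) time instead of summing over all feature pairs. A minimal NumPy sketch of that identity (a verification sketch; the variable names are illustrative and not taken from the commit):

import numpy as np

n, k = 6, 3                # input size, factor size
x = np.random.randn(n)     # one input example
V = np.random.randn(n, k)  # latent vectors, one row per input dimension

# Naive pairwise form: sum_{i<j} <v_i, v_j> * x_i * x_j
naive = sum(V[i].dot(V[j]) * x[i] * x[j]
            for i in range(n) for j in range(i + 1, n))

# Reformulated form mirrored by forward():
# 0.5 * ( sum_f (x V)_f^2  -  (x*x) dot (V*V), summed over f )
x_mul_factor = x.dot(V)  # analogue of inputMulFactor_
reformulated = 0.5 * (np.square(x_mul_factor).sum()
                      - np.square(x).dot(np.square(V)).sum())

assert np.allclose(naive, reformulated)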
......@@ -34,27 +34,36 @@ namespace paddle {
* y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
* \f]
*
* The detailed calculation of the forward and backward passes can be found in the following paper:
*
* Rendle, Steffen. Factorization machines. IEEE 10th International
* Conference on Data Mining (ICDM). IEEE, 2010.
*
* The config file api is factorization_machine.
*/
class FactorizationMachineLayer : public Layer {
protected:
/// The latent vectors, shape: (size, factorSize_)
/// Each row of the latentVectors_ matrix is the latent vector
/// corresponding to one input feature dimension
// The latent vectors, shape: (size, factorSize_)
// Each row of the latentVectors_ matrix is the latent vector
// corresponding to one input feature dimension
std::unique_ptr<Weight> latentVectors_;
/// The hyperparameter that defines the dimensionality of the factorization
// The hyperparameter that defines the dimensionality of the factorization
size_t factorSize_;
private:
/// The result of input matrix * letent vector matrix that will be used in
/// both forward and backward step
MatrixPtr tmpMul_;
// Store the square values of the latent vectors matrix
MatrixPtr latentVectorsSquare_;
// Store the square values of input matrix
MatrixPtr inputSquare_;
// The result of input matrix * latent vector matrix that will be used in
// both forward and backward step
MatrixPtr inputMulFactor_;
// Buffer for temporary calculation results
MatrixPtr tmpOut_;
/// Store the square values of the letent vectors matrix
MatrixPtr v2_;
/// Store the square values of input matrix
MatrixPtr x2_;
MatrixPtr tmpSum_;
// Row vector filled with -1, used to compute negative column sums of the sparse input
MatrixPtr negOnes_;
public:
explicit FactorizationMachineLayer(const LayerConfig& config)
......
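The class comment above points to Rendle's paper for the forward and backward details. For quick reference, the per-example gradients that backward() accumulates can be written, in the notation of the class comment, as follows (a derivation sketch added for this edit, not text from the commit):

\f[
\frac{\partial y}{\partial v_{i,f}} = x_i \sum_{j=1}^n v_{j,f} x_j - v_{i,f} x_i^2,
\qquad
\frac{\partial y}{\partial x_i} = \sum_{f=1}^k v_{i,f} \sum_{j=1}^n v_{j,f} x_j - x_i \sum_{f=1}^k v_{i,f}^2
\f]

The first expression corresponds to the two updates of latentVectors_->getWGrad() (the mul with inputMulFactor_ and the addRowScale with tmpSum_); the second corresponds to the updates of inGrad (the mul with the transposed latent vectors and the addColScale with tmpSum_). Both are additionally scaled by the output gradient via rowScale.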
......@@ -2442,6 +2442,7 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) {
TEST(Layer, FactorizationMachineLayer) {
for (auto useGpu : {false, true}) {
testFactorizationMachineLayer(INPUT_DATA, useGpu);
testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, useGpu);
}
}
......
......@@ -262,15 +262,15 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
CHECK(getFormat() != SPARSE_CSC) << "Not supported";
CHECK(height_ == b.getHeight());
CHECK(width_ == b.getWidth());
CHECK_EQ(height_, b.getHeight());
CHECK_EQ(width_, b.getWidth());
real* A = getValue();
real* B = b.getValue();
for (size_t i = 0; i < height_; i++) {
size_t start = getRowStartIdx(i);
size_t end = getRowStartIdx(i + 1);
CHECK(start == b.getRowStartIdx(i));
CHECK(end == b.getRowStartIdx(i + 1));
CHECK_EQ(start, b.getRowStartIdx(i));
CHECK_EQ(end, b.getRowStartIdx(i + 1));
for (size_t j = start; j < end; j++) {
A[j] = B[j] * c.getElement(i, cCol);
}
......
......@@ -7161,16 +7161,26 @@ def factorization_machine(input,
The Factorization Machine models pairwise feature interactions as inner
product of the learned latent vectors corresponding to each input feature.
The Factorization Machine can effectively capture feature interactions
especially when the input is sparse. In practice, usually order 2 feature
interactions are considered using Factorization Machine with the formula:
especially when the input is sparse.
This implementation only considers the second-order feature interactions,
which are computed with the formula:
.. math::
y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
Note:
X is the input vector of size n. V is the factor matrix. Each row of V
is the latent vector corresponding to one input dimension. The size of
each latent vector is k.
For details of Factorization Machine, please refer to the paper:
Rendle, Steffen. Factorization machines. IEEE 10th International
Conference on Data Mining (ICDM). IEEE, 2010.
.. code-block:: python
factor_machine = factorization_machine(input=input_layer, factor_size=10)
:param input: The input layer.
:type input: LayerOutput
:param factor_size: The hyperparameter that defines the dimensionality of
......
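The docstring above shows the second-order term on its own. A full factorization machine model usually also includes a first-order (linear) term and a bias; a sketch of how that might be wired up with the v1 trainer_config_helpers API is given below (the layer names, sizes, and activations are illustrative assumptions, not part of the commit):

from paddle.trainer_config_helpers import *

x = data_layer(name='features', size=1024)
# first-order (linear) term over the raw features, with its own bias
first_order = fc_layer(input=x, size=1, act=LinearActivation())
# second-order pairwise interaction term from the factorization machine
second_order = factorization_machine(input=x, factor_size=10)
# combine both terms; each produces a single value per example
prediction = addto_layer(input=[first_order, second_order],
                         act=SigmoidActivation(),
                         bias_attr=False)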