diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
index 3bd8d7cb4c7c63fc4fcdc931cf6935b02dbf0824..f0f1738f305503ec262f0de90dc5c4111fd959d0 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -32,12 +32,10 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap,
 
   /* initialize the latentVectors_ */
   CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t height = inputLayers_[0]->getSize();
-  CHECK_EQ(parameters_[0]->getSize(), height * factorSize_);
-  latentVectors_ =
-      std::unique_ptr<Weight>(new Weight(height, factorSize_, parameters_[0]));
-
-  v2_ = Matrix::create(height, factorSize_, false, useGpu_);
+  size_t inputSize = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+  latentVectors_ = std::unique_ptr<Weight>(
+      new Weight(inputSize, factorSize_, parameters_[0]));
 
   return true;
 }
@@ -48,79 +46,85 @@ void FactorizationMachineLayer::forward(PassType passType) {
 
   const MatrixPtr& inputV = getInputValue(0);
 
   size_t batchSize = inputV->getHeight();
-  size_t size = getSize();
-  reserveOutput(batchSize, size);
+  size_t outputSize = getSize();
+  size_t inputSize = inputLayers_[0]->getSize();
+  reserveOutput(batchSize, outputSize);
 
   MatrixPtr outV = getOutputValue();
 
-  Matrix::resizeOrCreate(tmpMul_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
   Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
 
-  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-  tmpMul_->mul(*inputV, *latentVectors_->getW());
-  tmpMul_->square2(*tmpOut_);
+  REGISTER_TIMER_INFO("InputMulFactorTimer", getName().c_str());
+  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+  inputMulFactor_->square2(*tmpOut_);
   outV->sumRows(*tmpOut_, 0.5, 0);
 
-  x2_ = inputV->clone(0, 0, useGpu_);
-  if (dynamic_cast<CpuSparseMatrix*>(x2_.get())) {
-    x2_->copyFrom(*inputV);
-    (dynamic_cast<CpuSparseMatrix*>(x2_.get()))->square2();
+  inputSquare_ = inputV->clone(0, 0, useGpu_);
+  if (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get())) {
+    inputSquare_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
   } else {
-    inputV->square2(*x2_);
+    inputV->square2(*inputSquare_);
   }
-  latentVectors_->getW()->square2(*v2_);
-  tmpOut_->mul(*x2_, *v2_);
+  latentVectors_->getW()->square2(*latentVectorsSquare_);
+  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
   outV->sumRows(*tmpOut_, -0.5, 1.0);
 
   /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    REGISTER_TIMER_INFO("FmAtvTimer", getName().c_str());
    forwardActivation();
   }
 }
 
 void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
+  /* Do derivation */ { backwardActivation(); }
 
   const MatrixPtr& inputV = getInputValue(0);
   const MatrixPtr& oGrad = getOutputGrad();
 
-  MatrixPtr tmpSum =
-      Matrix::create(1, latentVectors_->getW()->getHeight(), false, useGpu_);
-  MatrixPtr tmpSum_T = Matrix::create(tmpSum->getRowBuf(0),
-                                      latentVectors_->getW()->getHeight(),
-                                      1,
-                                      false,
-                                      useGpu_);
+  Matrix::resizeOrCreate(
+      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+                                         latentVectors_->getW()->getHeight(),
+                                         1,
+                                         false,
+                                         useGpu_);
 
   /* Calculate the gradients of the latentVectors_ matrix */
   if (latentVectors_->getWGrad()) {
-    MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_);
+    MatrixPtr tmpInput = inputV->clone(0, 0, useGpu_);
     if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
-      CpuSparseMatrix* inputV_s = dynamic_cast<CpuSparseMatrix*>(inputV.get());
-      CpuSparseMatrix* x2_s = dynamic_cast<CpuSparseMatrix*>(x2_.get());
-      CpuSparseMatrix* tmpIn_s = dynamic_cast<CpuSparseMatrix*>(tmpIn.get());
-      tmpIn_s->copyFrom(*inputV_s);
-      tmpIn_s->rowScale(0, *inputV_s, *oGrad);
-      latentVectors_->getWGrad()->mul(*tmpIn_s->getTranspose(), *tmpMul_, 1, 1);
-      tmpIn_s->rowScale(0, *x2_s, *oGrad);
-
-      MatrixPtr ones = Matrix::create(1, inputV->getHeight(), false, useGpu_);
-      ones->zeroMem();
-      ones->add(-1);
-      tmpSum->mul(*ones, *tmpIn_s, 1, 0);
+      CpuSparseMatrix* sparseInputV =
+          dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* sparseInputSquare =
+          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+      CpuSparseMatrix* sparseTmpInput =
+          dynamic_cast<CpuSparseMatrix*>(tmpInput.get());
+      sparseTmpInput->copyFrom(*sparseInputV);
+      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+
+      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+      negOnes_->zeroMem();
+      negOnes_->add(-1);
+      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
     } else {
-      tmpIn->rowScale(0, *inputV, *oGrad);
-      latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
-      tmpIn->rowScale(0, *x2_, *oGrad);
+      tmpInput->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *tmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput->rowScale(0, *inputSquare_, *oGrad);
 
-      tmpSum->sumCols(*tmpIn, -1, 0);
+      tmpSum_->sumCols(*tmpInput, -1, 0);
     }
 
     latentVectors_->getWGrad()->addRowScale(
-        0, *latentVectors_->getW(), *tmpSum_T);
+        0, *latentVectors_->getW(), *tmpSumTrans);
 
     /* Increasing the number of gradient */
     latentVectors_->getParameterPtr()->incUpdate(callback);
@@ -129,10 +133,10 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
   /* Calculate the input layers gradient */
   MatrixPtr inGrad = getInputGrad(0);
   if (inGrad != NULL) {
-    MatrixPtr latentVectors_T = latentVectors_->getW()->getTranspose();
-    inGrad->mul(*tmpMul_, *latentVectors_T, 1, 1);
-    tmpSum_T->sumRows(*v2_, -1, 0);
-    inGrad->addColScale(0, *inputV, *tmpSum);
+    inGrad->mul(
+        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum_);
     inGrad->rowScale(0, *inGrad, *oGrad);
   }
 }
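For reference, the rewritten forward pass computes the pairwise interaction term through the O(kn) reformulation from Rendle's paper (Lemma 3.1), which is what makes a single matrix product plus two sumRows calls sufficient:

\f[
y = \frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i=1}^{n}v_{i,f}x_i\right)^2 - \sum_{i=1}^{n}v_{i,f}^{2}x_{i}^{2}\right]
\f]

Here inputMulFactor_ caches the inner sum X * V, inputSquare_ and latentVectorsSquare_ cache the element-wise squares of X and V, and the two sumRows calls accumulate the bracketed terms with coefficients 0.5 and -0.5 respectively.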
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
index 7cf064690ff8dd85e6c064a98a1f441036e9d38a..85d40fdb1ee39d9e9c644da03a0a584e2342963c 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -34,27 +34,36 @@ namespace paddle {
  * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
  * \f]
  *
+ * The detailed calculation for forward and backward can be found in this paper:
+ *
+ * Rendle, Steffen. Factorization machines. IEEE 10th International
+ * Conference on Data Mining (ICDM). IEEE, 2010.
+ *
  * The config file api is factorization_machine.
  */
 
 class FactorizationMachineLayer : public Layer {
 protected:
-  /// The latent vectors, shape: (size, factorSize_)
-  /// Each row of the latentVectors_ matrix is the latent vector
-  /// corresponding to one input feature dimension
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
   std::unique_ptr<Weight> latentVectors_;
-  /// The hyperparameter that defines the dimensionality of the factorization
+  // The hyperparameter that defines the dimensionality of the factorization
   size_t factorSize_;
 
private:
-  /// The result of input matrix * letent vector matrix that will be used in
-  /// both forward and backward step
-  MatrixPtr tmpMul_;
+  // Store the square values of the latent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of the input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both forward and backward step
+  MatrixPtr inputMulFactor_;
+  // Temporary buffer for intermediate calculation results
   MatrixPtr tmpOut_;
-  /// Store the square values of the letent vectors matrix
-  MatrixPtr v2_;
-  /// Store the square values of input matrix
-  MatrixPtr x2_;
+  MatrixPtr tmpSum_;
+  // A row vector filled with -1, used to sum over sparse rows
+  MatrixPtr negOnes_;
 
 public:
   explicit FactorizationMachineLayer(const LayerConfig& config)
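The temporary buffers promoted to members above (tmpSum_, negOnes_) serve the backward pass, which evaluates the standard Factorization Machine derivatives; per sample these are:

\f[
\frac{\partial y}{\partial v_{i,f}} = x_i\sum_{j=1}^{n}v_{j,f}x_j - v_{i,f}x_i^2
\qquad
\frac{\partial y}{\partial x_i} = \sum_{f=1}^{k}v_{i,f}\sum_{j=1}^{n}v_{j,f}x_j - x_i\sum_{f=1}^{k}v_{i,f}^{2}
\f]

In the batched code the first terms reuse inputMulFactor_, tmpSum_ holds the per-feature sums entering the second terms, and negOnes_ is the -1 row vector that sums the gradient-scaled sparse rows (the sparse counterpart of the dense sumCols call).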
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 072d75c23d64d5b5b56befa62164bb501588e8e0..04ff618c214274cffbd554f1cd029bb91e29892f 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2442,6 +2442,7 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) {
 
 TEST(Layer, FactorizationMachineLayer) {
   for (auto useGpu : {false, true}) {
     testFactorizationMachineLayer(INPUT_DATA, useGpu);
+    testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, useGpu);
   }
 }
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index e211c23a7e670034af31a38883555bdcabf8b158..6a432cd16b727318a9cd2550632cd1ea2e90c66d 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -262,15 +262,15 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
 
 void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
   CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK(height_ == b.getHeight());
-  CHECK(width_ == b.getWidth());
+  CHECK_EQ(height_, b.getHeight());
+  CHECK_EQ(width_, b.getWidth());
   real* A = getValue();
   real* B = b.getValue();
   for (size_t i = 0; i < height_; i++) {
     size_t start = getRowStartIdx(i);
     size_t end = getRowStartIdx(i + 1);
-    CHECK(start == b.getRowStartIdx(i));
-    CHECK(end == b.getRowStartIdx(i + 1));
+    CHECK_EQ(start, b.getRowStartIdx(i));
+    CHECK_EQ(end, b.getRowStartIdx(i + 1));
     for (size_t j = start; j < end; j++) {
       A[j] = B[j] * c.getElement(i, cCol);
     }
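To make the rowScale contract concrete, here is a minimal self-contained sketch of the same CSR row-scaling operation; the CsrMatrix type is an illustrative stand-in, not Paddle's CpuSparseMatrix API. Both operands must share a single sparsity pattern, which is exactly what the strengthened CHECK_EQ guards verify row by row.

#include <cstddef>
#include <vector>

// Illustrative CSR storage (a stand-in for CpuSparseMatrix).
struct CsrMatrix {
  std::vector<std::size_t> rowStart;  // size = height + 1
  std::vector<float> values;          // one entry per nonzero
};

// a[i][j] = b[i][j] * scale[i] over the shared nonzero pattern, the same
// operation as CpuSparseMatrix::rowScale with column cCol of c acting as
// the per-row scale.
void rowScale(CsrMatrix& a, const CsrMatrix& b,
              const std::vector<float>& scale) {
  for (std::size_t i = 0; i + 1 < a.rowStart.size(); ++i) {
    for (std::size_t j = a.rowStart[i]; j < a.rowStart[i + 1]; ++j) {
      a.values[j] = b.values[j] * scale[i];
    }
  }
}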
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 30e334e7c8aa5608ff1bd1ca0345e80f5d8139f1..7e38383bd623c1cbf609f14607a50cdb785c2033 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -7161,16 +7161,26 @@ def factorization_machine(input,
     The Factorization Machine models pairwise feature interactions as inner
     product of the learned latent vectors corresponding to each input feature.
     The Factorization Machine can effectively capture feature interactions
-    especially when the input is sparse. In practice, usually order 2 feature
-    interactions are considered using Factorization Machine with the formula:
+    especially when the input is sparse.
+
+    This implementation only considers the order-2 feature interactions using
+    the Factorization Machine with the formula:
+
     .. math::
        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+
     Note:
         X is the input vector with size n. V is the factor matrix. Each row of V
         is the latent vector corresponding to each input dimesion. The size of
         each latent vector is k.
+
+    For details of Factorization Machine, please refer to the paper:
+    Rendle, Steffen. Factorization machines. IEEE 10th International
+    Conference on Data Mining (ICDM). IEEE, 2010.
+
     .. code-block:: python
        factor_machine = factorization_machine(input=input_layer, factor_size=10)
+
     :param input: The input layer.
     :type input: LayerOutput
     :param factor_size: The hyperparameter that defines the dimensionality of
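As a quick dimensional check on the Python API: for an input layer of size n and factor_size k, the layer learns the factor matrix V of shape (n, k), so its single parameter block holds n * k values (the CHECK_EQ in init() above enforces this) and each sample yields one scalar interaction score. For example, an input of size 1024 with factor_size=10 gives 10240 learnable parameters.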