diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp index 85f7f535dcfa4262f4a4ef62213deaf23858d5cc..37f8808605e10c7c0e6f88f6fec7b5f20697fbaf 100644 --- a/paddle/function/MulOp.cpp +++ b/paddle/function/MulOp.cpp @@ -498,15 +498,10 @@ public: CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); CHECK_EQ(outputs[0].getArgType(), ADD_TO); - /// todo(tianbing), support SparseMatrixArg for out_mat auto out_mat = outputs[0].matrix(); - LOG(INFO) << "out_mat:"; - out_mat.print(std::cout); - if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg()) { - LOG(INFO) << "in1_mat:"; - inputs[0].matrix().print(std::cout); - LOG(INFO) << "in2_mat:"; - inputs[1].matrix().print(std::cout); + /// matrix = matrix * matrix + if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { MulOp(out_mat, inputs[0].matrix(), inputs[1].matrix(), @@ -515,11 +510,9 @@ public: return; } - if (!inputs[0].isSparseArg() && inputs[1].isSparseArg()) { - LOG(INFO) << "in1_mat:"; - inputs[0].matrix().print(std::cout); - LOG(INFO) << "in2_mat:"; - inputs[1].sparse().SparseMatrix().print(std::cout); + /// matrix = matrix * sparse matrix + if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { MulOp(out_mat, inputs[0].matrix(), inputs[1].sparse().SparseMatrix(), @@ -528,11 +521,9 @@ public: return; } - if (inputs[0].isSparseArg() && !inputs[1].isSparseArg()) { - LOG(INFO) << "in1_mat:"; - inputs[0].sparse().SparseMatrix().print(std::cout); - LOG(INFO) << "in2_mat:"; - inputs[1].matrix().print(std::cout); + /// matrix = sparse matrix * matrix + if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { MulOp(out_mat, inputs[0].sparse().SparseMatrix(), inputs[1].matrix(), @@ -540,6 +531,18 @@ public: beta_); return; } + + /// sparse matrix = matrix * matrix + auto out_sparse_mat = outputs[0].sparse().SparseMatrix(); + if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + 
outputs[0].isSparseArg()) { + MulOp(out_sparse_mat, + inputs[0].matrix(), + inputs[1].matrix(), + alpha_, + beta_); + return; + } } private: diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu index 3691c7f3206126e5d40fec915edc5bad80487b14..3c4654b9b27574fd146d27e66164254f5e40da7d 100644 --- a/paddle/function/MulOpGpu.cu +++ b/paddle/function/MulOpGpu.cu @@ -176,7 +176,36 @@ void MulOp(GpuSparseMatrix& out, const GpuMatrix& b, real scale_ab, real scale_t) { -/// todo(tianbing), implement it + /// todo(tianbing), clean the code + CHECK(a.useGpu_ && b.useGpu_) << "type not match"; + CHECK(!out.trans_) << "trans not supported"; + real* a_data = const_cast<real*>(a.getData()); + real* b_data = const_cast<real*>(b.getData()); + hl_sparse_matrix_s out_data = out.sMatrix_.get(); + hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N; + hl_trans_op_t b_trans = b.trans_ ? HPPL_OP_T : HPPL_OP_N; + + if (!a.trans_ && !b.trans_) { + CHECK(out.height_ == a.getHeight()); + CHECK(out.width_ == b.getWidth()); + CHECK(a.getWidth() == b.getHeight()); + } else if (a.trans_ && !b.trans_) { + CHECK(out.height_ == a.getWidth()); + CHECK(out.width_ == b.getWidth()); + CHECK(a.getHeight() == b.getHeight()); + } else if (!a.trans_ && b.trans_) { + CHECK(out.height_ == a.getHeight()); + CHECK(out.width_ == b.getHeight()); + CHECK(a.getWidth() == b.getWidth()); + } else { + LOG(INFO) << "Not support"; + } + int dim_m = out.height_; + int dim_n = out.width_; + int dim_k = !b.trans_ ? b.getHeight() : b.getWidth(); + hl_sparse_matrix_mul( + a_data, a_trans, b_data, b_trans, out_data, + dim_m, dim_n, dim_k, scale_ab, scale_t); } } // namespace paddle diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp index fd02504678efd62912788a78b017ffb6d6530f78..630070b845a9af7aff734ea3e8ff9a7cf62fd7d3 100644 --- a/paddle/function/MulOpTest.cpp +++ b/paddle/function/MulOpTest.cpp @@ -24,9 +24,10 @@ limitations under the License. 
*/ using namespace paddle; // NOLINT /** - * C = alpha * C + beta * (A * B) + * C = alpha * C + beta * (A * B), A, B, C dense matrix + * dense = dense * dense */ -void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { +void testDDDMatrix(bool transa, bool transb, int dimM, int dimN, int dimK) { real alpha = 1.5; real beta = 2.0; @@ -73,7 +74,7 @@ void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { autotest::TensorCheckErr(*cpuC, *gpuC); } -TEST(Matrix, mul) { +TEST(Matrix, DDDMul) { LOG(INFO) << "test for dense = dense * dense matrix"; for (auto transa : {false, true}) { for (auto transb : {false, true}) { @@ -89,7 +90,7 @@ TEST(Matrix, mul) { << " dimN=" << std::setw(5) << dimN << " dimK=" << std::setw(5) << dimK; - testMatrixMul(transa, transb, dimM, dimN, dimK); + testDDDMatrix(transa, transb, dimM, dimN, dimK); } } } @@ -97,19 +98,100 @@ TEST(Matrix, mul) { } } -struct MatrixPara { - size_t height; - size_t width; - bool trans; - bool sparse; - size_t nnz; - SparseFormat format; -}; +/** + * C += A * B, B, C dense, A sparse + * dense = sparse * dense + */ +void testDSparseDMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { + real alpha = 1.0; + real beta = 1.0; + const auto cpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-CPU"); + cpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); + const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU"); + gpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); + + CpuSparseMatrix cpuMatrixA(dimM, dimK, nnz, FLOAT_VALUE, FORMAT, false); + GpuSparseMatrix gpuMatrixA(dimM, dimK, nnz, FLOAT_VALUE, FORMAT, false); + CpuMatrix cpuDenseA(dimM, dimK, false); + + auto cpuMatrixB = Matrix::create(dimK, dimN, false, false); + auto gpuMatrixB = Matrix::create(dimK, dimN, false, true); + auto cpuDenseB = Matrix::create(dimK, dimN, false, false); + + auto cpuMatrixC = Matrix::create(dimM, dimN, 
false, false); + auto gpuMatrixC = Matrix::create(dimM, dimN, false, true); + auto cpuDenseC = Matrix::create(dimM, dimN, false, false); + + /*matrix init*/ + hl_stream_t stream(HPPL_STREAM_1); + cpuMatrixA.randomizeUniform(); + cpuMatrixB->randomizeUniform(); + cpuMatrixC->randomizeUniform(); + + gpuMatrixA.copyFrom(cpuMatrixA, stream); + gpuMatrixB->copyFrom(*cpuMatrixB, stream); + gpuMatrixC->copyFrom(*cpuMatrixC, stream); + + cpuDenseA.copyFrom(cpuMatrixA); + cpuDenseB->copyFrom(*cpuMatrixB); + cpuDenseC->copyFrom(*cpuMatrixC); + hl_stream_synchronize(stream); + + /*matrix mul*/ + BufferArgs cpuInputs; + BufferArgs cpuOutputs; + cpuInputs.addArg(cpuMatrixA); + cpuInputs.addArg(*cpuMatrixB); + cpuOutputs.addArg(*cpuMatrixC, ADD_TO); + cpuFunc->calc(cpuInputs, cpuOutputs); + + BufferArgs gpuInputs; + BufferArgs gpuOutputs; + gpuInputs.addArg(gpuMatrixA); + gpuInputs.addArg(*gpuMatrixB); + gpuOutputs.addArg(*gpuMatrixC, ADD_TO); + gpuFunc->calc(gpuInputs, gpuOutputs); + + BufferArgs denseInputs; + BufferArgs denseOutputs; + denseInputs.addArg(cpuDenseA); + denseInputs.addArg(*cpuDenseB); + denseOutputs.addArg(*cpuDenseC, ADD_TO); + cpuFunc->calc(denseInputs, denseOutputs); + + /*check result*/ + autotest::TensorCheckErr(*cpuMatrixC, *cpuDenseC); + autotest::TensorCheckErr(*cpuMatrixC, *gpuMatrixC); +} + +TEST(Matrix, DSparseDMul) { + LOG(INFO) << "test for dense = sparse * dense matrix"; + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto FORMAT : {SPARSE_CSR}) { + VLOG(3) << setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } +} /** * C += A * B, A, C dense, B sparse + * dense = dense * 
sparse */ -void testDSparseDMatrix() { +void testDDSparseMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real alpha = 1.0; real beta = 1.0; const auto cpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-CPU"); @@ -117,46 +199,19 @@ void testDSparseDMatrix() { const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU"); gpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); - constexpr size_t dimM = 2; - constexpr size_t dimN = 2; - constexpr size_t dimK = 3; - constexpr size_t NNZ = 3; - constexpr SparseFormat FORMAT = SPARSE_CSC; - - MatrixPara paraA{dimM, dimK, /*trans*/ false, /*sparse*/ false, NNZ, FORMAT}; - MatrixPara paraB{dimK, dimN, /*trans*/ false, /*sparse*/ true, NNZ, FORMAT}; - MatrixPara paraC{dimM, dimN, /*trans*/ false, /*sparse*/ false, NNZ, FORMAT}; - - auto cpuMatrixA = - Matrix::create(paraA.height, paraA.width, paraA.trans, false); - auto gpuMatrixA = - Matrix::create(paraA.height, paraA.width, paraA.trans, true); - auto cpuDenseA = - Matrix::create(paraA.height, paraA.width, paraA.trans, false); - CpuSparseMatrix cpuMatrixB(paraB.height, - paraB.width, - paraB.nnz, - FLOAT_VALUE, - paraB.format, - paraB.trans); - - GpuSparseMatrix gpuMatrixB(paraB.height, - paraB.width, - paraB.nnz, - FLOAT_VALUE, - paraB.format, - paraB.trans); - - auto cpuDenseB = - Matrix::create(paraB.height, paraB.width, paraB.trans, false); - auto cpuMatrixC = - Matrix::create(paraC.height, paraC.width, paraC.trans, false); - auto gpuMatrixC = - Matrix::create(paraC.height, paraC.width, paraC.trans, true); - auto cpuDenseC = - Matrix::create(paraC.height, paraC.width, paraC.trans, false); - auto gpuMatrixC_d2h = - Matrix::create(paraC.height, paraC.width, paraC.trans, false); + auto cpuMatrixA = Matrix::create(dimM, dimK, false, false); + auto gpuMatrixA = Matrix::create(dimM, dimK, false, true); + auto cpuDenseA = Matrix::create(dimM, dimK, false, false); + + CpuSparseMatrix cpuMatrixB(dimK, 
dimN, nnz, FLOAT_VALUE, FORMAT, false); + + GpuSparseMatrix gpuMatrixB(dimK, dimN, nnz, FLOAT_VALUE, FORMAT, false); + + auto cpuDenseB = Matrix::create(dimK, dimN, false, false); + auto cpuMatrixC = Matrix::create(dimM, dimN, false, false); + auto gpuMatrixC = Matrix::create(dimM, dimN, false, true); + auto cpuDenseC = Matrix::create(dimM, dimN, false, false); + /*matrix init*/ hl_stream_t stream(HPPL_STREAM_1); cpuMatrixA->randomizeUniform(); @@ -172,27 +227,6 @@ void testDSparseDMatrix() { cpuDenseC->copyFrom(*cpuMatrixC); hl_stream_synchronize(stream); - LOG(INFO) << "cpuMatrixA: "; - cpuMatrixA->print(std::cout); - LOG(INFO) << "cpuMatrixB: "; - (&cpuMatrixB)->print(std::cout); - LOG(INFO) << "cpuMatrixC: "; - cpuMatrixC->print(std::cout); - - LOG(INFO) << "cpuDenseA: "; - cpuDenseA->print(std::cout); - LOG(INFO) << "cpuDenseB: "; - cpuDenseB->print(std::cout); - LOG(INFO) << "cpuDenseC: "; - cpuDenseC->print(std::cout); - - LOG(INFO) << "gpuMatrixA: "; - gpuMatrixA->print(std::cout); - LOG(INFO) << "gpuMatrixB: "; - (&gpuMatrixB)->print(std::cout); - LOG(INFO) << "gpuMatrixC: "; - gpuMatrixC->print(std::cout); - /*matrix mul*/ BufferArgs cpuInputs; BufferArgs cpuOutputs; @@ -215,15 +249,120 @@ void testDSparseDMatrix() { denseOutputs.addArg(*cpuDenseC, ADD_TO); cpuFunc->calc(denseInputs, denseOutputs); - gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream); - hl_stream_synchronize(stream); /*check result*/ - // autotest::TensorCheckErr(*cpuMatrixC, *gpuMatrixC); - checkMatrixEqual(cpuMatrixC, cpuDenseC); - checkMatrixEqual(cpuMatrixC, gpuMatrixC_d2h); + autotest::TensorCheckErr(*cpuMatrixC, *cpuDenseC); + autotest::TensorCheckErr(*cpuMatrixC, *gpuMatrixC); } -TEST(Matrix, SparseMatrixMul) { +TEST(Matrix, DDSparseMul) { LOG(INFO) << "test for dense = dense * sparse matrix"; - testDSparseDMatrix(); + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto 
FORMAT : {SPARSE_CSR, SPARSE_CSC}) { + VLOG(3) << setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } +} + +/** + * C += A * B, C sparse, A, B dense + * sparse = dense * dense + */ +void testSparseDDMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { + real alpha = 1.0; + real beta = 1.0; + const auto cpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-CPU"); + cpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); + const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU"); + gpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); + + auto cpuMatrixA = Matrix::create(dimM, dimK, false, false); + auto gpuMatrixA = Matrix::create(dimM, dimK, false, true); + auto cpuDenseA = Matrix::create(dimM, dimK, false, false); + + auto cpuMatrixB = Matrix::create(dimK, dimN, false, false); + auto gpuMatrixB = Matrix::create(dimK, dimN, false, true); + auto cpuDenseB = Matrix::create(dimK, dimN, false, false); + + CpuSparseMatrix cpuMatrixC(dimM, dimN, nnz, FLOAT_VALUE, FORMAT, false); + CpuSparseMatrix gpuMatrixC_d2h(dimM, dimN, nnz, FLOAT_VALUE, FORMAT, false); + GpuSparseMatrix gpuMatrixC(dimM, dimN, nnz, FLOAT_VALUE, FORMAT, false); + CpuMatrix cpuDenseC(dimM, dimN, false); + + /*matrix init*/ + hl_stream_t stream(HPPL_STREAM_1); + cpuMatrixA->randomizeUniform(); + cpuMatrixB->randomizeUniform(); + cpuMatrixC.randomizeUniform(); + + gpuMatrixA->copyFrom(*cpuMatrixA, stream); + gpuMatrixB->copyFrom(*cpuMatrixB, stream); + gpuMatrixC.copyFrom(cpuMatrixC, stream); + + cpuDenseA->copyFrom(*cpuMatrixA); + cpuDenseB->copyFrom(*cpuMatrixB); + cpuDenseC.copyFrom(cpuMatrixC); + hl_stream_synchronize(stream); + + /*matrix mul*/ + BufferArgs 
cpuInputs; + BufferArgs cpuOutputs; + cpuInputs.addArg(*cpuMatrixA); + cpuInputs.addArg(*cpuMatrixB); + cpuOutputs.addArg(cpuMatrixC, ADD_TO); + cpuFunc->calc(cpuInputs, cpuOutputs); + + BufferArgs gpuInputs; + BufferArgs gpuOutputs; + gpuInputs.addArg(*gpuMatrixA); + gpuInputs.addArg(*gpuMatrixB); + gpuOutputs.addArg(gpuMatrixC, ADD_TO); + gpuFunc->calc(gpuInputs, gpuOutputs); + + BufferArgs denseInputs; + BufferArgs denseOutputs; + denseInputs.addArg(*cpuDenseA); + denseInputs.addArg(*cpuDenseB); + denseOutputs.addArg(cpuDenseC, ADD_TO); + cpuFunc->calc(denseInputs, denseOutputs); + + gpuMatrixC_d2h.copyFrom(gpuMatrixC, stream); + hl_stream_synchronize(stream); + + /*check result*/ + checkSMatrixEqual(cpuMatrixC, gpuMatrixC_d2h); + checkSMatrixEqual2Dense(cpuMatrixC, cpuDenseC); +} + +TEST(Matrix, SparseDDMul) { + LOG(INFO) << "test for sparse = dense * dense matrix"; + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) { + VLOG(3) << setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } } diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp index 720a035ecbd26df01fe24c991982bbf7965ccbdc..3bae6d373f240fcc773644386b290ef9874828ae 100644 --- a/paddle/math/SparseMatrix.cpp +++ b/paddle/math/SparseMatrix.cpp @@ -177,7 +177,6 @@ GpuSparseMatrix::GpuSparseMatrix(real* value, hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); sMatrix_ = tmp2; } - LOG(INFO) << "weight to matrix "; } } diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h index 
9aa74b15193723970d80b5d1a4e0ac95341cd45a..47f461474622d13ea2f922a77348c78b450ec37f 100644 --- a/paddle/math/tests/test_matrixUtil.h +++ b/paddle/math/tests/test_matrixUtil.h @@ -30,6 +30,17 @@ void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) { } } +void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) { + ASSERT_EQ(a.getWidth(), b.getWidth()); + ASSERT_EQ(a.getHeight(), b.getHeight()); + ASSERT_EQ(a.isTransposed(), b.isTransposed()); + ASSERT_EQ(a.getFormat(), b.getFormat()); + ASSERT_EQ(a.getElementCnt(), b.getElementCnt()); + for (size_t r = 0; r < a.getElementCnt(); ++r) { + ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]); + } +} + void checkSMatrixEqual(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) { ASSERT_EQ(a->getWidth(), b->getWidth()); @@ -73,6 +84,36 @@ void checkSMatrixEqual2(const CpuSparseMatrixPtr& a, } } +void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) { + ASSERT_EQ(a.getWidth(), b.getWidth()); + ASSERT_EQ(a.getHeight(), b.getHeight()); + ASSERT_EQ(a.isTransposed(), b.isTransposed()); + + if (a.getFormat() == SPARSE_CSC) { + int* rows = a.getRows(); + for (size_t i = 0; i < a.getWidth(); i++) { + for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) { + if (a.getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i)); + } else { + ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i)); + } + } + } + } else { + int* cols = a.getCols(); + for (size_t i = 0; i < a.getHeight(); i++) { + for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) { + if (a.getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j])); + } else { + ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j])); + } + } + } + } +} + void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a, const CpuMatrixPtr& b) { ASSERT_EQ(a->getWidth(), b->getWidth());