s920243400 / PaddleDetection (forked from PaddlePaddle / PaddleDetection)
Commit 1ca2846e
Authored Jan 17, 2017 by xutianbing

Pass unit test for CpuMatrix::mul(CpuMatrix, CpuSparseMatrix)
and GpuMatrix::mul(CpuMatrix, GpuSparseMatrix)

Parent: 2df8eec5
Showing 5 changed files with 198 additions and 27 deletions (+198 -27)

paddle/function/BufferArg.cpp   +10 -2
paddle/function/BufferArg.h     +10 -17
paddle/function/MulOp.cpp       +42 -8
paddle/function/MulOp.h         +2 -0
paddle/function/MulOpTest.cpp   +134 -0
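The new unit test exercises C += A * B with A and C dense and B sparse in CSC format. A minimal standalone sketch of that computation, independent of Paddle's Matrix classes (the function and variable names below are illustrative, not from this commit):

#include <cassert>
#include <cstdio>
#include <vector>

// Dense C (m x n) += dense A (m x k) times sparse B (k x n) stored in CSC:
// for column j of B, positions colStart[j]..colStart[j+1] index the
// (rowIdx, value) pairs of that column's nonzero entries.
void denseTimesCscAddTo(int m, int k, int n,
                        const std::vector<float>& A,        // row-major m*k
                        const std::vector<int>& colStart,   // size n+1
                        const std::vector<int>& rowIdx,     // size nnz
                        const std::vector<float>& value,    // size nnz
                        std::vector<float>& C) {            // row-major m*n
  assert((int)colStart.size() == n + 1);
  for (int j = 0; j < n; ++j) {
    for (int p = colStart[j]; p < colStart[j + 1]; ++p) {
      int r = rowIdx[p];   // B(r, j) = value[p]
      float b = value[p];
      for (int i = 0; i < m; ++i) {
        C[i * n + j] += A[i * k + r] * b;  // C(i, j) += A(i, r) * B(r, j)
      }
    }
  }
}

int main() {
  // 2x3 dense A, 3x2 sparse B with 3 nonzeros, 2x2 dense C starting at zero.
  std::vector<float> A = {1, 2, 3, 4, 5, 6};
  std::vector<int> colStart = {0, 2, 3};
  std::vector<int> rowIdx = {0, 2, 1};
  std::vector<float> value = {10, 20, 30};
  std::vector<float> C(4, 0.f);
  denseTimesCscAddTo(2, 3, 2, A, colStart, rowIdx, value, C);
  for (float c : C) std::printf("%g ", c);  // prints: 70 60 160 150
  return 0;
}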
paddle/function/BufferArg.cpp

@@ -32,14 +32,22 @@ const SparseMatrixArg& BufferArg::sparse() const {

 SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
+      /// todo(tianbing), make sure how to get NNZ
+      nnz_(sparse.getElementCnt()),
+      format_(sparse.getFormat()),
+      type_(sparse.getValueType()) {
   bufferType_ = TENSOR_SPARSE;
 }

 SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
+      /// todo(tianbing), make sure how to get NNZ
+      nnz_(sparse.getElementCnt()),
+      format_(sparse.getFormat()),
+      type_(sparse.getValueType()) {
   bufferType_ = TENSOR_SPARSE;
 }
paddle/function/BufferArg.h

@@ -30,13 +30,6 @@ enum BufferType {
   TENSOR_SPARSE = 4
 };

-enum SparseDataType {
-  SPARSE_NO_VALUE = 0,  // do not need value pointer, all values are 1
-  SPARSE_FLOAT_VALUE = 1
-};
-
-enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
-
 class BufferArg;
 class SequenceArg;
 class SparseMatrixArg;

@@ -272,8 +265,8 @@ public:
                   const BufferArg& row,
                   const BufferArg& col,
                   size_t nnz,
-                  SparseDataFormat format,
-                  SparseDataType type,
+                  SparseFormat format,
+                  SparseValueType type,
                   ArgType argType = UNSPECIFIED)
       : BufferArg(buf, valueType, shape, argType),
         row_(row),

@@ -286,9 +279,9 @@ public:
     CHECK_EQ(shape_.ndims(), (size_t)2);
     CHECK_EQ(row_.shape().ndims(), (size_t)1);
     CHECK_EQ(col_.shape().ndims(), (size_t)1);
-    if (format == SPARSE_CSR_FORMAT) {
+    if (format == SPARSE_CSR) {
       CHECK_EQ(nnz, col.shape()[0]);
-    } else if (format == SPARSE_CSC_FORMAT) {
+    } else if (format == SPARSE_CSC) {
       CHECK_EQ(nnz, row.shape()[0]);
     }
   }

@@ -310,8 +303,8 @@ public:
                             shape_[0],
                             shape_[1],
                             nnz_,
-                            static_cast<SparseValueType>(type_),
-                            static_cast<SparseFormat>(format_),
+                            type_,
+                            format_,
                             trans_);
   }

@@ -323,16 +316,16 @@ public:
   size_t nnz() const { return nnz_; }

-  SparseDataFormat dataFormat() const { return format_; }
+  SparseFormat dataFormat() const { return format_; }

-  SparseDataType dataType() const { return type_; }
+  SparseValueType dataType() const { return type_; }

 private:
   BufferArg row_;
   BufferArg col_;
   size_t nnz_;
-  SparseDataFormat format_;
-  SparseDataType type_;
+  SparseFormat format_;
+  SparseValueType type_;
 };

 }  // namespace paddle
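The renamed types tie SparseMatrixArg to the SparseFormat/SparseValueType enums already used by paddle/math, and the constructor checks encode the usual CSR/CSC invariant: in CSR the column-index buffer has one entry per nonzero, in CSC the row-index buffer does. A standalone sketch of that invariant check (the enum and helper here are local stand-ins, not Paddle's):

#include <cassert>
#include <cstddef>
#include <vector>

enum LocalSparseFormat { LOCAL_CSR = 0, LOCAL_CSC = 1 };

// Mirrors the CHECK_EQ logic in the SparseMatrixArg constructor: the index
// array that runs per-nonzero must have exactly nnz entries.
void checkSparseShapes(LocalSparseFormat format,
                       size_t nnz,
                       const std::vector<int>& row,
                       const std::vector<int>& col) {
  if (format == LOCAL_CSR) {
    assert(nnz == col.size());   // CSR: one column index per nonzero
  } else if (format == LOCAL_CSC) {
    assert(nnz == row.size());   // CSC: one row index per nonzero
  }
}

int main() {
  // 3x2 CSC matrix with 3 nonzeros: row index per nonzero, column offsets.
  std::vector<int> rowIdx = {0, 2, 1};     // size == nnz
  std::vector<int> colOffset = {0, 2, 3};  // size == number of columns + 1
  checkSparseShapes(LOCAL_CSC, 3, rowIdx, colOffset);
  return 0;
}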
paddle/function/MulOp.cpp

@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "MulOp.h"
+/// todo(tianbing), delete it
+#include <iostream>
 #include "paddle/math/MathFunctions.h"
 #include "paddle/math/SIMDFunctions.h"
 #include "paddle/utils/ThreadLocal.h"

@@ -496,16 +498,48 @@ public:
     CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
     CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    auto in1_mat = inputs[0].matrix<Device>();
-    if (inputs[0].isSparseArg()) {
-      in1_mat = inputs[0].sparse().SparseMatrix<Device>();
-    }
-    auto in2_mat = inputs[1].matrix<Device>();
-    if (inputs[1].isSparseArg()) {
-      in2_mat = inputs[1].sparse().SparseMatrix<Device>();
-    }
-    auto out_mat = outputs[0].matrix<Device>();
-    MulOp<Device>(out_mat, in1_mat, in2_mat, alpha_, beta_);
+    /// todo(tianbing), support SparseMatrixArg for out_mat
+    auto out_mat = outputs[0].matrix<Device>();
+    LOG(INFO) << "out_mat:";
+    out_mat.print(std::cout);
+
+    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg()) {
+      LOG(INFO) << "in1_mat:";
+      inputs[0].matrix<Device>().print(std::cout);
+      LOG(INFO) << "in2_mat:";
+      inputs[1].matrix<Device>().print(std::cout);
+      MulOp<Device>(out_mat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    alpha_,
+                    beta_);
+      return;
+    }
+
+    if (!inputs[0].isSparseArg() && inputs[1].isSparseArg()) {
+      LOG(INFO) << "in1_mat:";
+      inputs[0].matrix<Device>().print(std::cout);
+      LOG(INFO) << "in2_mat:";
+      inputs[1].sparse().SparseMatrix<Device>().print(std::cout);
+      MulOp<Device>(out_mat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].sparse().SparseMatrix<Device>(),
+                    alpha_,
+                    beta_);
+      return;
+    }
+
+    if (inputs[0].isSparseArg() && !inputs[1].isSparseArg()) {
+      LOG(INFO) << "in1_mat:";
+      inputs[0].sparse().SparseMatrix<Device>().print(std::cout);
+      LOG(INFO) << "in2_mat:";
+      inputs[1].matrix<Device>().print(std::cout);
+      MulOp<Device>(out_mat,
+                    inputs[0].sparse().SparseMatrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    alpha_,
+                    beta_);
+      return;
+    }
   }

 private:
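The rewritten calc body dispatches on which input is sparse and forwards to the matching MulOp overload, returning after each case. A minimal standalone sketch of that dispatch shape (the types and overloads are placeholders, not Paddle's):

#include <cstdio>

struct Dense {};
struct Sparse {};

// Placeholder overloads standing in for MulOp<Device>(out, a, b, alpha, beta).
void mul(Dense& /*out*/, const Dense&, const Dense&)  { std::puts("dense * dense"); }
void mul(Dense& /*out*/, const Dense&, const Sparse&) { std::puts("dense * sparse"); }
void mul(Dense& /*out*/, const Sparse&, const Dense&) { std::puts("sparse * dense"); }

// Shape of the dispatch in the new calc(): pick the overload from the
// sparsity of each input, handling each combination explicitly.
void calc(bool in0Sparse, bool in1Sparse) {
  Dense out, d;
  Sparse s;
  if (!in0Sparse && !in1Sparse) { mul(out, d, d); return; }
  if (!in0Sparse && in1Sparse)  { mul(out, d, s); return; }
  if (in0Sparse && !in1Sparse)  { mul(out, s, d); return; }
  // sparse * sparse is not handled by this commit
}

int main() {
  calc(false, true);  // the combination the new unit test covers
  return 0;
}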
...
paddle/function/MulOp.h
浏览文件 @
1ca2846e
...
@@ -15,6 +15,8 @@ limitations under the License. */
...
@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#pragma once
#include "Function.h"
#include "Function.h"
/// todo(tianbing), delete
#include <iostream>
#include "paddle/math/Matrix.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h"
#include "paddle/math/SparseMatrix.h"
...
...
paddle/function/MulOpTest.cpp

@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <gtest/gtest.h>
+/// todo(tianbing), delete
+#include <iostream>
 #include "FunctionTest.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"

@@ -72,6 +74,7 @@ void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
 }

 TEST(Matrix, mul) {
+  LOG(INFO) << "test for dense = dense * dense matrix";
   for (auto transa : {false, true}) {
     for (auto transb : {false, true}) {
       for (auto dimM : {1, 10, 100}) {

@@ -93,3 +96,134 @@ TEST(Matrix, mul) {
     }
   }
 }
+
+struct MatrixPara {
+  size_t height;
+  size_t width;
+  bool trans;
+  bool sparse;
+  size_t nnz;
+  SparseFormat format;
+};
+
+/**
+ * C += A * B, A, C dense, B sparse
+ */
+void testDSparseDMatrix() {
+  real alpha = 1.0;
+  real beta = 1.0;
+  const auto cpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-CPU");
+  cpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
+  const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU");
+  gpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
+
+  constexpr size_t dimM = 2;
+  constexpr size_t dimN = 2;
+  constexpr size_t dimK = 3;
+  constexpr size_t NNZ = 3;
+  constexpr SparseFormat FORMAT = SPARSE_CSC;
+
+  MatrixPara paraA{dimM, dimK, /*trans*/ false, /*sparse*/ false, NNZ, FORMAT};
+  MatrixPara paraB{dimK, dimN, /*trans*/ false, /*sparse*/ true, NNZ, FORMAT};
+  MatrixPara paraC{dimM, dimN, /*trans*/ false, /*sparse*/ false, NNZ, FORMAT};
+
+  auto cpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, false);
+  auto gpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, true);
+  auto cpuDenseA = Matrix::create(paraA.height, paraA.width, paraA.trans, false);
+
+  CpuSparseMatrix cpuMatrixB(
+      paraB.height, paraB.width, paraB.nnz, FLOAT_VALUE, paraB.format, paraB.trans);
+  GpuSparseMatrix gpuMatrixB(
+      paraB.height, paraB.width, paraB.nnz, FLOAT_VALUE, paraB.format, paraB.trans);
+  auto cpuDenseB = Matrix::create(paraB.height, paraB.width, paraB.trans, false);
+
+  auto cpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
+  auto gpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, true);
+  auto cpuDenseC = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
+  auto gpuMatrixC_d2h = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
+
+  /*matrix init*/
+  hl_stream_t stream(HPPL_STREAM_1);
+  cpuMatrixA->randomizeUniform();
+  cpuMatrixB.randomizeUniform();
+  cpuMatrixC->randomizeUniform();
+
+  gpuMatrixA->copyFrom(*cpuMatrixA, stream);
+  gpuMatrixB.copyFrom(cpuMatrixB, stream);
+  gpuMatrixC->copyFrom(*cpuMatrixC, stream);
+
+  cpuDenseA->copyFrom(*cpuMatrixA);
+  cpuDenseB->copyFrom(cpuMatrixB);
+  cpuDenseC->copyFrom(*cpuMatrixC);
+  hl_stream_synchronize(stream);
+
+  LOG(INFO) << "cpuMatrixA: ";
+  cpuMatrixA->print(std::cout);
+  LOG(INFO) << "cpuMatrixB: ";
+  (&cpuMatrixB)->print(std::cout);
+  LOG(INFO) << "cpuMatrixC: ";
+  cpuMatrixC->print(std::cout);
+  LOG(INFO) << "cpuDenseA: ";
+  cpuDenseA->print(std::cout);
+  LOG(INFO) << "cpuDenseB: ";
+  cpuDenseB->print(std::cout);
+  LOG(INFO) << "cpuDenseC: ";
+  cpuDenseC->print(std::cout);
+  LOG(INFO) << "gpuMatrixA: ";
+  gpuMatrixA->print(std::cout);
+  LOG(INFO) << "gpuMatrixB: ";
+  (&gpuMatrixB)->print(std::cout);
+  LOG(INFO) << "gpuMatrixC: ";
+  gpuMatrixC->print(std::cout);
+
+  /*matrix mul*/
+  BufferArgs cpuInputs;
+  BufferArgs cpuOutputs;
+  cpuInputs.addArg(*cpuMatrixA);
+  cpuInputs.addArg(cpuMatrixB);
+  cpuOutputs.addArg(*cpuMatrixC, ADD_TO);
+  cpuFunc->calc(cpuInputs, cpuOutputs);
+
+  BufferArgs gpuInputs;
+  BufferArgs gpuOutputs;
+  gpuInputs.addArg(*gpuMatrixA);
+  gpuInputs.addArg(gpuMatrixB);
+  gpuOutputs.addArg(*gpuMatrixC, ADD_TO);
+  gpuFunc->calc(gpuInputs, gpuOutputs);
+
+  BufferArgs denseInputs;
+  BufferArgs denseOutputs;
+  denseInputs.addArg(*cpuDenseA);
+  denseInputs.addArg(*cpuDenseB);
+  denseOutputs.addArg(*cpuDenseC, ADD_TO);
+  cpuFunc->calc(denseInputs, denseOutputs);
+
+  gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream);
+  hl_stream_synchronize(stream);
+
+  /*check result*/
+  // autotest::TensorCheckErr(*cpuMatrixC, *gpuMatrixC);
+  checkMatrixEqual(cpuMatrixC, cpuDenseC);
+  checkMatrixEqual(cpuMatrixC, gpuMatrixC_d2h);
+}
+
+TEST(Matrix, SparseMatrixMul) {
+  LOG(INFO) << "test for dense = dense * sparse matrix";
+  testDSparseDMatrix();
+}
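The test cross-checks three paths for the same C += A * B: the CPU function with sparse B, the GPU function with sparse B (copied back to host), and an all-dense CPU reference, requiring all results to agree via checkMatrixEqual. A small illustrative sketch of that kind of element-wise comparison (the tolerance and helper name are assumptions, not taken from checkMatrixEqual):

#include <cassert>
#include <cmath>
#include <vector>

// Illustrative element-wise comparison of two result buffers, in the spirit
// of checkMatrixEqual: the reference comes from the all-dense path.
bool nearlyEqual(const std::vector<float>& a,
                 const std::vector<float>& b,
                 float eps = 1e-5f) {
  if (a.size() != b.size()) return false;
  for (size_t i = 0; i < a.size(); ++i) {
    if (std::fabs(a[i] - b[i]) > eps) return false;
  }
  return true;
}

int main() {
  std::vector<float> denseReference = {70, 60, 160, 150};  // all-dense path
  std::vector<float> sparsePath     = {70, 60, 160, 150};  // sparse-B path
  assert(nearlyEqual(denseReference, sparsePath));
  return 0;
}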