提交 2df8eec5 编写于 作者: X xutianbing

Pass Unit test for GpuMatrix::mul(GpuMatrix, GpuMatrix) and CpuMatrix::mul(CpuMatrix, CpuMatrix)

上级 1f0cbcf3
......@@ -32,16 +32,14 @@ const SparseMatrixArg& BufferArg::sparse() const {
SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
: BufferArg(sparse, argType),
row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
trans_(const_cast<CpuSparseMatrix&>(sparse).getTranspose()) {
col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
bufferType_ = TENSOR_SPARSE;
}
SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
: BufferArg(sparse, argType),
row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
trans_(const_cast<GpuSparseMatrix&>(sparse).getTranspose()) {
col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
bufferType_ = TENSOR_SPARSE;
}
......
......@@ -98,7 +98,8 @@ public:
const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
valueType_(DataType<real>::value),
shape_(2),
argType_(argType) {
argType_(argType),
trans_(matrix.isTransposed()) {
bufferType_ = TENSOR_NORMAL;
shape_.setDim(0, matrix.getHeight());
shape_.setDim(1, matrix.getWidth());
......@@ -111,7 +112,8 @@ public:
const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
valueType_(DataType<real>::value),
shape_(shape),
argType_(argType) {
argType_(argType),
trans_(matrix.isTransposed()) {
bufferType_ = TENSOR_NORMAL;
CHECK_EQ(matrix.getElementCnt(), shape.getElements());
}
......@@ -143,7 +145,7 @@ public:
// CHECK(deviceType_ == DType);
CHECK_EQ((size_t)2, shape_.ndims());
return typename Tensor<real, DType>::Matrix(
reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
reinterpret_cast<real*>(buf_), shape_[0], shape_[1], trans_);
}
template <typename VType, DeviceType DType>
......@@ -179,6 +181,7 @@ protected:
TensorShape shape_;
BufferType bufferType_{TENSOR_UNKNOWN};
ArgType argType_{UNSPECIFIED};
bool trans_{false};
// leading dimensions. The size is dims_.size()
// Dims lds_;
};
......@@ -271,15 +274,13 @@ public:
size_t nnz,
SparseDataFormat format,
SparseDataType type,
bool trans = false,
ArgType argType = UNSPECIFIED)
: BufferArg(buf, valueType, shape, argType),
row_(row),
col_(col),
nnz_(nnz),
format_(format),
type_(type),
trans_(trans) {
type_(type) {
bufferType_ = TENSOR_SPARSE;
CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
CHECK_EQ(shape_.ndims(), (size_t)2);
......@@ -322,8 +323,6 @@ public:
size_t nnz() const { return nnz_; }
bool isTranspose() const { return trans_; }
SparseDataFormat dataFormat() const { return format_; }
SparseDataType dataType() const { return type_; }
......@@ -334,8 +333,6 @@ private:
size_t nnz_;
SparseDataFormat format_;
SparseDataType type_;
/// todo(tianbing), move trans_ up to BufferArg
bool trans_;
};
} // namespace paddle
......@@ -483,8 +483,8 @@ template <DeviceType Device>
class MulFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
scaleAB_ = config.get<real>("scaleAB");
scaleT_ = config.get<real>("scaleT");
alpha_ = config.get<real>("scaleAB");
beta_ = config.get<real>("scaleT");
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
......@@ -494,7 +494,7 @@ public:
CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
auto in1_mat = inputs[0].matrix<Device>();
if (inputs[0].isSparseArg()) {
......@@ -505,12 +505,12 @@ public:
in2_mat = inputs[1].sparse().SparseMatrix<Device>();
}
auto out_mat = outputs[0].matrix<Device>();
MulOp<Device>(out_mat, in1_mat, in2_mat, scaleAB_, scaleT_);
MulOp<Device>(out_mat, in1_mat, in2_mat, alpha_, beta_);
}
private:
real scaleAB_;
real scaleT_;
real alpha_;
real beta_;
};
REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
......
......@@ -68,4 +68,11 @@ void MulOp(GpuMatrix& out,
real scaleAB,
real scaleT);
template <DeviceType DType>
void MulOp(GpuSparseMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT);
} // namespace paddle
......@@ -170,4 +170,13 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
}
}
/// GPU specialization of MulOp for a sparse output matrix.
/// Presumably intended to compute
///   out = scale_t * out + scale_ab * (a * b)
/// by analogy with the dense GpuMatrix overload's parameter naming
/// (scale_ab scales the product, scale_t scales the existing output) —
/// TODO confirm against the dense implementation once this is written.
///
/// @param out      sparse result matrix on the GPU (written in place)
/// @param a        dense left operand (GPU)
/// @param b        dense right operand (GPU)
/// @param scale_ab scale factor applied to the product a * b
/// @param scale_t  scale factor applied to the prior contents of out
///
/// NOTE(review): currently a stub — the body is intentionally empty, so
/// calling this silently leaves `out` unchanged.
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scale_ab,
real scale_t) {
/// TODO(tianbing): implement sparse-output GPU matrix multiplication
/// (e.g. via cuSPARSE SpMM / hl_sparse helpers).
}
} // namespace paddle
......@@ -16,50 +16,79 @@ limitations under the License. */
#include "FunctionTest.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h"
#include "paddle/math/tests/test_matrixUtil.h"
#include "paddle/testing/TestUtil.h"
using namespace paddle; // NOLINT
void testSpMatrixMul(int M, int N, int K, real rate, real scale1, real scale2) {
/// todo(tianbing) check CPU/GPU
/**
* C = alpha * C + beta * (A * B)
*/
void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
real alpha = 1.5;
real beta = 2.0;
const auto cpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-CPU");
cpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU");
gpuFunc->init(FuncConfig().set("scaleAB", scale1).set("scaleT", scale2));
gpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
int nnz = M * N * rate;
MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
int heightA = (transa == false) ? dimM : dimK;
int widthA = (transa == false) ? dimK : dimM;
int heightB = (transb == false) ? dimK : dimN;
int widthB = (transb == false) ? dimN : dimK;
int heightC = dimM;
int widthC = dimN;
MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
auto cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
auto cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
auto cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
auto gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
auto gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
auto gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
cpuC->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
gpuC->copyFrom(*cpuC);
hl_stream_t stream(HPPL_STREAM_3);
gpuA->copyFrom(*cpuA, stream);
gpuB->copyFrom(*cpuB, stream);
gpuC->copyFrom(*cpuC, stream);
hl_stream_synchronize(stream);
BufferArgs cpuInputs;
BufferArgs cpuOutputs;
cpuInputs.addArg(*cpuA);
cpuInputs.addArg(*cpuB);
cpuOutputs.addArg(*cpuC, ADD_TO);
cpuFunc->calc(cpuInputs, cpuOutputs);
BufferArgs inputs;
BufferArgs outputs;
inputs.addArg(*gpuA->getTranspose());
inputs.addArg(*gpuB->getTranspose());
outputs.addArg(*gpuC, ASSIGN_TO);
BufferArgs gpuInputs;
BufferArgs gpuOutputs;
gpuInputs.addArg(*gpuA);
gpuInputs.addArg(*gpuB);
gpuOutputs.addArg(*gpuC, ADD_TO);
gpuFunc->calc(gpuInputs, gpuOutputs);
gpuFunc->calc(inputs, outputs);
autotest::TensorCheckErr(*cpuC, *gpuC);
}
TEST(SMatrix, sMatrixMul) {
for (auto M : {1, 40, 128, 200}) {
for (auto N : {100}) {
for (auto K : {100}) {
/// todo(tianbing), add scaleAB and scaleT
VLOG(3) << " M=" << M << " N=" << N << " K=" << K;
testSpMatrixMul(M, N, K, 0.05, 1, 1);
TEST(Matrix, mul) {
for (auto transa : {false, true}) {
for (auto transb : {false, true}) {
for (auto dimM : {1, 10, 100}) {
for (auto dimN : {1, 10}) {
for (auto dimK : {8}) {
if (true == transa && true == transb) {
continue;
}
VLOG(3) << setiosflags(std::ios::left) << std::setfill(' ')
<< " transa=" << transa << " transb=" << transb
<< " dimM=" << std::setw(5) << dimM
<< " dimN=" << std::setw(5) << dimN
<< " dimK=" << std::setw(5) << dimK;
testMatrixMul(transa, transb, dimM, dimN, dimK);
}
}
}
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册