From 2df8eec5204506ddfb7f55e8c88e0271fbf9bf3b Mon Sep 17 00:00:00 2001
From: xutianbing
Date: Mon, 16 Jan 2017 20:30:27 -0800
Subject: [PATCH] Pass Unit test for GpuMatrix::mul(GpuMatrix, GpuMatrix) and
 CpuMatrix::mul(CpuMatrix, CpuMatrix)

---
 paddle/function/BufferArg.cpp |  6 +--
 paddle/function/BufferArg.h   | 17 +++----
 paddle/function/MulOp.cpp     | 12 ++---
 paddle/function/MulOp.h       |  7 +++
 paddle/function/MulOpGpu.cu   |  9 ++++
 paddle/function/MulOpTest.cpp | 85 +++++++++++++++++++++++------------
 6 files changed, 88 insertions(+), 48 deletions(-)

diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
index 4064daf415..5d595deb12 100644
--- a/paddle/function/BufferArg.cpp
+++ b/paddle/function/BufferArg.cpp
@@ -32,16 +32,14 @@ const SparseMatrixArg& BufferArg::sparse() const {
 SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      trans_(const_cast<CpuSparseMatrix&>(sparse).getTranspose()) {
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
   bufferType_ = TENSOR_SPARSE;
 }
 
 SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      trans_(const_cast<GpuSparseMatrix&>(sparse).getTranspose()) {
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
   bufferType_ = TENSOR_SPARSE;
 }
 
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 1f86f49911..2da1115ec9 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -98,7 +98,8 @@ public:
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(2),
-        argType_(argType) {
+        argType_(argType),
+        trans_(matrix.isTransposed()) {
     bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, matrix.getHeight());
     shape_.setDim(1, matrix.getWidth());
@@ -111,7 +112,8 @@ public:
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(shape),
-        argType_(argType) {
+        argType_(argType),
+        trans_(matrix.isTransposed()) {
     bufferType_ = TENSOR_NORMAL;
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
@@ -143,7 +145,7 @@ public:
     // CHECK(deviceType_ == DType);
     CHECK_EQ((size_t)2, shape_.ndims());
     return typename Tensor<real, DType>::Matrix(
-        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
+        reinterpret_cast<real*>(buf_), shape_[0], shape_[1], trans_);
   }
 
   template <DeviceType DType>
@@ -179,6 +181,7 @@ protected:
   TensorShape shape_;
   BufferType bufferType_{TENSOR_UNKNOWN};
   ArgType argType_{UNSPECIFIED};
+  bool trans_{false};
   // leading dimensions. The size is dims_.size()
   // Dims lds_;
 };
 
@@ -271,15 +274,13 @@ public:
                   size_t nnz,
                   SparseDataFormat format,
                   SparseDataType type,
-                  bool trans = false,
                   ArgType argType = UNSPECIFIED)
       : BufferArg(buf, valueType, shape, argType),
         row_(row),
         col_(col),
         nnz_(nnz),
         format_(format),
-        type_(type),
-        trans_(trans) {
+        type_(type) {
     bufferType_ = TENSOR_SPARSE;
     CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
     CHECK_EQ(shape_.ndims(), (size_t)2);
@@ -322,8 +323,6 @@ public:
 
   size_t nnz() const { return nnz_; }
 
-  bool isTranspose() const { return trans_; }
-
   SparseDataFormat dataFormat() const { return format_; }
 
   SparseDataType dataType() const { return type_; }
@@ -334,8 +333,6 @@ private:
   size_t nnz_;
   SparseDataFormat format_;
   SparseDataType type_;
-  /// todo(tianbing), move trans_ up to BufferArg
-  bool trans_;
 };
 
 }  // namespace paddle
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 7d34118252..1c593bb083 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -483,8 +483,8 @@ template <DeviceType Device>
 class MulFunc : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
-    scaleAB_ = config.get<real>("scaleAB");
-    scaleT_ = config.get<real>("scaleT");
+    alpha_ = config.get<real>("scaleAB");
+    beta_ = config.get<real>("scaleT");
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -494,7 +494,7 @@ public:
     CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
     CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
     CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
 
     auto in1_mat = inputs[0].matrix<Device>();
     if (inputs[0].isSparseArg()) {
@@ -505,12 +505,12 @@ public:
       in2_mat = inputs[1].sparse().SparseMatrix<Device>();
     }
     auto out_mat = outputs[0].matrix<Device>();
-    MulOp<Device>(out_mat, in1_mat, in2_mat, scaleAB_, scaleT_);
+    MulOp<Device>(out_mat, in1_mat, in2_mat, alpha_, beta_);
   }
 
 private:
-  real scaleAB_;
-  real scaleT_;
+  real alpha_;
+  real beta_;
 };
 
 REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
diff --git a/paddle/function/MulOp.h b/paddle/function/MulOp.h
index f3699f8c78..b7b1f56af1 100644
--- a/paddle/function/MulOp.h
+++ b/paddle/function/MulOp.h
@@ -68,4 +68,11 @@ void MulOp(GpuMatrix& out,
            real scaleAB,
            real scaleT);
 
+template <DeviceType DType>
+void MulOp(GpuSparseMatrix& out,
+           const GpuMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT);
+
 }  // namespace paddle
diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu
index 73d788a474..3691c7f320 100644
--- a/paddle/function/MulOpGpu.cu
+++ b/paddle/function/MulOpGpu.cu
@@ -170,4 +170,13 @@ void MulOp(GpuMatrix& out,
   }
 }
 
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuMatrix& b,
+                            real scale_ab,
+                            real scale_t) {
+  /// todo(tianbing), implement it
+}
+
 }  // namespace paddle
diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp
index ce9d37d664..3229193660 100644
--- a/paddle/function/MulOpTest.cpp
+++ b/paddle/function/MulOpTest.cpp
@@ -16,50 +16,79 @@ limitations under the License. */
 
 #include "FunctionTest.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/math/tests/test_matrixUtil.h"
 #include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 
-void testSpMatrixMul(int M, int N, int K, real rate, real scale1, real scale2) {
-  /// todo(tianbing) check CPU/GPU
+/**
+ * C = beta * C + alpha * (A * B), with alpha = scaleAB and beta = scaleT
+ */
+void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
+  real alpha = 1.5;
+  real beta = 2.0;
+
+  const auto cpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-CPU");
+  cpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
   const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU");
-  gpuFunc->init(FuncConfig().set("scaleAB", scale1).set("scaleT", scale2));
+  gpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
 
-  int nnz = M * N * rate;
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
-  MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
+  int heightA = (transa == false) ? dimM : dimK;
+  int widthA = (transa == false) ? dimK : dimM;
+  int heightB = (transb == false) ? dimK : dimN;
+  int widthB = (transb == false) ? dimN : dimK;
+  int heightC = dimM;
+  int widthC = dimN;
 
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
-  MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
+  auto cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
+  auto cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
+  auto cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
+  auto gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
+  auto gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
+  auto gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
 
   cpuA->randomizeUniform();
   cpuB->randomizeUniform();
   cpuC->randomizeUniform();
+  gpuA->copyFrom(*cpuA);
+  gpuB->copyFrom(*cpuB);
+  gpuC->copyFrom(*cpuC);
 
-  hl_stream_t stream(HPPL_STREAM_3);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  gpuC->copyFrom(*cpuC, stream);
-  hl_stream_synchronize(stream);
+  BufferArgs cpuInputs;
+  BufferArgs cpuOutputs;
+  cpuInputs.addArg(*cpuA);
+  cpuInputs.addArg(*cpuB);
+  cpuOutputs.addArg(*cpuC, ADD_TO);
+  cpuFunc->calc(cpuInputs, cpuOutputs);
 
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*gpuA->getTranspose());
-  inputs.addArg(*gpuB->getTranspose());
-  outputs.addArg(*gpuC, ASSIGN_TO);
+  BufferArgs gpuInputs;
+  BufferArgs gpuOutputs;
+  gpuInputs.addArg(*gpuA);
+  gpuInputs.addArg(*gpuB);
+  gpuOutputs.addArg(*gpuC, ADD_TO);
+  gpuFunc->calc(gpuInputs, gpuOutputs);
 
-  gpuFunc->calc(inputs, outputs);
+  autotest::TensorCheckErr(*cpuC, *gpuC);
 }
 
-TEST(SMatrix, sMatrixMul) {
-  for (auto M : {1, 40, 128, 200}) {
-    for (auto N : {100}) {
-      for (auto K : {100}) {
-        /// todo(tianbing), add scaleAB and scaleT
-        VLOG(3) << " M=" << M << " N=" << N << " K=" << K;
-        testSpMatrixMul(M, N, K, 0.05, 1, 1);
+TEST(Matrix, mul) {
+  for (auto transa : {false, true}) {
+    for (auto transb : {false, true}) {
+      for (auto dimM : {1, 10, 100}) {
+        for (auto dimN : {1, 10}) {
+          for (auto dimK : {8}) {
+            if (true == transa && true == transb) {
+              continue;
+            }
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " transa=" << transa << " transb=" << transb
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK;
+
+            testMatrixMul(transa, transb, dimM, dimN, dimK);
+          }
+        }
       }
     }
   }
 }
-- 
GitLab
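
Usage note (not part of the patch): after this change, MulFunc computes
C = scaleT * C + scaleAB * (A * B) and requires the output BufferArg to be
registered as ADD_TO; transposedness now travels inside BufferArg via trans_,
so callers no longer pass getTranspose() views as the old test did. The sketch
below mirrors MulOpTest.cpp. "MulOp-CPU", FuncConfig, BufferArgs, and ADD_TO
come straight from the patch; the function name mulOpExample and the concrete
dimensions and scale values are illustrative assumptions, not part of the
commit.

    #include "FunctionTest.h"
    #include "paddle/math/Matrix.h"

    using namespace paddle;  // NOLINT

    void mulOpExample() {
      // C = scaleT * C + scaleAB * (A * B); here 1.0 * C + 1.0 * (A * B).
      auto func = FunctionBase::funcRegistrar_.createByType("MulOp-CPU");
      func->init(
          FuncConfig().set("scaleAB", (real)1.0).set("scaleT", (real)1.0));

      auto a = std::make_shared<CpuMatrix>(32, 16);  // A: 32x16
      auto b = std::make_shared<CpuMatrix>(16, 8);   // B: 16x8
      auto c = std::make_shared<CpuMatrix>(32, 8);   // C: 32x8
      a->randomizeUniform();
      b->randomizeUniform();
      c->randomizeUniform();  // the product accumulates onto this initial C

      BufferArgs inputs;
      BufferArgs outputs;
      inputs.addArg(*a);
      inputs.addArg(*b);
      // ASSIGN_TO would now trip the
      // CHECK_EQ(outputs[0].getArgType(), ADD_TO) guard in MulFunc::calc().
      outputs.addArg(*c, ADD_TO);
      func->calc(inputs, outputs);
    }

The ADD_TO convention matches the underlying Matrix::mul semantics, where
scaleT already scales the existing output before the scaled product is added.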