From b3be73580717f571d37bb655887d6449024d0ab7 Mon Sep 17 00:00:00 2001
From: xutianbing
Date: Mon, 23 Jan 2017 15:00:36 -0800
Subject: [PATCH] Daoyuan's comments.

---
 paddle/function/BufferArg.h    |  37 +++----
 paddle/function/FunctionTest.h |  40 +++-----
 paddle/function/MulOp.cpp      | 173 ++++++++++++++++-----------------
 paddle/function/MulOp.h        |  40 ++++++--
 paddle/function/MulOpGpu.cu    | 114 +++++++---------------
 paddle/function/MulOpTest.cpp  |  72 +++++++-------
 6 files changed, 217 insertions(+), 259 deletions(-)

diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 7565047a570..f3634364ab2 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -71,24 +71,17 @@ public:
 public:
   BufferArg(ValueType valueType,
             const TensorShape& shape,
-            ArgType argType = UNSPECIFIED,
-            bool trans = false)
+            ArgType argType = UNSPECIFIED)
       : buf_(nullptr),
         valueType_(valueType),
         shape_(shape),
-        argType_(argType),
-        trans_(trans) {}
+        argType_(argType) {}
 
   BufferArg(void* buf,
             ValueType valueType,
             const TensorShape& shape,
-            ArgType argType = UNSPECIFIED,
-            bool trans = false)
-      : buf_(buf),
-        valueType_(valueType),
-        shape_(shape),
-        argType_(argType),
-        trans_(trans) {}
+            ArgType argType = UNSPECIFIED)
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
 
   BufferArg(void* buf, ValueType valueType)
       : buf_(buf), valueType_(valueType) {}
@@ -98,8 +91,7 @@ public:
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(2),
-        argType_(argType),
-        trans_(matrix.isTransposed()) {
+        argType_(argType) {
     bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, matrix.getHeight());
     shape_.setDim(1, matrix.getWidth());
@@ -112,8 +104,7 @@ public:
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(shape),
-        argType_(argType),
-        trans_(matrix.isTransposed()) {
+        argType_(argType) {
     bufferType_ = TENSOR_NORMAL;
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
@@ -145,7 +136,7 @@ public:
     // CHECK(deviceType_ == DType);
     CHECK_EQ((size_t)2, shape_.ndims());
     return typename Tensor<real, DType>::Matrix(
-        reinterpret_cast<real*>(buf_), shape_[0], shape_[1], trans_);
+        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
   }
 
   template <typename VType>
@@ -169,7 +160,6 @@ public:
   ValueType valueType() const { return valueType_; }
   BufferType bufferType() const { return bufferType_; }
   const TensorShape& shape() const { return shape_; }
-  bool isTransposed() const { return trans_; }
   bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
   bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
   virtual size_t numElements() const { return shape_.getElements(); }
@@ -183,7 +173,6 @@ protected:
   TensorShape shape_;
   BufferType bufferType_{TENSOR_UNKNOWN};
   ArgType argType_{UNSPECIFIED};
-  bool trans_{false};
   // todo(tianbing), add deviceType_
   // leading dimensions. The size is dims_.size()
   // Dims lds_;
@@ -277,9 +266,8 @@ public:
                   size_t nnz,
                   SparseFormat format,
                   SparseValueType type,
-                  ArgType argType = UNSPECIFIED,
-                  bool trans = false)
-      : BufferArg(buf, valueType, shape, argType, trans),
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
         row_(row),
         col_(col),
         nnz_(nnz),
@@ -302,9 +290,8 @@ public:
                   size_t nnz,
                   SparseFormat format,
                   SparseValueType type,
-                  ArgType argType = UNSPECIFIED,
-                  bool trans = false)
-      : BufferArg(valueType, shape, argType, trans),
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType),
         /// len of row_ : height + 1 (CSR), buf_ == nullptr
         row_(format == SPARSE_CSR
                  ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape[0] + 1})
@@ -343,7 +330,7 @@ public:
                       nnz_,
                       type_,
                       format_,
-                      trans_);
+                      false);
   }
 
   ~SparseMatrixArg() {}
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 6515cba1629..baa94abffa0 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -64,22 +64,14 @@ public:
     cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
     gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
 
-    cpuInputs_.emplace_back(
-        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
-                                    input.valueType(),
-                                    input.shape(),
-                                    UNSPECIFIED,
-                                    input.isTransposed()));
-    gpuInputs_.emplace_back(
-        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
-                                    input.valueType(),
-                                    input.shape(),
-                                    UNSPECIFIED,
-                                    input.isTransposed()));
+    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
   }
 
   // output need only contains shape, do not contains data.
-  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
+  void addOutputs(const BufferArg& output, ArgType argType = ADD_TO) {
     size_t size =
         output.shape().getElements() * sizeOfValuType(output.valueType());
     cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
@@ -89,16 +81,14 @@ public:
         cpuMemory_.back()->getBuf(),
         output.valueType(),
         output.shape(),
-        // todo(tianbing), argType = output.getArgType(), but default ASSIGN_TO
-        argType,
-        output.isTransposed()));
+        // todo(tianbing), argType = output.getArgType(), but default ADD_TO
+        argType));
     gpuOutputs_.emplace_back(std::make_shared<BufferArg>(
         gpuMemory_.back()->getBuf(),
         output.valueType(),
         output.shape(),
-        // todo(tianbing), argType = output.getArgType(), but default ASSIGN_TO
-        argType,
-        output.isTransposed()));
+        // todo(tianbing), argType = output.getArgType(), but default ADD_TO
+        argType));
   }
 
   /// add and init output sparse matrix
@@ -107,15 +97,13 @@ public:
     cpuSparse_ = std::make_shared<CpuSparseMatrix>(output.shape()[0],
                                                    output.shape()[1],
                                                    output.nnz(),
                                                    output.dataType(),
-                                                   output.dataFormat(),
-                                                   output.isTransposed());
+                                                   output.dataFormat());
     gpuSparse_ = std::make_shared<GpuSparseMatrix>(output.shape()[0],
                                                    output.shape()[1],
                                                    output.nnz(),
                                                    output.dataType(),
-                                                   output.dataFormat(),
-                                                   output.isTransposed());
+                                                   output.dataFormat());
 
     /// init sparse matrix
     hl_stream_t stream(HPPL_STREAM_1);
@@ -154,15 +142,13 @@ public:
     cpuSparse_ = std::make_shared<CpuSparseMatrix>(input.shape()[0],
                                                    input.shape()[1],
                                                    input.nnz(),
                                                    input.dataType(),
-                                                   input.dataFormat(),
-                                                   input.isTransposed());
+                                                   input.dataFormat());
     gpuSparse_ = std::make_shared<GpuSparseMatrix>(input.shape()[0],
                                                    input.shape()[1],
                                                    input.nnz(),
                                                    input.dataType(),
-                                                   input.dataFormat(),
-                                                   input.isTransposed());
+                                                   input.dataFormat());
 
     /// init sparse matrix
     hl_stream_t stream(HPPL_STREAM_1);
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 4d7f1a7fa92..965115121eb 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -46,21 +46,11 @@ void MulOp(CpuSparseMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT) {
-  CHECK(!out.isTransposed()) << "Not supported";
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
   CHECK_EQ(out.getValueType(), FLOAT_VALUE);
-  CHECK(!a.isTransposed() || !b.isTransposed())
-      << "Not support both a and b are transpose matrices";
-
-  size_t height = out.getHeight();
-  size_t width = out.getWidth();
-  size_t aRow = !a.isTransposed() ? a.getHeight() : a.getWidth();
-  size_t aCol = !a.isTransposed() ? a.getWidth() : a.getHeight();
-  size_t bRow = !b.isTransposed() ? b.getHeight() : b.getWidth();
-  size_t bCol = !b.isTransposed() ? b.getWidth() : b.getHeight();
-  /// C = A * B, for matrix format
-  CHECK(aCol == bRow && aRow == height && bCol == width);
-
   if (scaleT == 0) {
     out.zeroMem();
   }
@@ -69,12 +59,14 @@ void MulOp(CpuSparseMatrix& out,
   real* C = out.getValue();
   int* rows = out.getRows();
   int* cols = out.getCols();
+  size_t width = out.getWidth();
+  size_t height = out.getHeight();
 
   /// SPARSE_CSC, {a any, b not trans}
   if (out.getFormat() == SPARSE_CSC) {
     /// b not trans and a any
-    CHECK(!b.isTransposed());
-    size_t m = !a.isTransposed() ? a.getWidth() : a.getHeight();
+    CHECK(!bTrans);
+    size_t m = !aTrans ? a.getWidth() : a.getHeight();
     for (size_t i = 0; i < width; i++) {
       size_t start = out.getColStartIdx(i);
       size_t end = out.getColStartIdx(i + 1);
@@ -82,9 +74,8 @@ void MulOp(CpuSparseMatrix& out,
         real sum = 0;
         size_t rowIdx = rows[j];
         for (size_t k = 0; k < m; k++) {
-          sum +=
-              (!a.isTransposed() ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
-              B[k * width + i];
+          sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
+                 B[k * width + i];
         }
         C[j] = scaleAB * sum + scaleT * C[j];
       }
@@ -95,7 +86,7 @@ void MulOp(CpuSparseMatrix& out,
   /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans}
   if (out.getFormat() == SPARSE_CSR) {
     /// a and b can not both transpose
-    CHECK(!(a.isTransposed() && b.isTransposed()));
+    CHECK(!(aTrans && bTrans));
     size_t m = a.getWidth();
     for (size_t i = 0; i < height; i++) {
       size_t start = out.getRowStartIdx(i);
@@ -104,9 +95,8 @@ void MulOp(CpuSparseMatrix& out,
         real sum = 0;
         size_t colIdx = cols[j];
         for (size_t k = 0; k < m; k++) {
-          sum +=
-              (!a.isTransposed() ? A[i * m + k] : A[k * height + i]) *
-              (!b.isTransposed() ? B[k * width + colIdx] : B[colIdx * m + k]);
+          sum += (!aTrans ? A[i * m + k] : A[k * height + i]) *
+                 (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]);
         }
         C[j] = scaleAB * sum + scaleT * C[j];
       }
@@ -120,25 +110,15 @@ void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT) {
-  CHECK(!out.isTransposed()) << "out matrix transpose not supported";
-  CBLAS_TRANSPOSE aTrans = a.isTransposed() ? CblasTrans : CblasNoTrans;
-  size_t aRow = a.isTransposed() ? a.getWidth() : a.getHeight();
-  size_t aCol = a.isTransposed() ? a.getHeight() : a.getWidth();
-  CBLAS_TRANSPOSE bTrans = b.isTransposed() ? CblasTrans : CblasNoTrans;
-  size_t bRow = b.isTransposed() ? b.getWidth() : b.getHeight();
-  size_t bCol = b.isTransposed() ? b.getHeight() : b.getWidth();
-
-  /// C = A * B, for matrix format
-  CHECK_EQ(aCol, bRow);
-  CHECK_EQ(aRow, out.getHeight());
-  CHECK_EQ(bCol, out.getWidth());
-
-  GEMM(aTrans,
-       bTrans,
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
+  GEMM(aTrans ? CblasTrans : CblasNoTrans,
+       bTrans ? CblasTrans : CblasNoTrans,
        out.getHeight(),
       out.getWidth(),
-       aCol,
+       !aTrans ? a.getWidth() : a.getHeight(),
        scaleAB,
        a.getData(),
        a.getStride(),
@@ -154,21 +134,12 @@ void MulOp(CpuMatrix& out,
           const CpuSparseMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT) {
-  CHECK(!out.isTransposed()) << "Not supported";
-  CHECK(!b.isTransposed()) << "Not supported";
-  CHECK(scaleT == 0 || scaleT == 1) << "Not support";
-  CHECK_EQ(scaleAB, static_cast<real>(1.0)) << "Not supported";
-  CHECK_EQ(a.getFormat(), SPARSE_CSR) << "Not supported";
-
-  if (!a.isTransposed()) {
-    CHECK(b.getHeight() == a.getWidth() && a.getHeight() == out.getHeight() &&
-          b.getWidth() == out.getWidth());
-  } else {
-    CHECK(b.getHeight() == a.getHeight() && a.getWidth() == out.getHeight() &&
-          b.getWidth() == out.getWidth());
-  }
-
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
+  CHECK_EQ(a.getFormat(), SPARSE_CSR)
+      << "only the SPARSE_CSR format is supported for a";
   if (scaleT == 0) {
     out.zeroMem();
   }
@@ -185,9 +156,9 @@ void MulOp(CpuMatrix& out,
     const int start = a.getRowStartIdx(i);
     const int end = a.getRowStartIdx(i + 1);
     for (int j = start; j < end; ++j) {
-      vecAddTo(!a.isTransposed() ? out.getRow(i) : out.getRow(cols[j]),
-               !a.isTransposed() ? const_cast<CpuMatrix&>(b).getRow(cols[j])
-                                 : const_cast<CpuMatrix&>(b).getRow(i),
+      vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]),
+               !aTrans ? const_cast<CpuMatrix&>(b).getRow(cols[j])
+                       : const_cast<CpuMatrix&>(b).getRow(i),
               (a.getValueType() == FLOAT_VALUE) ? values[j] : (real)1.0,
               out.getWidth());
    }
@@ -199,19 +170,10 @@ void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuSparseMatrix& b,
           real scaleAB,
-          real scaleT) {
-  CHECK(!out.trans_) << "Not supported";
-  CHECK(!a.isTransposed()) << "Not supported";
-  CHECK(scaleT == 0 || scaleT == 1);
-  CHECK_EQ(scaleAB, static_cast<real>(1.0));
-  if (!b.isTransposed()) {  /// b is not Transpose
-    CHECK(b.getHeight() == a.getWidth() && a.getHeight() == out.getHeight() &&
-          b.getWidth() == out.getWidth());
-  } else {
-    CHECK(b.getHeight() == out.getWidth() && a.getHeight() == out.getHeight() &&
-          b.getWidth() == a.getWidth());
-  }
-
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
   if (scaleT == 0) {
     out.zeroMem();
   }
@@ -227,8 +189,8 @@ void MulOp(CpuMatrix& out,
       int start = b.getColStartIdx(j);
       int end = b.getColStartIdx(j + 1);
       for (int i = start; i < end; ++i) {
-        colVecAddTo(!b.isTransposed() ? C + j : C + rows[i],
-                    !b.isTransposed() ? A + rows[i] : A + j,
+        colVecAddTo(!bTrans ? C + j : C + rows[i],
+                    !bTrans ? A + rows[i] : A + j,
                     (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
                     out.getHeight(),
                     out.getWidth(),
@@ -244,8 +206,8 @@ void MulOp(CpuMatrix& out,
       int start = b.getRowStartIdx(j);
       int end = b.getRowStartIdx(j + 1);
       for (int i = start; i < end; ++i) {
-        colVecAddTo(!b.isTransposed() ? C + cols[i] : C + j,
-                    !b.isTransposed() ? A + j : A + cols[i],
+        colVecAddTo(!bTrans ? C + cols[i] : C + j,
+                    !bTrans ? A + j : A + cols[i],
                     (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
                    out.getHeight(),
                    out.getWidth(),
@@ -270,16 +232,43 @@ public:
   void init(const FuncConfig& config) override {
     alpha_ = config.get<real>("scaleAB");
     beta_ = config.get<real>("scaleT");
+    aTrans_ = config.get<bool>("aTrans");
+    bTrans_ = config.get<bool>("bTrans");
+    cTrans_ = config.get<bool>("cTrans");
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK(!cTrans_) << "output matrix should not be transposed";
+    CHECK(!aTrans_ || !bTrans_)
+        << "transposing both a and b is not supported";
+
     CHECK_EQ((size_t)2, inputs.size());
     CHECK_EQ((size_t)1, outputs.size());
     CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
     CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
     CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
     CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1];
+    size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0];
+    size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1];
+    size_t bCol = !bTrans_ ? inputs[1].shape()[1] : inputs[1].shape()[0];
+    /// C = A * B, or C += A * B, for matrix format
+    CHECK_EQ(aCol, bRow);
+    CHECK_EQ(aRow, outputs[0].shape()[0]);
+    CHECK_EQ(bCol, outputs[0].shape()[1]);
+
+    /// only support C = A * B or C += A * B
+    CHECK_EQ(alpha_, static_cast<real>(1.0));
+    CHECK((beta_ == 0 && outputs[0].getArgType() == ASSIGN_TO) ||
+          (beta_ == 1 && outputs[0].getArgType() == ADD_TO));
+
+    /// dense output requires that a and b are not both sparse;
+    /// sparse output requires that both a and b are dense
+    CHECK((!outputs[0].isSparseArg() &&
+           !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) ||
+          (outputs[0].isSparseArg() && !inputs[0].isSparseArg() &&
+           !inputs[1].isSparseArg()));
 
     auto outMat = outputs[0].matrix<Device>();
     /// matrix = matrix * matrix
@@ -289,29 +278,40 @@ public:
                     inputs[0].matrix<Device>(),
                     inputs[1].matrix<Device>(),
                     alpha_,
-                    beta_);
+                    beta_,
+                    aTrans_,
+                    bTrans_,
+                    cTrans_);
       return;
     }
 
     /// matrix = matrix * sparse matrix
     if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() &&
         !outputs[0].isSparseArg()) {
+      CHECK(!aTrans_) << "transpose of a is not supported";
      MulOp<Device>(outMat,
                    inputs[0].matrix<Device>(),
                    inputs[1].sparse().SparseMatrix<Device>(),
                    alpha_,
-                    beta_);
+                    beta_,
+                    aTrans_,
+                    bTrans_,
+                    cTrans_);
       return;
     }
 
     /// matrix = sparse matrix * matrix
     if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
         !outputs[0].isSparseArg()) {
+      CHECK(!bTrans_) << "transpose of b is not supported";
      MulOp<Device>(outMat,
                    inputs[0].sparse().SparseMatrix<Device>(),
                    inputs[1].matrix<Device>(),
                    alpha_,
-                    beta_);
+                    beta_,
+                    aTrans_,
+                    bTrans_,
+                    cTrans_);
       return;
     }
@@ -319,18 +319,14 @@ public:
     auto outSparseMat = outputs[0].sparse().SparseMatrix<Device>();
     if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
         outputs[0].isSparseArg()) {
-      /*
-      LOG(INFO) << "input0";
-      inputs[0].matrix().print(std::cout);
-      LOG(INFO) << "input1";
-      inputs[1].matrix().print(std::cout);
-      LOG(INFO) << "output sparse matrix";
-      outSparseMat.print(std::cout); */
      MulOp<Device>(outSparseMat,
                    inputs[0].matrix<Device>(),
                    inputs[1].matrix<Device>(),
                    alpha_,
-                    beta_);
+                    beta_,
+                    aTrans_,
+                    bTrans_,
+                    cTrans_);
       return;
     }
   }
@@ -338,6 +334,9 @@ public:
 private:
   real alpha_;
   real beta_;
+  bool aTrans_;
+  bool bTrans_;
+  bool cTrans_;
 };
 
 REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
diff --git a/paddle/function/MulOp.h b/paddle/function/MulOp.h
index fda5b092498..a7703482255 100644
--- a/paddle/function/MulOp.h
+++ b/paddle/function/MulOp.h
@@ -26,55 +26,79 @@ void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(CpuMatrix& out,
           const CpuSparseMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuSparseMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(CpuSparseMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(GpuMatrix& out,
           const GpuSparseMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuSparseMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(GpuSparseMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 }  // namespace paddle
diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu
index 09d2a764911..94bee72034f 100644
--- a/paddle/function/MulOpGpu.cu
+++ b/paddle/function/MulOpGpu.cu
@@ -27,38 +27,22 @@ void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT) {
-  CHECK(!out.isTransposed()) << "Transpose not supported for out matrix";
-  if (!a.isTransposed() && !b.isTransposed()) {
-    /// a : M * K, b: K * N
-    CHECK(out.getWidth() == b.getWidth() &&
-          out.getHeight() == a.getHeight() &&
-          a.getWidth() == b.getHeight());
-  } else if (a.isTransposed() && !b.isTransposed()) {
-    /// a : K * M, b : K * N
-    CHECK(out.getWidth() == b.getWidth() &&
-          out.getHeight() == a.getWidth() &&
-          a.getHeight() == b.getHeight());
-  } else if (!a.isTransposed() && b.isTransposed()) {
-    /// a: M * K, b : N * K
-    CHECK(out.getWidth() == b.getHeight() &&
-          out.getHeight() == a.getHeight() &&
-          a.getWidth() == b.getWidth());
-  } else {
-    LOG(FATAL) << "Not support for both a and b are Transposed Matrices";
-  }
-
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
   real* aData = const_cast<real*>(a.getData());
   real* bData = const_cast<real*>(b.getData());
   real* outData = const_cast<real*>(out.getData());
   hl_matrix_mul(aData,
-                !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T,
+                !aTrans ? HPPL_OP_N : HPPL_OP_T,
                 bData,
-                !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T,
+                !bTrans ? HPPL_OP_N : HPPL_OP_T,
                 outData,
                 out.getHeight(),
                 out.getWidth(),
-                !a.isTransposed() ? a.getWidth() : a.getHeight(),
+                !aTrans ? a.getWidth() : a.getHeight(),
                 scaleAB,
                 scaleT,
                 a.getStride(),
@@ -75,27 +59,19 @@ void MulOp(GpuMatrix& out,
           const GpuSparseMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT) {
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
   CHECK(out.isContiguous());
   CHECK(b.isContiguous());
-  CHECK(b.useGpu_) << "Matrix type are not equal";
-  CHECK(!out.isTransposed() && !b.isTransposed()) << "not supported";
-  if (!a.isTransposed()) {
-    /// a: M * K, b: K * N
-    CHECK(out.getWidth() == b.getWidth() && out.getHeight() == a.getHeight()
-          && a.getWidth() == b.getHeight()) << "Matrix dimensions are not equal";
-  } else {
-    /// a: K * M, transpose, b: K * N
-    CHECK(out.getWidth() == b.getWidth() && out.getHeight() == a.getWidth()
-          && a.getHeight() == b.getHeight()) << "Matrix dimensions are not equal";
-  }
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
 
-  hl_trans_op_t aTrans = a.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
   hl_sparse_matrix_s aData = a.sMatrix_.get();
   real* bData = const_cast<real*>(b.getData());
   real* outData = const_cast<real*>(out.getData());
   hl_matrix_csr_mul_dense(aData,
-                          aTrans,
+                          aTrans ? HPPL_OP_T : HPPL_OP_N,
                           bData,
                           HPPL_OP_N,
                           outData,
@@ -115,25 +91,14 @@ void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuSparseMatrix& b,
           real scaleAB,
-          real scaleT) {
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
   CHECK(out.isContiguous());
   CHECK(a.isContiguous());
-  CHECK(a.useGpu_) << "Matrix type are not equal";
-  if (!b.isTransposed()) {
-    /// a : M * K, b : K * N
-    CHECK(out.getWidth() == b.getWidth() &&
-          out.getHeight() == a.getHeight() &&
-          a.getWidth() == b.getHeight())
-        << "Matrix dimensions are not equal";
-  } else {
-    /// a : M * K, b : N * K, transpose
-    CHECK(out.getWidth() == b.getHeight() &&
-          out.getHeight() == a.getHeight() &&
-          a.getWidth() == b.getWidth())
-        << "Matrix dimensions are not equal";
-  }
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
 
-  hl_trans_op_t bTrans = b.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
   hl_sparse_matrix_s bData = b.sMatrix_.get();
   real* aData = const_cast<real*>(a.getData());
   real* outData = const_cast<real*>(out.getData());
@@ -142,7 +107,7 @@ void MulOp(GpuMatrix& out,
     hl_matrix_dense_mul_csc(aData,
                             HPPL_OP_N,
                             bData,
-                            bTrans,
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
                             outData,
                             out.getHeight(),
                             out.getWidth(),
@@ -153,7 +118,7 @@ void MulOp(GpuMatrix& out,
     hl_matrix_dense_mul_csr(aData,
                             HPPL_OP_N,
                             bData,
-                            bTrans,
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
                             outData,
                             out.getHeight(),
                             out.getWidth(),
@@ -168,35 +133,26 @@ void MulOp(GpuSparseMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT) {
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
   CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  CHECK(!out.isTransposed()) << "Transpose is not supported for out matrix";
-
-  if (!a.isTransposed() && !b.isTransposed()) {
-    CHECK(out.getHeight() == a.getHeight() &&
-          out.getWidth() == b.getWidth() &&
-          a.getWidth() == b.getHeight());
-  } else if (a.isTransposed() && !b.isTransposed()) {
-    CHECK(out.getHeight() == a.getWidth() &&
-          out.getWidth() == b.getWidth() &&
-          a.getHeight() == b.getHeight());
-  } else if (!a.isTransposed() && b.isTransposed()) {
-    CHECK(out.getHeight() == a.getHeight() &&
-          out.getWidth() == b.getHeight() &&
-          a.getWidth() == b.getWidth());
-  } else {
-    LOG(FATAL) << "Not support for both a and b are Transposed Matrices";
-  }
-
-  hl_trans_op_t aTrans = a.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
-  hl_trans_op_t bTrans = b.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
-  int dimK = !b.isTransposed() ? b.getHeight() : b.getWidth();
 
   real* aData = const_cast<real*>(a.getData());
   real* bData = const_cast<real*>(b.getData());
   hl_sparse_matrix_s outData = out.sMatrix_.get();
-  hl_sparse_matrix_mul(aData, aTrans, bData, bTrans, outData,
-                       out.getHeight(), out.getWidth(), dimK, scaleAB, scaleT);
+  hl_sparse_matrix_mul(aData,
+                       aTrans ? HPPL_OP_T : HPPL_OP_N,
+                       bData,
+                       bTrans ? HPPL_OP_T : HPPL_OP_N,
+                       outData,
+                       out.getHeight(),
+                       out.getWidth(),
+                       !bTrans ? b.getHeight() : b.getWidth(),
+                       scaleAB,
+                       scaleT);
 }
 
 }  // namespace paddle
diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp
index 05460c80970..f67fa41612c 100644
--- a/paddle/function/MulOpTest.cpp
+++ b/paddle/function/MulOpTest.cpp
@@ -39,18 +39,21 @@ void testFuncDDDMatrix(
   size_t widthC = dimN;
   // init Test object
   FunctionCompare test("MulOp",
-                       FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
+                       FuncConfig()
+                           .set("scaleAB", alpha)
+                           .set("scaleT", beta)
+                           .set("aTrans", transa)
+                           .set("bTrans", transb)
+                           .set("cTrans", false));
   // prepare input arguments
   /// matrix A : HA * WA
-  test.addInputs(BufferArg(
-      VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}, UNSPECIFIED, transa));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}));
   /// matrix B: HB * WB
-  test.addInputs(BufferArg(
-      VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}, UNSPECIFIED, transb));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}));
   /// output matrix C: HC * WC
   test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}),
-                  ADD_TO);
+                  beta == 1.0 ? ADD_TO : ASSIGN_TO);
   // run Function
   test.run();
 }
@@ -88,21 +91,22 @@ void testFuncDSparseDMatrix(
   real beta = 1.0;
   // init Test object
   FunctionCompare test("MulOp",
-                       FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
+                       FuncConfig()
+                           .set("scaleAB", alpha)
+                           .set("scaleT", beta)
+                           .set("aTrans", false)
+                           .set("bTrans", false)
+                           .set("cTrans", false));
   // prepare input arguments
   /// sparse matrix A : M * K
-  test.addInputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
-                                 TensorShape{dimM, dimK},
-                                 nnz,
-                                 FORMAT,
-                                 FLOAT_VALUE,
-                                 UNSPECIFIED,
-                                 false));
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
   /// matrix B: K * N
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
   /// output matrix C: M * N
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  beta == 1.0 ? ADD_TO : ASSIGN_TO);
   // run Function
   test.run();
 }
@@ -138,22 +142,23 @@ void testFuncDDSparseMatrix(
   real beta = 1.0;
   // init Test object
   FunctionCompare test("MulOp",
-                       FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
+                       FuncConfig()
+                           .set("scaleAB", alpha)
+                           .set("scaleT", beta)
+                           .set("aTrans", false)
+                           .set("bTrans", false)
+                           .set("cTrans", false));
   // prepare input arguments
   /// matrix A : M * K
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
   /// matrix B: K * N
-  test.addInputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
-                                 TensorShape{dimK, dimN},
-                                 nnz,
-                                 FORMAT,
-                                 FLOAT_VALUE,
-                                 UNSPECIFIED,
-                                 false));
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
   /// output matrix C: M * N
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  beta == 1.0 ? ADD_TO : ASSIGN_TO);
   // run Function
   test.run();
 }
@@ -189,7 +194,12 @@ void testFuncSparseDDMatrix(
   real beta = 1.0;
   // init Test object
   FunctionCompare test("MulOp",
-                       FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
+                       FuncConfig()
+                           .set("scaleAB", alpha)
+                           .set("scaleT", beta)
+                           .set("aTrans", false)
+                           .set("bTrans", false)
+                           .set("cTrans", false));
   // prepare input arguments
   /// matrix A : M * K
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
@@ -198,14 +208,10 @@ void testFuncSparseDDMatrix(
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
 
   /// output sparse matrix C: M * N
-  test.addOutputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
-                                  TensorShape{dimM, dimN},
-                                  nnz,
-                                  FORMAT,
-                                  FLOAT_VALUE,
-                                  UNSPECIFIED,
-                                  false),
-                  ADD_TO);
+  test.addOutputs(
+      SparseMatrixArg(
+          VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
+      beta == 1.0 ? ADD_TO : ASSIGN_TO);
   // run Function
   test.run();
 }
-- 
GitLab
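
For context, the sketch below shows how a caller drives MulOp after this change: the transpose flags now travel in FuncConfig rather than in BufferArg, and the output ArgType must agree with scaleT (ASSIGN_TO when scaleT == 0, ADD_TO when scaleT == 1), as enforced in MulFunc::calc(). This is a minimal sketch modeled on testFuncDDDMatrix in MulOpTest.cpp; the helper name and shapes are illustrative, not part of the patch.

#include "paddle/function/FunctionTest.h"

namespace paddle {

// Hypothetical helper: computes C = A^T * B through the "MulOp" Function,
// comparing the CPU and GPU implementations via FunctionCompare.
void exampleTransposedMul() {
  const size_t M = 4, K = 6, N = 8;
  FunctionCompare test("MulOp",
                       FuncConfig()
                           .set("scaleAB", (real)1.0)
                           .set("scaleT", (real)0.0)  // C = A * B, no accumulation
                           .set("aTrans", true)       // A is supplied as K x M
                           .set("bTrans", false)
                           .set("cTrans", false));
  /// matrix A : K * M (used transposed)
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{K, M}));
  /// matrix B : K * N
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{K, N}));
  /// output C : M * N; scaleT == 0 pairs with ASSIGN_TO per MulFunc::calc()
  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{M, N}), ASSIGN_TO);
  test.run();
}

}  // namespace paddle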