From b3be73580717f571d37bb655887d6449024d0ab7 Mon Sep 17 00:00:00 2001
From: xutianbing
Date: Mon, 23 Jan 2017 15:00:36 -0800
Subject: [PATCH] Daoyuan's comments.

---
 paddle/function/BufferArg.h    |  37 +++----
 paddle/function/FunctionTest.h |  40 +++-----
 paddle/function/MulOp.cpp      | 173 ++++++++++++++++-----------------
 paddle/function/MulOp.h        |  40 ++++++--
 paddle/function/MulOpGpu.cu    | 114 +++++++---------------
 paddle/function/MulOpTest.cpp  |  72 +++++++-------
 6 files changed, 217 insertions(+), 259 deletions(-)

diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 7565047a570..f3634364ab2 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -71,24 +71,17 @@ public:
 public:
   BufferArg(ValueType valueType,
             const TensorShape& shape,
-            ArgType argType = UNSPECIFIED,
-            bool trans = false)
+            ArgType argType = UNSPECIFIED)
       : buf_(nullptr),
         valueType_(valueType),
         shape_(shape),
-        argType_(argType),
-        trans_(trans) {}
+        argType_(argType) {}
 
   BufferArg(void* buf,
             ValueType valueType,
             const TensorShape& shape,
-            ArgType argType = UNSPECIFIED,
-            bool trans = false)
-      : buf_(buf),
-        valueType_(valueType),
-        shape_(shape),
-        argType_(argType),
-        trans_(trans) {}
+            ArgType argType = UNSPECIFIED)
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
 
   BufferArg(void* buf, ValueType valueType)
       : buf_(buf), valueType_(valueType) {}
@@ -98,8 +91,7 @@ public:
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(2),
-        argType_(argType),
-        trans_(matrix.isTransposed()) {
+        argType_(argType) {
     bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, matrix.getHeight());
     shape_.setDim(1, matrix.getWidth());
@@ -112,8 +104,7 @@ public:
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(shape),
-        argType_(argType),
-        trans_(matrix.isTransposed()) {
+        argType_(argType) {
     bufferType_ = TENSOR_NORMAL;
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
@@ -145,7 +136,7 @@ public:
     // CHECK(deviceType_ == DType);
     CHECK_EQ((size_t)2, shape_.ndims());
     return typename Tensor<real, DType>::Matrix(
-        reinterpret_cast<real*>(buf_), shape_[0], shape_[1], trans_);
+        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
   }
 
   template <typename VType>
@@ -169,7 +160,6 @@ public:
   ValueType valueType() const { return valueType_; }
   BufferType bufferType() const { return bufferType_; }
   const TensorShape& shape() const { return shape_; }
-  bool isTransposed() const { return trans_; }
   bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
   bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
   virtual size_t numElements() const { return shape_.getElements(); }
@@ -183,7 +173,6 @@ protected:
   TensorShape shape_;
   BufferType bufferType_{TENSOR_UNKNOWN};
   ArgType argType_{UNSPECIFIED};
-  bool trans_{false};
   // todo(tianbing), add deviceType_
   // leading dimensions. The size is dims_.size()
   // Dims lds_;
@@ -277,9 +266,8 @@ public:
                   size_t nnz,
                   SparseFormat format,
                   SparseValueType type,
-                  ArgType argType = UNSPECIFIED,
-                  bool trans = false)
-      : BufferArg(buf, valueType, shape, argType, trans),
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
         row_(row),
         col_(col),
         nnz_(nnz),
@@ -302,9 +290,8 @@ public:
                   size_t nnz,
                   SparseFormat format,
                   SparseValueType type,
-                  ArgType argType = UNSPECIFIED,
-                  bool trans = false)
-      : BufferArg(valueType, shape, argType, trans),
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType),
         /// len of row_ : height + 1 (CSR), buf_ == nullptr
         row_(format == SPARSE_CSR
                  ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape[0] + 1})
@@ -343,7 +330,7 @@ public:
                       nnz_,
                       type_,
                       format_,
-                      trans_);
+                      false);
   }
 
   ~SparseMatrixArg() {}
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 6515cba1629..baa94abffa0 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -64,22 +64,14 @@ public:
     cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
     gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
 
-    cpuInputs_.emplace_back(
-        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
-                                    input.valueType(),
-                                    input.shape(),
-                                    UNSPECIFIED,
-                                    input.isTransposed()));
-    gpuInputs_.emplace_back(
-        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
-                                    input.valueType(),
-                                    input.shape(),
-                                    UNSPECIFIED,
-                                    input.isTransposed()));
+    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
   }
 
   // output need only contains shape, do not contains data.
-  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
+  void addOutputs(const BufferArg& output, ArgType argType = ADD_TO) {
     size_t size =
         output.shape().getElements() * sizeOfValuType(output.valueType());
     cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
@@ -89,16 +81,14 @@ public:
         cpuMemory_.back()->getBuf(),
         output.valueType(),
         output.shape(),
-        // todo(tianbing), argType = output.getArgType(), but default ASSIGN_TO
-        argType,
-        output.isTransposed()));
+        // todo(tianbing), argType = output.getArgType(), but default ADD_TO
+        argType));
     gpuOutputs_.emplace_back(std::make_shared<BufferArg>(
         gpuMemory_.back()->getBuf(),
         output.valueType(),
         output.shape(),
-        // todo(tianbing), argType = output.getArgType(), but default ASSIGN_TO
-        argType,
-        output.isTransposed()));
+        // todo(tianbing), argType = output.getArgType(), but default ADD_TO
+        argType));
   }
 
   /// add and init output sparse matrix
@@ -107,15 +97,13 @@ public:
     cpuSparse_ = std::make_shared<CpuSparseMatrix>(output.shape()[0],
                                                    output.shape()[1],
                                                    output.nnz(),
                                                    output.dataType(),
-                                                   output.dataFormat(),
-                                                   output.isTransposed());
+                                                   output.dataFormat());
     gpuSparse_ = std::make_shared<GpuSparseMatrix>(output.shape()[0],
                                                    output.shape()[1],
                                                    output.nnz(),
                                                    output.dataType(),
-                                                   output.dataFormat(),
-                                                   output.isTransposed());
+                                                   output.dataFormat());
 
     /// init sparse matrix
     hl_stream_t stream(HPPL_STREAM_1);
@@ -154,15 +142,13 @@ public:
     cpuSparse_ = std::make_shared<CpuSparseMatrix>(input.shape()[0],
                                                    input.shape()[1],
                                                    input.nnz(),
                                                    input.dataType(),
-                                                   input.dataFormat(),
-                                                   input.isTransposed());
+                                                   input.dataFormat());
     gpuSparse_ = std::make_shared<GpuSparseMatrix>(input.shape()[0],
                                                    input.shape()[1],
                                                    input.nnz(),
                                                    input.dataType(),
-                                                   input.dataFormat(),
-                                                   input.isTransposed());
+                                                   input.dataFormat());
 
     /// init sparse matrix
     hl_stream_t stream(HPPL_STREAM_1);
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 4d7f1a7fa92..965115121eb 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -46,21 +46,11 @@ void MulOp(CpuSparseMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT) {
-  CHECK(!out.isTransposed()) << "Not supported";
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
   CHECK_EQ(out.getValueType(), FLOAT_VALUE);
-  CHECK(!a.isTransposed() || !b.isTransposed())
-      << "Not support both a and b are transpose matrices";
-
-  size_t height = out.getHeight();
-  size_t width = out.getWidth();
-  size_t aRow = !a.isTransposed() ? a.getHeight() : a.getWidth();
-  size_t aCol = !a.isTransposed() ? a.getWidth() : a.getHeight();
-  size_t bRow = !b.isTransposed() ? b.getHeight() : b.getWidth();
-  size_t bCol = !b.isTransposed() ? b.getWidth() : b.getHeight();
-  /// C = A * B, for matrix format
-  CHECK(aCol == bRow && aRow == height && bCol == width);
-
   if (scaleT == 0) {
     out.zeroMem();
   }
@@ -69,12 +59,14 @@ void MulOp(CpuSparseMatrix& out,
   real* C = out.getValue();
   int* rows = out.getRows();
   int* cols = out.getCols();
+  size_t width = out.getWidth();
+  size_t height = out.getHeight();
 
   /// SPARSE_CSC, {a any, b not trans}
   if (out.getFormat() == SPARSE_CSC) {
     /// b not trans and a any
-    CHECK(!b.isTransposed());
-    size_t m = !a.isTransposed() ? a.getWidth() : a.getHeight();
+    CHECK(!bTrans);
+    size_t m = !aTrans ? a.getWidth() : a.getHeight();
     for (size_t i = 0; i < width; i++) {
       size_t start = out.getColStartIdx(i);
       size_t end = out.getColStartIdx(i + 1);
@@ -82,9 +74,8 @@ void MulOp(CpuSparseMatrix& out,
         real sum = 0;
         size_t rowIdx = rows[j];
         for (size_t k = 0; k < m; k++) {
-          sum +=
-              (!a.isTransposed() ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
-              B[k * width + i];
+          sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
+                 B[k * width + i];
         }
         C[j] = scaleAB * sum + scaleT * C[j];
       }
@@ -95,7 +86,7 @@ void MulOp(CpuSparseMatrix& out,
   /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans}
   if (out.getFormat() == SPARSE_CSR) {
     /// a and b can not both transpose
-    CHECK(!(a.isTransposed() && b.isTransposed()));
+    CHECK(!(aTrans && bTrans));
     size_t m = a.getWidth();
     for (size_t i = 0; i < height; i++) {
       size_t start = out.getRowStartIdx(i);
@@ -104,9 +95,8 @@ void MulOp(CpuSparseMatrix& out,
         real sum = 0;
         size_t colIdx = cols[j];
         for (size_t k = 0; k < m; k++) {
-          sum +=
-              (!a.isTransposed() ? A[i * m + k] : A[k * height + i]) *
-              (!b.isTransposed() ? B[k * width + colIdx] : B[colIdx * m + k]);
+          sum += (!aTrans ? A[i * m + k] : A[k * height + i]) *
+                 (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]);
         }
         C[j] = scaleAB * sum + scaleT * C[j];
       }
@@ -120,25 +110,15 @@ void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT) {
-  CHECK(!out.isTransposed()) << "out matrix transpose not supported";
-  CBLAS_TRANSPOSE aTrans = a.isTransposed() ? CblasTrans : CblasNoTrans;
-  size_t aRow = a.isTransposed() ? a.getWidth() : a.getHeight();
-  size_t aCol = a.isTransposed() ? a.getHeight() : a.getWidth();
-  CBLAS_TRANSPOSE bTrans = b.isTransposed() ? CblasTrans : CblasNoTrans;
-  size_t bRow = b.isTransposed() ? b.getWidth() : b.getHeight();
-  size_t bCol = b.isTransposed() ? b.getHeight() : b.getWidth();
-
-  /// C = A * B, for matrix format
-  CHECK_EQ(aCol, bRow);
-  CHECK_EQ(aRow, out.getHeight());
-  CHECK_EQ(bCol, out.getWidth());
-
-  GEMM(aTrans,
-       bTrans,
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
+  GEMM(aTrans ? CblasTrans : CblasNoTrans,
+       bTrans ? CblasTrans : CblasNoTrans,
        out.getHeight(),
       out.getWidth(),
-       aCol,
+       !aTrans ? a.getWidth() : a.getHeight(),
        scaleAB,
        a.getData(),
        a.getStride(),
@@ -154,21 +134,12 @@ void MulOp(CpuMatrix& out,
           const CpuSparseMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT) {
-  CHECK(!out.isTransposed()) << "Not supported";
-  CHECK(!b.isTransposed()) << "Not supported";
-  CHECK(scaleT == 0 || scaleT == 1) << "Not support";
-  CHECK_EQ(scaleAB, static_cast<real>(1.0)) << "Not supported";
-  CHECK_EQ(a.getFormat(), SPARSE_CSR) << "Not supported";
-
-  if (!a.isTransposed()) {
-    CHECK(b.getHeight() == a.getWidth() && a.getHeight() == out.getHeight() &&
-          b.getWidth() == out.getWidth());
-  } else {
-    CHECK(b.getHeight() == a.getHeight() && a.getWidth() == out.getHeight() &&
-          b.getWidth() == out.getWidth());
-  }
-
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
+  CHECK_EQ(a.getFormat(), SPARSE_CSR)
+      << "only the SPARSE_CSR format is supported for a";
   if (scaleT == 0) {
     out.zeroMem();
   }
@@ -185,9 +156,9 @@ void MulOp(CpuMatrix& out,
     const int start = a.getRowStartIdx(i);
     const int end = a.getRowStartIdx(i + 1);
     for (int j = start; j < end; ++j) {
-      vecAddTo(!a.isTransposed() ? out.getRow(i) : out.getRow(cols[j]),
-               !a.isTransposed() ? const_cast<CpuMatrix&>(b).getRow(cols[j])
-                                 : const_cast<CpuMatrix&>(b).getRow(i),
+      vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]),
+               !aTrans ? const_cast<CpuMatrix&>(b).getRow(cols[j])
+                       : const_cast<CpuMatrix&>(b).getRow(i),
               (a.getValueType() == FLOAT_VALUE) ? values[j] : (real)1.0,
               out.getWidth());
    }
@@ -199,19 +170,10 @@ void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuSparseMatrix& b,
           real scaleAB,
-          real scaleT) {
-  CHECK(!out.trans_) << "Not supported";
-  CHECK(!a.isTransposed()) << "Not supported";
-  CHECK(scaleT == 0 || scaleT == 1);
-  CHECK_EQ(scaleAB, static_cast<real>(1.0));
-  if (!b.isTransposed()) {  /// b is not Transpose
-    CHECK(b.getHeight() == a.getWidth() && a.getHeight() == out.getHeight() &&
-          b.getWidth() == out.getWidth());
-  } else {
-    CHECK(b.getHeight() == out.getWidth() && a.getHeight() == out.getHeight() &&
-          b.getWidth() == a.getWidth());
-  }
-
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
   if (scaleT == 0) {
     out.zeroMem();
   }
@@ -227,8 +189,8 @@ void MulOp(CpuMatrix& out,
       int start = b.getColStartIdx(j);
       int end = b.getColStartIdx(j + 1);
       for (int i = start; i < end; ++i) {
-        colVecAddTo(!b.isTransposed() ? C + j : C + rows[i],
-                    !b.isTransposed() ? A + rows[i] : A + j,
+        colVecAddTo(!bTrans ? C + j : C + rows[i],
+                    !bTrans ? A + rows[i] : A + j,
                     (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
                     out.getHeight(),
                     out.getWidth(),
@@ -244,8 +206,8 @@ void MulOp(CpuMatrix& out,
       int start = b.getRowStartIdx(j);
       int end = b.getRowStartIdx(j + 1);
       for (int i = start; i < end; ++i) {
-        colVecAddTo(!b.isTransposed() ? C + cols[i] : C + j,
-                    !b.isTransposed() ? A + j : A + cols[i],
+        colVecAddTo(!bTrans ? C + cols[i] : C + j,
+                    !bTrans ? A + j : A + cols[i],
                     (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
                    out.getHeight(),
                    out.getWidth(),
@@ -270,16 +232,43 @@ public:
   void init(const FuncConfig& config) override {
     alpha_ = config.get<real>("scaleAB");
     beta_ = config.get<real>("scaleT");
+    aTrans_ = config.get<bool>("aTrans");
+    bTrans_ = config.get<bool>("bTrans");
+    cTrans_ = config.get<bool>("cTrans");
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK(!cTrans_) << "output matrix should not be transposed";
+    CHECK(!aTrans_ || !bTrans_)
+        << "transposing both a and b is not supported";
+
     CHECK_EQ((size_t)2, inputs.size());
     CHECK_EQ((size_t)1, outputs.size());
     CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
     CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
     CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
     CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1];
+    size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0];
+    size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1];
+    size_t bCol = !bTrans_ ? inputs[1].shape()[1] : inputs[1].shape()[0];
+    /// C = A * B, or C += A * B, for matrix format
+    CHECK_EQ(aCol, bRow);
+    CHECK_EQ(aRow, outputs[0].shape()[0]);
+    CHECK_EQ(bCol, outputs[0].shape()[1]);
+
+    /// only support C = A * B or C += A * B
+    CHECK_EQ(alpha_, static_cast<real>(1.0));
+    CHECK((beta_ == 0 && outputs[0].getArgType() == ASSIGN_TO) ||
+          (beta_ == 1 && outputs[0].getArgType() == ADD_TO));
+
+    /// dense output requires that a and b are not both sparse;
+    /// sparse output requires that both a and b are dense
+    CHECK((!outputs[0].isSparseArg() &&
+           !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) ||
+          (outputs[0].isSparseArg() && !inputs[0].isSparseArg() &&
+           !inputs[1].isSparseArg()));
 
     auto outMat = outputs[0].matrix<Device>();
     /// matrix = matrix * matrix
@@ -289,29 +278,40 @@ public:
                     inputs[0].matrix<Device>(),
                     inputs[1].matrix<Device>(),
                     alpha_,
-                    beta_);
+                    beta_,
+                    aTrans_,
+                    bTrans_,
+                    cTrans_);
       return;
     }
 
     /// matrix = matrix * sparse matrix
     if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() &&
         !outputs[0].isSparseArg()) {
+      CHECK(!aTrans_) << "transpose of a is not supported";
      MulOp<Device>(outMat,
                    inputs[0].matrix<Device>(),
                    inputs[1].sparse().SparseMatrix<Device>(),
                    alpha_,
-                    beta_);
+                    beta_,
+                    aTrans_,
+                    bTrans_,
+                    cTrans_);
       return;
     }
 
     /// matrix = sparse matrix * matrix
     if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
         !outputs[0].isSparseArg()) {
+      CHECK(!bTrans_) << "transpose of b is not supported";
      MulOp<Device>(outMat,
                    inputs[0].sparse().SparseMatrix<Device>(),
                    inputs[1].matrix<Device>(),
                    alpha_,
-                    beta_);
+                    beta_,
+                    aTrans_,
+                    bTrans_,
+                    cTrans_);
       return;
     }
@@ -319,18 +319,14 @@ public:
     auto outSparseMat = outputs[0].sparse().SparseMatrix<Device>();
     if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
         outputs[0].isSparseArg()) {
-      /*
-      LOG(INFO) << "input0";
-      inputs[0].matrix().print(std::cout);
-      LOG(INFO) << "input1";
-      inputs[1].matrix().print(std::cout);
-      LOG(INFO) << "output sparse matrix";
-      outSparseMat.print(std::cout); */
      MulOp<Device>(outSparseMat,
                    inputs[0].matrix<Device>(),
                    inputs[1].matrix<Device>(),
                    alpha_,
-                    beta_);
+                    beta_,
+                    aTrans_,
+                    bTrans_,
+                    cTrans_);
       return;
     }
   }
@@ -338,6 +334,9 @@ public:
 private:
   real alpha_;
   real beta_;
+  bool aTrans_;
+  bool bTrans_;
+  bool cTrans_;
 };
 
 REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
diff --git a/paddle/function/MulOp.h b/paddle/function/MulOp.h
index fda5b092498..a7703482255 100644
--- a/paddle/function/MulOp.h
+++ b/paddle/function/MulOp.h
@@ -26,55 +26,79 @@ void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(CpuMatrix& out,
           const CpuSparseMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuSparseMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(CpuSparseMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(GpuMatrix& out,
           const GpuSparseMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuSparseMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 template <DeviceType DType>
 void MulOp(GpuSparseMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT);
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans);
 
 }  // namespace paddle
diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu
index 09d2a764911..94bee72034f 100644
--- a/paddle/function/MulOpGpu.cu
+++ b/paddle/function/MulOpGpu.cu
@@ -27,38 +27,22 @@ void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT) {
-  CHECK(!out.isTransposed()) << "Transpose not supported for out matrix";
-  if (!a.isTransposed() && !b.isTransposed()) {
-    /// a : M * K, b: K * N
-    CHECK(out.getWidth() == b.getWidth() &&
-          out.getHeight() == a.getHeight() &&
-          a.getWidth() == b.getHeight());
-  } else if (a.isTransposed() && !b.isTransposed()) {
-    /// a : K * M, b : K * N
-    CHECK(out.getWidth() == b.getWidth() &&
-          out.getHeight() == a.getWidth() &&
-          a.getHeight() == b.getHeight());
-  } else if (!a.isTransposed() && b.isTransposed()) {
-    /// a: M * K, b : N * K
-    CHECK(out.getWidth() == b.getHeight() &&
-          out.getHeight() == a.getHeight() &&
-          a.getWidth() == b.getWidth());
-  } else {
-    LOG(FATAL) << "Not support for both a and b are Transposed Matrices";
-  }
-
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
   real* aData = const_cast<real*>(a.getData());
   real* bData = const_cast<real*>(b.getData());
   real* outData = const_cast<real*>(out.getData());
   hl_matrix_mul(aData,
-                !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T,
+                !aTrans ? HPPL_OP_N : HPPL_OP_T,
                 bData,
-                !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T,
+                !bTrans ? HPPL_OP_N : HPPL_OP_T,
                 outData,
                 out.getHeight(),
                 out.getWidth(),
-                !a.isTransposed() ? a.getWidth() : a.getHeight(),
+                !aTrans ? a.getWidth() : a.getHeight(),
                 scaleAB,
                 scaleT,
                 a.getStride(),
@@ -75,27 +59,19 @@ void MulOp(GpuMatrix& out,
           const GpuSparseMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT) {
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
   CHECK(out.isContiguous());
   CHECK(b.isContiguous());
-  CHECK(b.useGpu_) << "Matrix type are not equal";
-  CHECK(!out.isTransposed() && !b.isTransposed()) << "not supported";
-  if (!a.isTransposed()) {
-    /// a: M * K, b: K * N
-    CHECK(out.getWidth() == b.getWidth() && out.getHeight() == a.getHeight()
-          && a.getWidth() == b.getHeight()) << "Matrix dimensions are not equal";
-  } else {
-    /// a: K * M, transpose, b: K * N
-    CHECK(out.getWidth() == b.getWidth() && out.getHeight() == a.getWidth()
-          && a.getHeight() == b.getHeight()) << "Matrix dimensions are not equal";
-  }
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
 
-  hl_trans_op_t aTrans = a.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
   hl_sparse_matrix_s aData = a.sMatrix_.get();
   real* bData = const_cast<real*>(b.getData());
   real* outData = const_cast<real*>(out.getData());
   hl_matrix_csr_mul_dense(aData,
-                          aTrans,
+                          aTrans ? HPPL_OP_T : HPPL_OP_N,
                           bData,
                           HPPL_OP_N,
                           outData,
@@ -115,25 +91,14 @@ void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuSparseMatrix& b,
           real scaleAB,
-          real scaleT) {
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
   CHECK(out.isContiguous());
   CHECK(a.isContiguous());
-  CHECK(a.useGpu_) << "Matrix type are not equal";
-  if (!b.isTransposed()) {
-    /// a : M * K, b : K * N
-    CHECK(out.getWidth() == b.getWidth() &&
-          out.getHeight() == a.getHeight() &&
-          a.getWidth() == b.getHeight())
-        << "Matrix dimensions are not equal";
-  } else {
-    /// a : M * K, b : N * K, transpose
-    CHECK(out.getWidth() == b.getHeight() &&
-          out.getHeight() == a.getHeight() &&
-          a.getWidth() == b.getWidth())
-        << "Matrix dimensions are not equal";
-  }
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
 
-  hl_trans_op_t bTrans = b.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
   hl_sparse_matrix_s bData = b.sMatrix_.get();
   real* aData = const_cast<real*>(a.getData());
   real* outData = const_cast<real*>(out.getData());
@@ -142,7 +107,7 @@ void MulOp(GpuMatrix& out,
     hl_matrix_dense_mul_csc(aData,
                             HPPL_OP_N,
                             bData,
-                            bTrans,
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
                             outData,
                             out.getHeight(),
                             out.getWidth(),
@@ -153,7 +118,7 @@ void MulOp(GpuMatrix& out,
     hl_matrix_dense_mul_csr(aData,
                             HPPL_OP_N,
                             bData,
-                            bTrans,
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
                             outData,
                             out.getHeight(),
                             out.getWidth(),
@@ -168,35 +133,26 @@ void MulOp(GpuSparseMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
-          real scaleT) {
+          real scaleT,
+          bool aTrans,
+          bool bTrans,
+          bool cTrans) {
   CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  CHECK(!out.isTransposed()) << "Transpose is not supported for out matrix";
-
-  if (!a.isTransposed() && !b.isTransposed()) {
-    CHECK(out.getHeight() == a.getHeight() &&
-          out.getWidth() == b.getWidth() &&
-          a.getWidth() == b.getHeight());
-  } else if (a.isTransposed() && !b.isTransposed()) {
-    CHECK(out.getHeight() == a.getWidth() &&
-          out.getWidth() == b.getWidth() &&
-          a.getHeight() == b.getHeight());
-  } else if (!a.isTransposed() && b.isTransposed()) {
-    CHECK(out.getHeight() == a.getHeight() &&
-          out.getWidth() == b.getHeight() &&
-          a.getWidth() == b.getWidth());
-  } else {
-    LOG(FATAL) << "Not support for both a and b are Transposed Matrices";
-  }
-
-  hl_trans_op_t aTrans = a.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
-  hl_trans_op_t bTrans = b.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
-  int dimK = !b.isTransposed() ? b.getHeight() : b.getWidth();
 
   real* aData = const_cast<real*>(a.getData());
   real* bData = const_cast<real*>(b.getData());
   hl_sparse_matrix_s outData = out.sMatrix_.get();
-  hl_sparse_matrix_mul(aData, aTrans, bData, bTrans, outData,
-                       out.getHeight(), out.getWidth(), dimK, scaleAB, scaleT);
+  hl_sparse_matrix_mul(aData,
+                       aTrans ? HPPL_OP_T : HPPL_OP_N,
+                       bData,
+                       bTrans ? HPPL_OP_T : HPPL_OP_N,
+                       outData,
+                       out.getHeight(),
+                       out.getWidth(),
+                       !bTrans ? b.getHeight() : b.getWidth(),
+                       scaleAB,
+                       scaleT);
 }
 
 }  // namespace paddle
diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp
index 05460c80970..f67fa41612c 100644
--- a/paddle/function/MulOpTest.cpp
+++ b/paddle/function/MulOpTest.cpp
@@ -39,18 +39,21 @@ void testFuncDDDMatrix(
   size_t widthC = dimN;
   // init Test object
   FunctionCompare test("MulOp",
-                       FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
+                       FuncConfig()
+                           .set("scaleAB", alpha)
+                           .set("scaleT", beta)
+                           .set("aTrans", transa)
+                           .set("bTrans", transb)
+                           .set("cTrans", false));
   // prepare input arguments
   /// matrix A : HA * WA
-  test.addInputs(BufferArg(
-      VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}, UNSPECIFIED, transa));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}));
   /// matrix B: HB * WB
-  test.addInputs(BufferArg(
-      VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}, UNSPECIFIED, transb));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}));
   /// output matrix C: HC * WC
   test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}),
-                  ADD_TO);
+                  beta == 1.0 ? ADD_TO : ASSIGN_TO);
   // run Function
   test.run();
 }
@@ -88,21 +91,22 @@ void testFuncDSparseDMatrix(
   real beta = 1.0;
   // init Test object
   FunctionCompare test("MulOp",
-                       FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
+                       FuncConfig()
+                           .set("scaleAB", alpha)
+                           .set("scaleT", beta)
+                           .set("aTrans", false)
+                           .set("bTrans", false)
+                           .set("cTrans", false));
   // prepare input arguments
   /// sparse matrix A : M * K
-  test.addInputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
-                                 TensorShape{dimM, dimK},
-                                 nnz,
-                                 FORMAT,
-                                 FLOAT_VALUE,
-                                 UNSPECIFIED,
-                                 false));
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
   /// matrix B: K * N
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
   /// output matrix C: M * N
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  beta == 1.0 ? ADD_TO : ASSIGN_TO);
   // run Function
   test.run();
 }
@@ -138,22 +142,23 @@ void testFuncDDSparseMatrix(
   real beta = 1.0;
   // init Test object
   FunctionCompare test("MulOp",
-                       FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
+                       FuncConfig()
+                           .set("scaleAB", alpha)
+                           .set("scaleT", beta)
+                           .set("aTrans", false)
+                           .set("bTrans", false)
+                           .set("cTrans", false));
   // prepare input arguments
   /// matrix A : M * K
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
   /// matrix B: K * N
-  test.addInputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
-                                 TensorShape{dimK, dimN},
-                                 nnz,
-                                 FORMAT,
-                                 FLOAT_VALUE,
-                                 UNSPECIFIED,
-                                 false));
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
   /// output matrix C: M * N
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  beta == 1.0 ? ADD_TO : ASSIGN_TO);
   // run Function
   test.run();
 }
@@ -189,7 +194,12 @@ void testFuncSparseDDMatrix(
   real beta = 1.0;
   // init Test object
   FunctionCompare test("MulOp",
-                       FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
+                       FuncConfig()
+                           .set("scaleAB", alpha)
+                           .set("scaleT", beta)
+                           .set("aTrans", false)
+                           .set("bTrans", false)
+                           .set("cTrans", false));
   // prepare input arguments
   /// matrix A : M * K
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
@@ -198,14 +208,10 @@ void testFuncSparseDDMatrix(
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
 
   /// output sparse matrix C: M * N
-  test.addOutputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
-                                  TensorShape{dimM, dimN},
-                                  nnz,
-                                  FORMAT,
-                                  FLOAT_VALUE,
-                                  UNSPECIFIED,
-                                  false),
-                  ADD_TO);
+  test.addOutputs(
+      SparseMatrixArg(
+          VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
+      beta == 1.0 ? ADD_TO : ASSIGN_TO);
   // run Function
   test.run();
 }
-- 
GitLab
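
For context, the sketch below shows how a caller drives MulOp after this change: the transpose flags now travel in FuncConfig rather than in BufferArg, and the output ArgType must agree with scaleT (ASSIGN_TO when scaleT == 0, ADD_TO when scaleT == 1), as enforced in MulFunc::calc(). This is a minimal sketch modeled on testFuncDDDMatrix in MulOpTest.cpp; the helper name and shapes are illustrative, not part of the patch.

#include "paddle/function/FunctionTest.h"

namespace paddle {

// Hypothetical helper: computes C = A^T * B through the "MulOp" Function,
// comparing the CPU and GPU implementations via FunctionCompare.
void exampleTransposedMul() {
  const size_t M = 4, K = 6, N = 8;
  FunctionCompare test("MulOp",
                       FuncConfig()
                           .set("scaleAB", (real)1.0)
                           .set("scaleT", (real)0.0)  // C = A * B, no accumulation
                           .set("aTrans", true)       // A is supplied as K x M
                           .set("bTrans", false)
                           .set("cTrans", false));
  /// matrix A : K * M (used transposed)
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{K, M}));
  /// matrix B : K * N
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{K, N}));
  /// output C : M * N; scaleT == 0 pairs with ASSIGN_TO per MulFunc::calc()
  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{M, N}), ASSIGN_TO);
  test.run();
}

}  // namespace paddle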