diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp index 85f7f535dcfa4262f4a4ef62213deaf23858d5cc..37f8808605e10c7c0e6f88f6fec7b5f20697fbaf 100644 --- a/paddle/function/MulOp.cpp +++ b/paddle/function/MulOp.cpp @@ -498,15 +498,10 @@ public: CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); CHECK_EQ(outputs[0].getArgType(), ADD_TO); - /// todo(tianbing), support SparseMatrixArg for out_mat auto out_mat = outputs[0].matrix(); - LOG(INFO) << "out_mat:"; - out_mat.print(std::cout); - if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg()) { - LOG(INFO) << "in1_mat:"; - inputs[0].matrix().print(std::cout); - LOG(INFO) << "in2_mat:"; - inputs[1].matrix().print(std::cout); + /// matrix = matrix * matrix + if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { MulOp(out_mat, inputs[0].matrix(), inputs[1].matrix(), @@ -515,11 +510,9 @@ public: return; } - if (!inputs[0].isSparseArg() && inputs[1].isSparseArg()) { - LOG(INFO) << "in1_mat:"; - inputs[0].matrix().print(std::cout); - LOG(INFO) << "in2_mat:"; - inputs[1].sparse().SparseMatrix().print(std::cout); + /// matrix = matrix * sparse matrix + if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { MulOp(out_mat, inputs[0].matrix(), inputs[1].sparse().SparseMatrix(), @@ -528,11 +521,9 @@ public: return; } - if (inputs[0].isSparseArg() && !inputs[1].isSparseArg()) { - LOG(INFO) << "in1_mat:"; - inputs[0].sparse().SparseMatrix().print(std::cout); - LOG(INFO) << "in2_mat:"; - inputs[1].matrix().print(std::cout); + /// matrix = sparse matrix * matrix + if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { MulOp(out_mat, inputs[0].sparse().SparseMatrix(), inputs[1].matrix(), @@ -540,6 +531,18 @@ public: beta_); return; } + + /// sparse matrix = matrix * matrix + auto out_sparse_mat = outputs[0].sparse().SparseMatrix(); + if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + 
outputs[0].isSparseArg()) { + MulOp(out_sparse_mat, + inputs[0].matrix(), + inputs[1].matrix(), + alpha_, + beta_); + return; + } } private: diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu index 3691c7f3206126e5d40fec915edc5bad80487b14..3c4654b9b27574fd146d27e66164254f5e40da7d 100644 --- a/paddle/function/MulOpGpu.cu +++ b/paddle/function/MulOpGpu.cu @@ -176,7 +176,36 @@ void MulOp(GpuSparseMatrix& out, const GpuMatrix& b, real scale_ab, real scale_t) { -/// todo(tianbing), implement it + /// todo(tianbing), clean the code + CHECK(a.useGpu_ && b.useGpu_) << "type not match"; + CHECK(!out.trans_) << "trans not supported"; + real* a_data = const_cast<real*>(a.getData()); + real* b_data = const_cast<real*>(b.getData()); + hl_sparse_matrix_s out_data = out.sMatrix_.get(); + hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N; + hl_trans_op_t b_trans = b.trans_ ? HPPL_OP_T : HPPL_OP_N; + + if (!a.trans_ && !b.trans_) { + CHECK(out.height_ == a.getHeight()); + CHECK(out.width_ == b.getWidth()); + CHECK(a.getWidth() == b.getHeight()); + } else if (a.trans_ && !b.trans_) { + CHECK(out.height_ == a.getWidth()); + CHECK(out.width_ == b.getWidth()); + CHECK(a.getHeight() == b.getHeight()); + } else if (!a.trans_ && b.trans_) { + CHECK(out.height_ == a.getHeight()); + CHECK(out.width_ == b.getHeight()); + CHECK(a.getWidth() == b.getWidth()); + } else { + LOG(INFO) << "Not support"; + } + int dim_m = out.height_; + int dim_n = out.width_; + int dim_k = !b.trans_ ? b.getHeight() : b.getWidth(); + hl_sparse_matrix_mul( + a_data, a_trans, b_data, b_trans, out_data, + dim_m, dim_n, dim_k, scale_ab, scale_t); } } // namespace paddle diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp index fd02504678efd62912788a78b017ffb6d6530f78..630070b845a9af7aff734ea3e8ff9a7cf62fd7d3 100644 --- a/paddle/function/MulOpTest.cpp +++ b/paddle/function/MulOpTest.cpp @@ -24,9 +24,10 @@ limitations under the License. 
*/ using namespace paddle; // NOLINT /** - * C = alpha * C + beta * (A * B) + * C = alpha * C + beta * (A * B), A, B, C dense matrix + * dense = dense * dense */ -void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { +void testDDDMatrix(bool transa, bool transb, int dimM, int dimN, int dimK) { real alpha = 1.5; real beta = 2.0; @@ -73,7 +74,7 @@ void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { autotest::TensorCheckErr(*cpuC, *gpuC); } -TEST(Matrix, mul) { +TEST(Matrix, DDDMul) { LOG(INFO) << "test for dense = dense * dense matrix"; for (auto transa : {false, true}) { for (auto transb : {false, true}) { @@ -89,7 +90,7 @@ TEST(Matrix, mul) { << " dimN=" << std::setw(5) << dimN << " dimK=" << std::setw(5) << dimK; - testMatrixMul(transa, transb, dimM, dimN, dimK); + testDDDMatrix(transa, transb, dimM, dimN, dimK); } } } @@ -97,19 +98,100 @@ TEST(Matrix, mul) { } } -struct MatrixPara { - size_t height; - size_t width; - bool trans; - bool sparse; - size_t nnz; - SparseFormat format; -}; +/** + * C += A * B, B, C dense, A sparse + * dense = sparse * dense + */ +void testDSparseDMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { + real alpha = 1.0; + real beta = 1.0; + const auto cpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-CPU"); + cpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); + const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU"); + gpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); + + CpuSparseMatrix cpuMatrixA(dimM, dimK, nnz, FLOAT_VALUE, FORMAT, false); + GpuSparseMatrix gpuMatrixA(dimM, dimK, nnz, FLOAT_VALUE, FORMAT, false); + CpuMatrix cpuDenseA(dimM, dimK, false); + + auto cpuMatrixB = Matrix::create(dimK, dimN, false, false); + auto gpuMatrixB = Matrix::create(dimK, dimN, false, true); + auto cpuDenseB = Matrix::create(dimK, dimN, false, false); + + auto cpuMatrixC = Matrix::create(dimM, dimN, 
false, false); + auto gpuMatrixC = Matrix::create(dimM, dimN, false, true); + auto cpuDenseC = Matrix::create(dimM, dimN, false, false); + + /*matrix init*/ + hl_stream_t stream(HPPL_STREAM_1); + cpuMatrixA.randomizeUniform(); + cpuMatrixB->randomizeUniform(); + cpuMatrixC->randomizeUniform(); + + gpuMatrixA.copyFrom(cpuMatrixA, stream); + gpuMatrixB->copyFrom(*cpuMatrixB, stream); + gpuMatrixC->copyFrom(*cpuMatrixC, stream); + + cpuDenseA.copyFrom(cpuMatrixA); + cpuDenseB->copyFrom(*cpuMatrixB); + cpuDenseC->copyFrom(*cpuMatrixC); + hl_stream_synchronize(stream); + + /*matrix mul*/ + BufferArgs cpuInputs; + BufferArgs cpuOutputs; + cpuInputs.addArg(cpuMatrixA); + cpuInputs.addArg(*cpuMatrixB); + cpuOutputs.addArg(*cpuMatrixC, ADD_TO); + cpuFunc->calc(cpuInputs, cpuOutputs); + + BufferArgs gpuInputs; + BufferArgs gpuOutputs; + gpuInputs.addArg(gpuMatrixA); + gpuInputs.addArg(*gpuMatrixB); + gpuOutputs.addArg(*gpuMatrixC, ADD_TO); + gpuFunc->calc(gpuInputs, gpuOutputs); + + BufferArgs denseInputs; + BufferArgs denseOutputs; + denseInputs.addArg(cpuDenseA); + denseInputs.addArg(*cpuDenseB); + denseOutputs.addArg(*cpuDenseC, ADD_TO); + cpuFunc->calc(denseInputs, denseOutputs); + + /*check result*/ + autotest::TensorCheckErr(*cpuMatrixC, *cpuDenseC); + autotest::TensorCheckErr(*cpuMatrixC, *gpuMatrixC); +} + +TEST(Matrix, DSparseDMul) { + LOG(INFO) << "test for dense = sparse * dense matrix"; + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto FORMAT : {SPARSE_CSR}) { + VLOG(3) << setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } +} /** * C += A * B, A, C dense, B sparse + * dense = dense * 
sparse */ -void testDSparseDMatrix() { +void testDDSparseMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real alpha = 1.0; real beta = 1.0; const auto cpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-CPU"); @@ -117,46 +199,19 @@ void testDSparseDMatrix() { const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU"); gpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); - constexpr size_t dimM = 2; - constexpr size_t dimN = 2; - constexpr size_t dimK = 3; - constexpr size_t NNZ = 3; - constexpr SparseFormat FORMAT = SPARSE_CSC; - - MatrixPara paraA{dimM, dimK, /*trans*/ false, /*sparse*/ false, NNZ, FORMAT}; - MatrixPara paraB{dimK, dimN, /*trans*/ false, /*sparse*/ true, NNZ, FORMAT}; - MatrixPara paraC{dimM, dimN, /*trans*/ false, /*sparse*/ false, NNZ, FORMAT}; - - auto cpuMatrixA = - Matrix::create(paraA.height, paraA.width, paraA.trans, false); - auto gpuMatrixA = - Matrix::create(paraA.height, paraA.width, paraA.trans, true); - auto cpuDenseA = - Matrix::create(paraA.height, paraA.width, paraA.trans, false); - CpuSparseMatrix cpuMatrixB(paraB.height, - paraB.width, - paraB.nnz, - FLOAT_VALUE, - paraB.format, - paraB.trans); - - GpuSparseMatrix gpuMatrixB(paraB.height, - paraB.width, - paraB.nnz, - FLOAT_VALUE, - paraB.format, - paraB.trans); - - auto cpuDenseB = - Matrix::create(paraB.height, paraB.width, paraB.trans, false); - auto cpuMatrixC = - Matrix::create(paraC.height, paraC.width, paraC.trans, false); - auto gpuMatrixC = - Matrix::create(paraC.height, paraC.width, paraC.trans, true); - auto cpuDenseC = - Matrix::create(paraC.height, paraC.width, paraC.trans, false); - auto gpuMatrixC_d2h = - Matrix::create(paraC.height, paraC.width, paraC.trans, false); + auto cpuMatrixA = Matrix::create(dimM, dimK, false, false); + auto gpuMatrixA = Matrix::create(dimM, dimK, false, true); + auto cpuDenseA = Matrix::create(dimM, dimK, false, false); + + CpuSparseMatrix cpuMatrixB(dimK, 
dimN, nnz, FLOAT_VALUE, FORMAT, false); + + GpuSparseMatrix gpuMatrixB(dimK, dimN, nnz, FLOAT_VALUE, FORMAT, false); + + auto cpuDenseB = Matrix::create(dimK, dimN, false, false); + auto cpuMatrixC = Matrix::create(dimM, dimN, false, false); + auto gpuMatrixC = Matrix::create(dimM, dimN, false, true); + auto cpuDenseC = Matrix::create(dimM, dimN, false, false); + /*matrix init*/ hl_stream_t stream(HPPL_STREAM_1); cpuMatrixA->randomizeUniform(); @@ -172,27 +227,6 @@ void testDSparseDMatrix() { cpuDenseC->copyFrom(*cpuMatrixC); hl_stream_synchronize(stream); - LOG(INFO) << "cpuMatrixA: "; - cpuMatrixA->print(std::cout); - LOG(INFO) << "cpuMatrixB: "; - (&cpuMatrixB)->print(std::cout); - LOG(INFO) << "cpuMatrixC: "; - cpuMatrixC->print(std::cout); - - LOG(INFO) << "cpuDenseA: "; - cpuDenseA->print(std::cout); - LOG(INFO) << "cpuDenseB: "; - cpuDenseB->print(std::cout); - LOG(INFO) << "cpuDenseC: "; - cpuDenseC->print(std::cout); - - LOG(INFO) << "gpuMatrixA: "; - gpuMatrixA->print(std::cout); - LOG(INFO) << "gpuMatrixB: "; - (&gpuMatrixB)->print(std::cout); - LOG(INFO) << "gpuMatrixC: "; - gpuMatrixC->print(std::cout); - /*matrix mul*/ BufferArgs cpuInputs; BufferArgs cpuOutputs; @@ -215,15 +249,120 @@ void testDSparseDMatrix() { denseOutputs.addArg(*cpuDenseC, ADD_TO); cpuFunc->calc(denseInputs, denseOutputs); - gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream); - hl_stream_synchronize(stream); /*check result*/ - // autotest::TensorCheckErr(*cpuMatrixC, *gpuMatrixC); - checkMatrixEqual(cpuMatrixC, cpuDenseC); - checkMatrixEqual(cpuMatrixC, gpuMatrixC_d2h); + autotest::TensorCheckErr(*cpuMatrixC, *cpuDenseC); + autotest::TensorCheckErr(*cpuMatrixC, *gpuMatrixC); } -TEST(Matrix, SparseMatrixMul) { +TEST(Matrix, DDSparseMul) { LOG(INFO) << "test for dense = dense * sparse matrix"; - testDSparseDMatrix(); + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto 
FORMAT : {SPARSE_CSR, SPARSE_CSC}) { + VLOG(3) << setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } +} + +/** + * C += A * B, C sparse, A, B dense + * sparse = dense * dense + */ +void testSparseDDMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { + real alpha = 1.0; + real beta = 1.0; + const auto cpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-CPU"); + cpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); + const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU"); + gpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); + + auto cpuMatrixA = Matrix::create(dimM, dimK, false, false); + auto gpuMatrixA = Matrix::create(dimM, dimK, false, true); + auto cpuDenseA = Matrix::create(dimM, dimK, false, false); + + auto cpuMatrixB = Matrix::create(dimK, dimN, false, false); + auto gpuMatrixB = Matrix::create(dimK, dimN, false, true); + auto cpuDenseB = Matrix::create(dimK, dimN, false, false); + + CpuSparseMatrix cpuMatrixC(dimM, dimN, nnz, FLOAT_VALUE, FORMAT, false); + CpuSparseMatrix gpuMatrixC_d2h(dimM, dimN, nnz, FLOAT_VALUE, FORMAT, false); + GpuSparseMatrix gpuMatrixC(dimM, dimN, nnz, FLOAT_VALUE, FORMAT, false); + CpuMatrix cpuDenseC(dimM, dimN, false); + + /*matrix init*/ + hl_stream_t stream(HPPL_STREAM_1); + cpuMatrixA->randomizeUniform(); + cpuMatrixB->randomizeUniform(); + cpuMatrixC.randomizeUniform(); + + gpuMatrixA->copyFrom(*cpuMatrixA, stream); + gpuMatrixB->copyFrom(*cpuMatrixB, stream); + gpuMatrixC.copyFrom(cpuMatrixC, stream); + + cpuDenseA->copyFrom(*cpuMatrixA); + cpuDenseB->copyFrom(*cpuMatrixB); + cpuDenseC.copyFrom(cpuMatrixC); + hl_stream_synchronize(stream); + + /*matrix mul*/ + BufferArgs 
cpuInputs; + BufferArgs cpuOutputs; + cpuInputs.addArg(*cpuMatrixA); + cpuInputs.addArg(*cpuMatrixB); + cpuOutputs.addArg(cpuMatrixC, ADD_TO); + cpuFunc->calc(cpuInputs, cpuOutputs); + + BufferArgs gpuInputs; + BufferArgs gpuOutputs; + gpuInputs.addArg(*gpuMatrixA); + gpuInputs.addArg(*gpuMatrixB); + gpuOutputs.addArg(gpuMatrixC, ADD_TO); + gpuFunc->calc(gpuInputs, gpuOutputs); + + BufferArgs denseInputs; + BufferArgs denseOutputs; + denseInputs.addArg(*cpuDenseA); + denseInputs.addArg(*cpuDenseB); + denseOutputs.addArg(cpuDenseC, ADD_TO); + cpuFunc->calc(denseInputs, denseOutputs); + + gpuMatrixC_d2h.copyFrom(gpuMatrixC, stream); + hl_stream_synchronize(stream); + + /*check result*/ + checkSMatrixEqual(cpuMatrixC, gpuMatrixC_d2h); + checkSMatrixEqual2Dense(cpuMatrixC, cpuDenseC); +} + +TEST(Matrix, SparseDDMul) { + LOG(INFO) << "test for sparse = dense * dense matrix"; + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) { + VLOG(3) << setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } } diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp index 720a035ecbd26df01fe24c991982bbf7965ccbdc..3bae6d373f240fcc773644386b290ef9874828ae 100644 --- a/paddle/math/SparseMatrix.cpp +++ b/paddle/math/SparseMatrix.cpp @@ -177,7 +177,6 @@ GpuSparseMatrix::GpuSparseMatrix(real* value, hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); sMatrix_ = tmp2; } - LOG(INFO) << "weight to matrix "; } } diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h index 
9aa74b15193723970d80b5d1a4e0ac95341cd45a..47f461474622d13ea2f922a77348c78b450ec37f 100644 --- a/paddle/math/tests/test_matrixUtil.h +++ b/paddle/math/tests/test_matrixUtil.h @@ -30,6 +30,17 @@ void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) { } } +void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) { + ASSERT_EQ(a.getWidth(), b.getWidth()); + ASSERT_EQ(a.getHeight(), b.getHeight()); + ASSERT_EQ(a.isTransposed(), b.isTransposed()); + ASSERT_EQ(a.getFormat(), b.getFormat()); + ASSERT_EQ(a.getElementCnt(), b.getElementCnt()); + for (size_t r = 0; r < a.getElementCnt(); ++r) { + ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]); + } +} + void checkSMatrixEqual(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) { ASSERT_EQ(a->getWidth(), b->getWidth()); @@ -73,6 +84,36 @@ void checkSMatrixEqual2(const CpuSparseMatrixPtr& a, } } +void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) { + ASSERT_EQ(a.getWidth(), b.getWidth()); + ASSERT_EQ(a.getHeight(), b.getHeight()); + ASSERT_EQ(a.isTransposed(), b.isTransposed()); + + if (a.getFormat() == SPARSE_CSC) { + int* rows = a.getRows(); + for (size_t i = 0; i < a.getWidth(); i++) { + for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) { + if (a.getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i)); + } else { + ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i)); + } + } + } + } else { + int* cols = a.getCols(); + for (size_t i = 0; i < a.getHeight(); i++) { + for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) { + if (a.getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j])); + } else { + ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j])); + } + } + } + } +} + void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a, const CpuMatrixPtr& b) { ASSERT_EQ(a->getWidth(), b->getWidth());