diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
index 4064daf4159e2277942f58b071a44f7cbb33c14f..5d595deb12c6c8ea419dd1f31b3c131a2f6a587a 100644
--- a/paddle/function/BufferArg.cpp
+++ b/paddle/function/BufferArg.cpp
@@ -32,16 +32,14 @@ const SparseMatrixArg& BufferArg::sparse() const {
 SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      trans_(const_cast<CpuSparseMatrix&>(sparse).getTranspose()) {
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
   bufferType_ = TENSOR_SPARSE;
 }
 
 SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      trans_(const_cast<GpuSparseMatrix&>(sparse).getTranspose()) {
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
   bufferType_ = TENSOR_SPARSE;
 }
 
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 1f86f49911c484455931810dc1f3264e7d8a9b55..2da1115ec968ccf8064e130b2b4b805198147cb0 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -98,7 +98,8 @@ public:
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(2),
-        argType_(argType) {
+        argType_(argType),
+        trans_(matrix.isTransposed()) {
     bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, matrix.getHeight());
     shape_.setDim(1, matrix.getWidth());
@@ -111,7 +112,8 @@ public:
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(shape),
-        argType_(argType) {
+        argType_(argType),
+        trans_(matrix.isTransposed()) {
     bufferType_ = TENSOR_NORMAL;
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
@@ -143,7 +145,7 @@ public:
     // CHECK(deviceType_ == DType);
     CHECK_EQ((size_t)2, shape_.ndims());
     return typename Tensor<real, DType>::Matrix(
-        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
+        reinterpret_cast<real*>(buf_), shape_[0], shape_[1], trans_);
   }
 
   template <typename VType, DeviceType DType>
@@ -179,6 +181,7 @@ protected:
   TensorShape shape_;
   BufferType bufferType_{TENSOR_UNKNOWN};
   ArgType argType_{UNSPECIFIED};
+  bool trans_{false};
   // leading dimensions. The size is dims_.size()
   // Dims lds_;
 };
@@ -271,15 +274,13 @@ public:
                   size_t nnz,
                   SparseDataFormat format,
                   SparseDataType type,
-                  bool trans = false,
                   ArgType argType = UNSPECIFIED)
       : BufferArg(buf, valueType, shape, argType),
         row_(row),
         col_(col),
         nnz_(nnz),
         format_(format),
-        type_(type),
-        trans_(trans) {
+        type_(type) {
     bufferType_ = TENSOR_SPARSE;
     CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
     CHECK_EQ(shape_.ndims(), (size_t)2);
@@ -322,8 +323,6 @@ public:
 
   size_t nnz() const { return nnz_; }
 
-  bool isTranspose() const { return trans_; }
-
   SparseDataFormat dataFormat() const { return format_; }
 
   SparseDataType dataType() const { return type_; }
@@ -334,8 +333,6 @@ private:
   size_t nnz_;
   SparseDataFormat format_;
   SparseDataType type_;
-  /// todo(tianbing), move trans_ up to BufferArg
-  bool trans_;
 };
 
 }  // namespace paddle
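Note: the net effect of the `BufferArg` changes above is that the transpose flag is hoisted from `SparseMatrixArg` into the base `BufferArg`, so a dense `matrix<Device>()` view now reconstructs the transpose state of the original matrix. This is what lets the rewritten test below pass transposed matrices directly instead of materializing `getTranspose()` copies. A minimal sketch of the intended round-trip, assuming only the constructors visible in this patch (the variable name and dimensions are made up for illustration):

```cpp
void transRoundTrip() {
  CpuMatrix weight(32, 64, /* trans */ true);  // stored 32x64, logically 64x32
  BufferArg arg(weight);                       // captures trans_(matrix.isTransposed())
  auto view = arg.matrix<DEVICE_TYPE_CPU>();   // CpuMatrix(buf_, 32, 64, trans_)
  CHECK(view.isTransposed());                  // the flag survives the round-trip
}
```
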
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 7d341182523cbb4508bf13ddc0f9bbbf46752151..1c593bb083e009f430fa6d4802c88bdda559b9f7 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -483,8 +483,8 @@ template <DeviceType Device>
 class MulFunc : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
-    scaleAB_ = config.get<real>("scaleAB");
-    scaleT_ = config.get<real>("scaleT");
+    alpha_ = config.get<real>("scaleAB");
+    beta_ = config.get<real>("scaleT");
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -494,7 +494,7 @@ public:
     CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
     CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
     CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
 
     auto in1_mat = inputs[0].matrix<Device>();
     if (inputs[0].isSparseArg()) {
@@ -505,12 +505,12 @@ public:
       in2_mat = inputs[1].sparse().SparseMatrix<Device>();
     }
     auto out_mat = outputs[0].matrix<Device>();
-    MulOp<Device>(out_mat, in1_mat, in2_mat, scaleAB_, scaleT_);
+    MulOp<Device>(out_mat, in1_mat, in2_mat, alpha_, beta_);
   }
 
 private:
-  real scaleAB_;
-  real scaleT_;
+  real alpha_;
+  real beta_;
 };
 
 REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
diff --git a/paddle/function/MulOp.h b/paddle/function/MulOp.h
index f3699f8c78cda71154a65b34a00dc1986bd4c221..b7b1f56af10375b95b7e1350c406fa68d422c1e1 100644
--- a/paddle/function/MulOp.h
+++ b/paddle/function/MulOp.h
@@ -68,4 +68,11 @@ void MulOp(GpuMatrix& out,
            real scaleAB,
            real scaleT);
 
+template <DeviceType DType>
+void MulOp(GpuSparseMatrix& out,
+           const GpuMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT);
+
 }  // namespace paddle
diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu
index 73d788a4743326ab06392aa582b85c0c0ce75b2b..3691c7f3206126e5d40fec915edc5bad80487b14 100644
--- a/paddle/function/MulOpGpu.cu
+++ b/paddle/function/MulOpGpu.cu
@@ -170,4 +170,13 @@ void MulOp(GpuMatrix& out,
   }
 }
 
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuMatrix& b,
+                            real scale_ab,
+                            real scale_t) {
+  /// todo(tianbing), implement it
+}
+
 }  // namespace paddle
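Note: besides the `alpha_`/`beta_` renames, `MulFunc::calc()` now requires an `ADD_TO` output instead of `ASSIGN_TO`, so the function contract is `C = scaleAB * (A * B) + scaleT * C` rather than plain assignment (the `GpuSparseMatrix` output overload is declared here but stubbed out in MulOpGpu.cu). A naive scalar reference of that contract for row-major, dense, non-transposed inputs, as a hypothetical helper for illustration only, not part of the patch:

```cpp
// Reference semantics for the dense, non-transposed case: what the
// MulOp kernels are expected to compute under ADD_TO.
void mulOpReference(float* C, const float* A, const float* B,
                    int M, int N, int K, float scaleAB, float scaleT) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float dot = 0;
      for (int k = 0; k < K; ++k) {
        dot += A[i * K + k] * B[k * N + j];  // (A * B)(i, j)
      }
      // ADD_TO: the previous value of C contributes via scaleT,
      // instead of being overwritten as under ASSIGN_TO.
      C[i * N + j] = scaleAB * dot + scaleT * C[i * N + j];
    }
  }
}
```
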
*/ #include "FunctionTest.h" #include "paddle/math/Matrix.h" #include "paddle/math/SparseMatrix.h" +#include "paddle/math/tests/test_matrixUtil.h" #include "paddle/testing/TestUtil.h" using namespace paddle; // NOLINT -void testSpMatrixMul(int M, int N, int K, real rate, real scale1, real scale2) { - /// todo(tianbing) check CPU/GPU +/** + * C = alpha * C + beta * (A * B) + */ +void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { + real alpha = 1.5; + real beta = 2.0; + + const auto cpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-CPU"); + cpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU"); - gpuFunc->init(FuncConfig().set("scaleAB", scale1).set("scaleT", scale2)); + gpuFunc->init(FuncConfig().set("scaleAB", alpha).set("scaleT", beta)); - int nnz = M * N * rate; - MatrixPtr cpuA = std::make_shared(M, K); - MatrixPtr cpuB = std::make_shared(N, K); - MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz)); + int heightA = (transa == false) ? dimM : dimK; + int widthA = (transa == false) ? dimK : dimM; + int heightB = (transb == false) ? dimK : dimN; + int widthB = (transb == false) ? dimN : dimK; + int heightC = dimM; + int widthC = dimN; - MatrixPtr gpuA = std::make_shared(M, K); - MatrixPtr gpuB = std::make_shared(N, K); - MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz)); + auto cpuA = std::make_shared(heightA, widthA, transa); + auto cpuB = std::make_shared(heightB, widthB, transb); + auto cpuC = std::make_shared(heightC, widthC); + auto gpuA = std::make_shared(heightA, widthA, transa); + auto gpuB = std::make_shared(heightB, widthB, transb); + auto gpuC = std::make_shared(heightC, widthC); cpuA->randomizeUniform(); cpuB->randomizeUniform(); cpuC->randomizeUniform(); + gpuA->copyFrom(*cpuA); + gpuB->copyFrom(*cpuB); + gpuC->copyFrom(*cpuC); - hl_stream_t stream(HPPL_STREAM_3); - gpuA->copyFrom(*cpuA, stream); - gpuB->copyFrom(*cpuB, stream); - gpuC->copyFrom(*cpuC, stream); - hl_stream_synchronize(stream); + BufferArgs cpuInputs; + BufferArgs cpuOutputs; + cpuInputs.addArg(*cpuA); + cpuInputs.addArg(*cpuB); + cpuOutputs.addArg(*cpuC, ADD_TO); + cpuFunc->calc(cpuInputs, cpuOutputs); - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*gpuA->getTranspose()); - inputs.addArg(*gpuB->getTranspose()); - outputs.addArg(*gpuC, ASSIGN_TO); + BufferArgs gpuInputs; + BufferArgs gpuOutputs; + gpuInputs.addArg(*gpuA); + gpuInputs.addArg(*gpuB); + gpuOutputs.addArg(*gpuC, ADD_TO); + gpuFunc->calc(gpuInputs, gpuOutputs); - gpuFunc->calc(inputs, outputs); + autotest::TensorCheckErr(*cpuC, *gpuC); } -TEST(SMatrix, sMatrixMul) { - for (auto M : {1, 40, 128, 200}) { - for (auto N : {100}) { - for (auto K : {100}) { - /// todo(tianbing), add scaleAB and scaleT - VLOG(3) << " M=" << M << " N=" << N << " K=" << K; - testSpMatrixMul(M, N, K, 0.05, 1, 1); +TEST(Matrix, mul) { + for (auto transa : {false, true}) { + for (auto transb : {false, true}) { + for (auto dimM : {1, 10, 100}) { + for (auto dimN : {1, 10}) { + for (auto dimK : {8}) { + if (true == transa && true == transb) { + continue; + } + VLOG(3) << setiosflags(std::ios::left) << std::setfill(' ') + << " transa=" << transa << " transb=" << transb + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK; + + testMatrixMul(transa, transb, dimM, dimN, dimK); + } + } } } }