Commit 1f0cbcf3 authored by xutianbing

add GpuMatrix::mul, CpuMatrix::mul operators

Parent 936301f1
......@@ -167,7 +167,7 @@ public:
ValueType valueType() const { return valueType_; }
BufferType bufferType() const { return bufferType_; }
const TensorShape& shape() const { return shape_; }
bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
const SequenceArg& sequence() const;
......
......@@ -13,16 +13,471 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "MulOp.h"
#include "paddle/math/MathFunctions.h"
#include "paddle/math/SIMDFunctions.h"
#include "paddle/utils/ThreadLocal.h"
#ifndef PADDLE_TYPE_DOUBLE
#define GEMM paddle::gemm<float>
#else
#define GEMM paddle::gemm<double>
#endif
namespace {
/// a[i] += b[i], for i in [0, len)
inline void vecAddTo(real* a, const real* b, size_t len) {
for (unsigned int i = 0; i < len; ++i) {
a[i] += b[i];
}
}
/// a[i] += scaleB * b[i], for i in [0, len)
inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
for (unsigned int i = 0; i < len; ++i) {
a[i] += scaleB * b[i];
}
}
/// column of a (stride aWidth) += column of b (stride bWidth), len rows
inline void colVecAddTo(
real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) {
for (unsigned int i = 0; i < len; ++i) {
a[i * aWidth] += b[i * bWidth];
}
}
/// column of a (stride aWidth) += c * column of b (stride bWidth), len rows
inline void colVecAddTo(
real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
for (unsigned int i = 0; i < len; ++i) {
a[i * aWidth] += b[i * bWidth] * c;
}
}
} // namespace
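// Usage sketch for the helpers above (illustration only, not part of this
// commit): colVecAddTo accumulates one strided column of b into one strided
// column of a. With row-major a (3 x 2) and b (3 x 4):
//   real a[6] = {0, 0, 0, 0, 0, 0};
//   real b[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
//   colVecAddTo(a, b + 1, 3 /*len*/, 2 /*aWidth*/, 4 /*bWidth*/);
// leaves a == {2, 0, 6, 0, 10, 0}, i.e. column 1 of b added into column 0 of a.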
namespace paddle {
template <>
void MulOp<DEVICE_TYPE_CPU>(CpuSparseMatrix& out,
const CpuMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT) {
/// todo(tianbing), clean the code
CHECK(!out.isTransposed()) << "Not supported";
CHECK_EQ(out.getValueType(), FLOAT_VALUE);
const real* A = a.getData();
const real* B = b.getData();
real* C = out.getValue();
int* rows = out.getRows();
int* cols = out.getCols();
size_t height = out.getHeight();
size_t width = out.getWidth();
if (scaleT == 0) {
out.zeroMem();
}
if (!a.isTransposed() && !b.isTransposed()) {
size_t m = a.getWidth();
CHECK_EQ(b.getHeight(), m);
CHECK_EQ(a.getHeight(), height);
CHECK_EQ(b.getWidth(), width);
if (out.getFormat() == SPARSE_CSC) {
for (size_t i = 0; i < width; i++) {
size_t start = out.getColStartIdx(i);
size_t end = out.getColStartIdx(i + 1);
for (size_t j = start; j < end; j++) {
real sum = 0;
size_t rowIdx = rows[j];
for (size_t k = 0; k < m; k++) {
sum += A[rowIdx * m + k] * B[k * width + i];
}
C[j] = scaleAB * sum + scaleT * C[j];
}
}
} else {
for (size_t i = 0; i < height; i++) {
size_t start = out.getRowStartIdx(i);
size_t end = out.getRowStartIdx(i + 1);
for (size_t j = start; j < end; j++) {
real sum = 0;
size_t colIdx = cols[j];
for (size_t k = 0; k < m; k++) {
sum += A[i * m + k] * B[k * width + colIdx];
}
C[j] = scaleAB * sum + scaleT * C[j];
}
}
}
} else if (a.isTransposed() && !b.isTransposed()) {
size_t m = a.getHeight();
CHECK_EQ(m, b.getHeight());
CHECK_EQ(b.getWidth(), width);
CHECK_EQ(a.getWidth(), height);
if (out.getFormat() == SPARSE_CSC) {
for (size_t i = 0; i < width; i++) {
size_t start = out.getColStartIdx(i);
size_t end = out.getColStartIdx(i + 1);
for (size_t j = start; j < end; j++) {
real sum = 0;
size_t rowIdx = rows[j];
for (size_t k = 0; k < m; k++) {
sum += A[k * height + rowIdx] * B[k * width + i];
}
C[j] = scaleAB * sum + scaleT * C[j];
}
}
} else {
for (size_t i = 0; i < height; i++) {
int start = out.getRowStartIdx(i);
int end = out.getRowStartIdx(i + 1);
for (int j = start; j < end; j++) {
real sum = 0;
size_t colIdx = cols[j];
for (size_t k = 0; k < m; k++) {
sum += A[k * height + i] * B[k * width + colIdx];
}
C[j] = scaleAB * sum + scaleT * C[j];
}
}
}
} else if (!a.isTransposed() && b.isTransposed()) {
size_t m = a.getWidth();
CHECK_EQ(b.getWidth(), m);
CHECK_EQ(a.getHeight(), height);
CHECK_EQ(b.getHeight(), width);
if (out.getFormat() == SPARSE_CSR) {
for (size_t i = 0; i < height; i++) {
size_t start = out.getRowStartIdx(i);
size_t end = out.getRowStartIdx(i + 1);
for (size_t j = start; j < end; j++) {
real sum = 0;
size_t colIdx = cols[j];
for (size_t k = 0; k < m; k++) {
sum += A[i * m + k] * B[colIdx * m + k];
}
C[j] = scaleAB * sum + scaleT * C[j];
}
}
} else {
LOG(FATAL) << "Not supported csc format "
"when a is not trans and b is trans";
}
} else {
LOG(FATAL) << "Not supported";
}
}
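// Note on the kernel above (explanatory sketch, not in the original source):
// the product is evaluated only at the stored nonzeros of the sparse output.
// For each stored entry at row r, column c, with value index j:
//   C[j] = scaleAB * sum_k op(A)[r][k] * op(B)[k][c] + scaleT * C[j]
// where op() applies the transpose handling of the branch taken; the CSC and
// CSR cases differ only in how the stored (r, c) pairs are enumerated.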
template <>
void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
const CpuMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT) {
/// todo(tianbing), clean the code
CHECK(!out.isTransposed()) << "Not supported";
CBLAS_TRANSPOSE aTrans = CblasNoTrans;
size_t aRow = a.getHeight();
size_t aCol = a.getWidth();
CBLAS_TRANSPOSE bTrans = CblasNoTrans;
size_t bRow = b.getHeight();
size_t bCol = b.getWidth();
if (a.isTransposed()) {
aTrans = CblasTrans;
aRow = a.getWidth();
aCol = a.getHeight();
}
if (b.isTransposed()) {
bTrans = CblasTrans;
bRow = b.getWidth();
bCol = b.getHeight();
}
/// dense case: out = scaleT * out + scaleAB * (A * B)
CHECK_EQ(aCol, bRow);
CHECK_EQ(aRow, out.getHeight());
CHECK_EQ(bCol, out.getWidth());
const real* A = a.getData();
const real* B = b.getData();
real* C = out.getData();
int M = out.getHeight();
int N = out.getWidth();
int K = aCol;
int lda = a.getStride();
int ldb = b.getStride();
int ldc = out.getStride();
GEMM(aTrans, bTrans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
VLOG(2) << " A[0]=" << A[0] << " A[1]=" << A[1] << " B[0]=" << B[0]
<< " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1];
}
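// Minimal call sketch for the dense specialization above (assumption, not
// part of this commit), computing out = 0.5 * out + 1.0 * (a * b):
//   CpuMatrix a(2, 3), b(3, 2), out(2, 2);
//   a.randomizeUniform();
//   b.randomizeUniform();
//   out.randomizeUniform();
//   MulOp<DEVICE_TYPE_CPU>(out, a, b, 1.0 /*scaleAB*/, 0.5 /*scaleT*/);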
static ThreadLocal<std::vector<const real*>> threadLocalColArray;
template <>
void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
const CpuSparseMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT) {
/// todo(tianbing), clean the code
CHECK(!out.isTransposed()) << "Not supported";
CHECK(!b.isTransposed()) << "Not supported";
CHECK(scaleT == 0 || scaleT == 1) << "Not supported";
CHECK_EQ(scaleAB, static_cast<real>(1.0)) << "Not supported";
CHECK_EQ(a.getFormat(), SPARSE_CSR) << "Not supported";
const real* B = b.getData();
real* C = out.getData();
size_t height = out.getHeight();
size_t width = out.getWidth();
int* cols = a.getCols();
real* values = a.getValue();
if (scaleT == 0) {
out.zeroMem();
}
if (!a.isTransposed()) {
size_t m = a.getWidth();
CHECK_EQ(b.getHeight(), m);
CHECK_EQ(a.getHeight(), height);
CHECK_EQ(b.getWidth(), width);
if (a.getValueType() == NO_VALUE) {
if (width % 32 == 0) { // use libaddto
CHECK_EQ((size_t)B % 32, 0UL);
CHECK_EQ((size_t)C % 32, 0UL);
auto& colArray = *threadLocalColArray;
for (size_t i = 0; i < a.getHeight(); ++i) {
const int start = a.getRowStartIdx(i);
const int end = a.getRowStartIdx(i + 1);
size_t colNum = end - start;
colArray.resize(colNum);
for (int j = 0; j < end - start; ++j) {
colArray[j] = const_cast<CpuMatrix&>(b).getRow(cols[j + start]);
}
simd::batchAddTo(out.getRow(i), &colArray[0], colNum, width);
}
} else {
for (size_t i = 0; i < a.getHeight(); ++i) {
const int start = a.getRowStartIdx(i);
const int end = a.getRowStartIdx(i + 1);
for (int j = start; j < end; ++j) {
vecAddTo(out.getRow(i),
const_cast<CpuMatrix&>(b).getRow(cols[j]),
width);
}
}
}
} else if (a.getValueType() == FLOAT_VALUE) {
for (size_t i = 0; i < a.getHeight(); ++i) {
const int start = a.getRowStartIdx(i);
const int end = a.getRowStartIdx(i + 1);
for (int j = start; j < end; ++j) {
vecAddTo(out.getRow(i),
const_cast<CpuMatrix&>(b).getRow(cols[j]),
values[j],
width);
}
}
}
} else /*if (a.isTransposed())*/ {
size_t m = a.getHeight();
CHECK_EQ(b.getHeight(), m);
CHECK_EQ(a.getWidth(), height);
CHECK_EQ(b.getWidth(), width);
if (a.getValueType() == NO_VALUE) {
if (width % 32 == 0) { // use libaddto
CHECK_EQ((size_t)B % 32, 0UL);
CHECK_EQ((size_t)C % 32, 0UL);
for (size_t i = 0; i < a.getHeight(); ++i) {
const int start = a.getRowStartIdx(i);
const int end = a.getRowStartIdx(i + 1);
for (int j = start; j < end; ++j) {
simd::addTo(out.getRow(cols[j]),
const_cast<CpuMatrix&>(b).getRow(i),
width);
}
}
} else {
for (size_t i = 0; i < a.getHeight(); ++i) {
const int start = a.getRowStartIdx(i);
const int end = a.getRowStartIdx(i + 1);
for (int j = start; j < end; ++j) {
vecAddTo(out.getRow(cols[j]),
const_cast<CpuMatrix&>(b).getRow(i),
width);
}
}
}
} else if (a.getValueType() == FLOAT_VALUE) {
for (size_t i = 0; i < a.getHeight(); ++i) {
const int start = a.getRowStartIdx(i);
const int end = a.getRowStartIdx(i + 1);
for (int j = start; j < end; ++j) {
vecAddTo(out.getRow(cols[j]),
const_cast<CpuMatrix&>(b).getRow(i),
values[j],
width);
}
}
}
}
}
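// Note on the kernel above (explanatory sketch, not in the original source):
// with CSR a and dense b, each stored nonzero a(i, cols[j]) scatters a scaled
// row of b into a row of out. For the non-transposed case, in row form:
//   out.row(i) += values[j] * b.row(cols[j])   for each j in row i of a
// (with values[j] == 1 when a has NO_VALUE); in the transposed case the roles
// swap: out.row(cols[j]) += values[j] * b.row(i).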
template <>
void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
const CpuMatrix& a,
const CpuSparseMatrix& b,
real scaleAB,
real scaleT) {
/// todo(tianbing), clean the code
CHECK(!out.trans_) << "Not supported";
CHECK(!a.isTransposed()) << "Not supported";
CHECK(scaleT == 0 || scaleT == 1);
CHECK_EQ(scaleAB, static_cast<real>(1.0));
real* A = const_cast<real*>(a.getData());
real* B = const_cast<real*>(b.getValue());
real* C = out.getData();
int* rows = b.getRows();
int* cols = b.getCols();
if (scaleT == 0) {
out.zeroMem();
}
/// todo(tianbing), clean the code
if (b.getFormat() == SPARSE_CSC) {
if (!b.isTransposed()) {
size_t m = a.getWidth();
CHECK_EQ(b.getHeight(), m);
CHECK_EQ(a.getHeight(), out.height_);
CHECK_EQ(b.getWidth(), out.width_);
if (b.getValueType() == NO_VALUE) {
for (size_t j = 0; j < b.getWidth(); ++j) {
int start = b.getColStartIdx(j);
int end = b.getColStartIdx(j + 1);
for (int i = start; i < end; ++i) {
colVecAddTo(
C + j, A + rows[i], out.height_, out.width_, a.getWidth());
}
}
} else if (b.getValueType() == FLOAT_VALUE) {
for (size_t j = 0; j < b.getWidth(); ++j) {
int start = b.getColStartIdx(j);
int end = b.getColStartIdx(j + 1);
for (int i = start; i < end; ++i) {
colVecAddTo(C + j,
A + rows[i],
B[i],
out.height_,
out.width_,
a.getWidth());
}
}
}
} else /*if (b.isTransposed())*/ {
size_t m = a.getWidth();
CHECK_EQ(b.getHeight(), out.width_);
CHECK_EQ(a.getHeight(), out.height_);
CHECK_EQ(b.getWidth(), m);
if (b.getValueType() == NO_VALUE) {
for (size_t i = 0; i < b.getWidth(); ++i) {
int start = b.getColStartIdx(i);
int end = b.getColStartIdx(i + 1);
for (int j = start; j < end; ++j) {
colVecAddTo(
C + rows[j], A + i, out.height_, out.width_, a.getWidth());
}
}
} else if (b.getValueType() == FLOAT_VALUE) {
for (size_t i = 0; i < b.getWidth(); ++i) {
int start = b.getColStartIdx(i);
int end = b.getColStartIdx(i + 1);
for (int j = start; j < end; ++j) {
colVecAddTo(C + rows[j],
A + i,
B[j],
out.height_,
out.width_,
a.getWidth());
}
}
}
}
} else {
if (!b.isTransposed()) {
size_t m = a.getWidth();
CHECK_EQ(b.getHeight(), m);
CHECK_EQ(a.getHeight(), out.height_);
CHECK_EQ(b.getWidth(), out.width_);
if (b.getValueType() == NO_VALUE) {
for (size_t j = 0; j < b.getHeight(); ++j) {
int start = b.getRowStartIdx(j);
int end = b.getRowStartIdx(j + 1);
for (int i = start; i < end; ++i) {
colVecAddTo(
C + cols[i], A + j, out.height_, out.width_, a.getWidth());
}
}
} else if (b.getValueType() == FLOAT_VALUE) {
for (size_t j = 0; j < b.getHeight(); ++j) {
int start = b.getRowStartIdx(j);
int end = b.getRowStartIdx(j + 1);
for (int i = start; i < end; ++i) {
colVecAddTo(C + cols[i],
A + j,
B[i],
out.height_,
out.width_,
a.getWidth());
}
}
}
} else /*if (b.isTransposed())*/ {
size_t m = a.getWidth();
CHECK_EQ(b.getHeight(), out.width_);
CHECK_EQ(a.getHeight(), out.height_);
CHECK_EQ(b.getWidth(), m);
if (b.getValueType() == NO_VALUE) {
for (size_t i = 0; i < b.getHeight(); ++i) {
int start = b.getRowStartIdx(i);
int end = b.getRowStartIdx(i + 1);
for (int j = start; j < end; ++j) {
colVecAddTo(
C + i, A + cols[j], out.height_, out.width_, a.getWidth());
}
}
} else if (b.getValueType() == FLOAT_VALUE) {
for (size_t i = 0; i < b.getHeight(); ++i) {
int start = b.getRowStartIdx(i);
int end = b.getRowStartIdx(i + 1);
for (int j = start; j < end; ++j) {
colVecAddTo(C + i,
A + cols[j],
B[j],
out.height_,
out.width_,
a.getWidth());
}
}
}
}
}
}
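// Note on the kernel above (explanatory sketch, not in the original source):
// with dense a and sparse b, each stored nonzero of b gathers a scaled column
// of a into a column of out. For the non-transposed CSC case, in column form:
//   out.col(j) += B[i] * a.col(rows[i])   for each i in column j of b
// (with B[i] == 1 when b has NO_VALUE); the transposed and CSR branches only
// permute which index selects the source and destination columns.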
/**
* mul operator
* out = scaleT * out + scaleAB * (in1 * in2)
*
* \param outputs[0] output matrix, M * N
* \param inputs[0] first input (sparse) matrix, M * K (if non-trans)
* \param inputs[1] second input matrix, K * N (if non-trans)
*/
template <DeviceType Device>
class MulFunc : public FunctionBase {
......@@ -33,19 +488,23 @@ public:
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
/// todo(tianbing), add more checks
CHECK_EQ((size_t)2, inputs.size());
CHECK_EQ((size_t)1, outputs.size());
CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
auto in1_mat = inputs[0].matrix<Device>();
if (inputs[0].isSparseArg()) {
in1_mat = inputs[0].sparse().SparseMatrix<Device>();
}
auto in2_mat = inputs[1].matrix<Device>();
if (inputs[1].isSparseArg()) {
in2_mat = inputs[1].sparse().SparseMatrix<Device>();
}
auto out_mat = outputs[0].matrix<Device>();
MulOp<Device>(out_mat, in1_mat, in2_mat, scaleAB_, scaleT_);
}
......@@ -54,6 +513,7 @@ private:
real scaleT_;
};
REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
#endif
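// Invocation sketch mirroring the test at the end of this diff (assumption,
// not part of this commit; "MulOp-CPU" is the type name implied by the
// registration above):
//   auto func = FunctionBase::funcRegistrar_.createByType("MulOp-CPU");
//   func->init(FuncConfig().set("scaleAB", (real)1.0).set("scaleT", (real)1.0));
//   CpuMatrix a(16, 32), b(32, 8), out(16, 8);
//   a.randomizeUniform();
//   b.randomizeUniform();
//   out.zeroMem();
//   BufferArgs inputs, outputs;
//   inputs.addArg(a);
//   inputs.addArg(b);
//   outputs.addArg(out, ASSIGN_TO);
//   func->calc(inputs, outputs);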
......
......@@ -19,6 +19,40 @@ limitations under the License. */
#include "paddle/math/SparseMatrix.h"
namespace paddle {
template <DeviceType DType>
void MulOp(CpuMatrix& out,
const CpuMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT);
template <DeviceType DType>
void MulOp(CpuMatrix& out,
const CpuSparseMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT);
template <DeviceType DType>
void MulOp(CpuMatrix& out,
const CpuMatrix& a,
const CpuSparseMatrix& b,
real scaleAB,
real scaleT);
template <DeviceType DType>
void MulOp(CpuSparseMatrix& out,
const CpuMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
......@@ -27,4 +61,11 @@ void MulOp(GpuMatrix& out,
real scaleAB,
real scaleT);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
const GpuMatrix& a,
const GpuSparseMatrix& b,
real scaleAB,
real scaleT);
} // namespace paddle
......@@ -20,6 +20,65 @@ limitations under the License. */
namespace paddle {
/**
* out = scale_t * out + scale_ab * (a * b)
* out : output matrix, M * N
*/
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scale_ab,
real scale_t) {
CHECK(!out.isTransposed()) << "Not supported";
if (!a.isTransposed() && !b.isTransposed()) {
/// a : M * K, b: K * N
CHECK_EQ(out.width_, b.width_);
CHECK_EQ(out.height_, a.height_);
CHECK_EQ(a.width_, b.height_);
} else if (a.isTransposed() && !b.isTransposed()) {
/// a : K * M, b : K * N
CHECK_EQ(out.width_, b.width_);
CHECK_EQ(out.height_, a.width_);
CHECK_EQ(a.height_, b.height_);
} else if (!a.isTransposed() && b.isTransposed()) {
/// a: M * K, b : N * K
CHECK_EQ(out.width_, b.height_);
CHECK_EQ(out.height_, a.height_);
CHECK_EQ(a.width_, b.width_);
} else {
LOG(FATAL) << "Is not supported";
}
real* a_data = a.data_;
real* b_data = b.data_;
real* out_data = out.data_;
int dim_m = out.getHeight();
int dim_n = out.getWidth();
int dim_k = !a.isTransposed() ? a.width_ : a.height_;
int lda = a.getStride();
int ldb = b.getStride();
int ldc = out.getStride();
hl_trans_op_t trans_a = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
hl_trans_op_t trans_b = !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
hl_matrix_mul(a_data,
trans_a,
b_data,
trans_b,
out_data,
dim_m,
dim_n,
dim_k,
scale_ab,
scale_t,
lda,
ldb,
ldc);
}
/**
* out = scale_t * out + scale_ab * (a * b)
* out : M * N
*/
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
......@@ -32,12 +91,15 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
CHECK(b.useGpu_ == true) << "Matrix types are not equal";
CHECK(!out.trans_ && !b.trans_) << "not supported";
if (!a.trans_) {
/// a: M * K, b: K * N
CHECK(out.width_ == b.width_ && out.height_ == a.height_
&& a.width_ == b.height_) << "Matrix dimensions are not equal";
} else {
/// a: K * M, transpose, b: K * N
CHECK(out.width_ == b.width_ && out.height_ == a.width_
&& a.height_ == b.height_) << "Matrix dimensions are not equal";
}
hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
hl_sparse_matrix_s a_data = a.sMatrix_.get();
real* b_data = b.data_;
......@@ -54,4 +116,58 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
scale_t);
}
/**
* out = scale_t * out + scale_ab * (a * b)
* out : M * N
*/
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
const GpuMatrix& a,
const GpuSparseMatrix& b,
real scale_ab,
real scale_t) {
CHECK(out.isContiguous());
CHECK(a.isContiguous());
CHECK(a.useGpu_ == true) << "Matrix types are not equal";
hl_sparse_matrix_s b_data = b.sMatrix_.get();
real* a_data = a.data_;
real* out_data = out.data_;
hl_trans_op_t trans_b = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
if (!b.trans_) {
/// a : M * K, b : K * N
CHECK(out.width_ == b.width_ &&
out.height_ == a.height_ && a.width_ == b.height_)
<< "Matrix dimensions are not equal";
} else {
/// a : M * K, b : N * K, transpose
CHECK(out.width_ == b.height_ &&
out.height_ == a.height_ && a.width_ == b.width_)
<< "Matrix dimensions are not equal";
}
if (b.format_ == SPARSE_CSC) {
hl_matrix_dense_mul_csc(a_data,
HPPL_OP_N,
b_data,
trans_b,
out_data,
out.height_,
out.width_,
a.width_,
scale_ab,
scale_t);
} else {
hl_matrix_dense_mul_csr(a_data,
HPPL_OP_N,
b_data,
trans_b,
out_data,
out.height_,
out.width_,
a.width_,
scale_ab,
scale_t);
}
}
} // namespace paddle
......@@ -22,31 +22,41 @@ using namespace paddle; // NOLINT
void testSpMatrixMul(int M, int N, int K, real rate, real scale1, real scale2) {
/// todo(tianbing) check CPU/GPU
const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU");
gpuFunc->init(FuncConfig().set("scaleAB", scale1).set("scaleT", scale2));
int nnz = M * N * rate;
MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
cpuA->randomizeUniform();
cpuB->randomizeUniform();
cpuC->randomizeUniform();
hl_stream_t stream(HPPL_STREAM_3);
gpuA->copyFrom(*cpuA, stream);
gpuB->copyFrom(*cpuB, stream);
gpuC->copyFrom(*cpuC, stream);
hl_stream_synchronize(stream);
BufferArgs inputs;
BufferArgs outputs;
inputs.addArg(*gpuA->getTranspose());
inputs.addArg(*gpuB->getTranspose());
outputs.addArg(*gpuC, ASSIGN_TO);
gpuFunc->calc(inputs, outputs);
}
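// Sketch of the CPU/GPU check left as a todo above (assumptions, not part of
// this commit: a CpuSparseMatrix::copyFrom(GpuSparseMatrix&, hl_stream_t)
// overload, and `expected` holding a CPU-side reference result):
//   CpuSparseMatrix result(M, N, nnz);
//   result.copyFrom(*dynamic_cast<GpuSparseMatrix*>(gpuC.get()), stream);
//   hl_stream_synchronize(stream);
//   for (int i = 0; i < nnz; ++i) {
//     EXPECT_NEAR(result.getValue()[i], expected.getValue()[i], 1e-3);
//   }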
TEST(SMatrix, sMatrixMul) {
for (auto M : {1, 40, 128, 200}) {
for (auto N : {100}) {
for (auto K : {100}) {
/// todo(tianbing), add scaleAB and scaleT
VLOG(3) << " M=" << M << " N=" << N << " K=" << K;
testSpMatrixMul(M, N, K, 0.05, 1, 1);
......