Commit cd506756 authored by 李寅

Gemm transpose

Parent c0c0dfe5
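
This commit threads optional transpose_a / transpose_b flags through the CPU Gemm/GemmRef kernels, the MatMul functors and op, and generalizes MatMul inputs from fixed 4-D {batch, H, W, 1} tensors to rank >= 2 tensors whose leading dimensions are folded into the batch. The flags follow the usual BLAS convention: per batch, C = op(A) * op(B), where op(X) is X or its transpose. A minimal naive sketch of these semantics (illustrative only, not the project's actual GemmRef body; height, K, and width describe op(A) and op(B)):

    // Sketch of transpose-aware batched GEMM semantics (illustrative only).
    // op(A) is height x K, op(B) is K x width, so C is height x width per batch.
    void NaiveGemm(const float *A, const float *B, int batch,
                   int height, int K, int width, float *C,
                   bool transpose_a = false, bool transpose_b = false) {
      for (int b = 0; b < batch; ++b) {
        const float *a = A + b * height * K;   // stored A block
        const float *bm = B + b * K * width;   // stored B block
        float *c = C + b * height * width;
        for (int i = 0; i < height; ++i) {
          for (int j = 0; j < width; ++j) {
            float sum = 0.f;
            for (int k = 0; k < K; ++k) {
              // A set flag means the stored matrix has its two inner
              // dimensions swapped, so the index order flips.
              const float av = transpose_a ? a[k * height + i] : a[i * K + k];
              const float bv = transpose_b ? bm[j * K + k] : bm[k * width + j];
              sum += av * bv;
            }
            c[i * width + j] = sum;
          }
        }
      }
    }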
This diff has been collapsed.
......@@ -30,7 +30,9 @@ void Gemm(const float *A,
const index_t height,
const index_t K,
const index_t width,
float *C);
float *C,
const bool transpose_a = false,
const bool transpose_b = false);
void GemmRef(const float *A,
const float *B,
......@@ -38,7 +40,9 @@ void GemmRef(const float *A,
const index_t height,
const index_t K,
const index_t width,
float *C);
float *C,
const bool transpose_a = false,
const bool transpose_b = false);
void Gemv(const float *m_ptr,
const float *v_ptr,
......
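
Because both flags default to false, existing call sites compile unchanged. A hedged usage sketch (buffer names are hypothetical):

    // C = A * B, exactly as before this change (defaults apply)
    kernels::Gemm(a_ptr, b_ptr, batch, height, K, width, c_ptr);
    // C = A^T * B, using the new parameters
    kernels::Gemm(a_ptr, b_ptr, batch, height, K, width, c_ptr, true, false);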
......@@ -13,17 +13,22 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include <memory>
#include <random>
#include "mace/kernels/gemm.h"
#include "mace/core/types.h"
#include "mace/kernels/gemm.h"
namespace mace {
namespace {
void GemmTest(index_t batch, index_t N, index_t K, index_t M) {
void GemmTest(index_t batch,
index_t N,
index_t K,
index_t M,
bool transpose_a,
bool transpose_b) {
std::unique_ptr<float[]> A(new float[batch * N * K]);
std::unique_ptr<float[]> B(new float[batch * K * M]);
std::unique_ptr<float[]> C(new float[batch * N * M]);
......@@ -34,15 +39,13 @@ void GemmTest(index_t batch, index_t N, index_t K, index_t M) {
std::normal_distribution<float> nd(0, 1);
std::generate(A.get(), A.get() + batch * N * K,
[&gen, &nd] {
return nd(gen);
});
[&gen, &nd] { return nd(gen); });
std::generate(B.get(), B.get() + batch * K * M,
[&gen, &nd] {
return nd(gen);
});
kernels::Gemm(A.get(), B.get(), batch, N, K, M, C.get());
kernels::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get());
[&gen, &nd] { return nd(gen); });
kernels::Gemm(A.get(), B.get(), batch, N, K, M, C.get(), transpose_a,
transpose_b);
kernels::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get(), transpose_a,
transpose_b);
for (int i = 0; i < batch * N * M; ++i) {
EXPECT_NEAR(C_ref[i], C[i], 0.1);
......@@ -59,14 +62,8 @@ void GemvTest(index_t batch, index_t N, index_t M) {
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
std::generate(A.get(), A.get() + N * M,
[&gen, &nd] {
return nd(gen);
});
std::generate(B.get(), B.get() + batch * M,
[&gen, &nd] {
return nd(gen);
});
std::generate(A.get(), A.get() + N * M, [&gen, &nd] { return nd(gen); });
std::generate(B.get(), B.get() + batch * M, [&gen, &nd] { return nd(gen); });
kernels::Gemv(A.get(), B.get(), batch, M, N, C.get());
kernels::GemvRef(A.get(), B.get(), batch, M, N, C_ref.get());
......@@ -78,36 +75,36 @@ void GemvTest(index_t batch, index_t N, index_t M) {
} // namespace
TEST(GEMMTest, AlignedWithoutBatch) {
GemmTest(1, 1, 64, 128);
GemmTest(1, 2, 64, 128);
GemmTest(1, 3, 64, 128);
GemmTest(1, 4, 64, 128);
GemmTest(1, 5, 64, 128);
GemmTest(1, 6, 64, 128);
GemmTest(1, 7, 64, 128);
GemmTest(1, 17, 64, 128);
GemmTest(1, 1, 64, 128, false, false);
GemmTest(1, 2, 64, 128, false, true);
GemmTest(1, 3, 64, 128, true, false);
GemmTest(1, 4, 64, 128, true, true);
GemmTest(1, 5, 64, 128, false, false);
GemmTest(1, 6, 64, 128, false, true);
GemmTest(1, 7, 64, 128, true, false);
GemmTest(1, 17, 64, 128, true, true);
}
TEST(GEMMTest, UnalignedWithoutBatch) {
GemmTest(1, 1, 63, 127);
GemmTest(1, 2, 63, 127);
GemmTest(1, 3, 63, 127);
GemmTest(1, 4, 63, 127);
GemmTest(1, 5, 63, 127);
GemmTest(1, 6, 63, 127);
GemmTest(1, 7, 63, 127);
GemmTest(1, 17, 63, 127);
GemmTest(1, 1, 63, 127, false, false);
GemmTest(1, 2, 63, 127, false, true);
GemmTest(1, 3, 63, 127, true, false);
GemmTest(1, 4, 63, 127, true, true);
GemmTest(1, 5, 63, 127, false, false);
GemmTest(1, 6, 63, 127, false, true);
GemmTest(1, 7, 63, 127, true, false);
GemmTest(1, 17, 63, 127, true, true);
}
TEST(GEMMTest, UnalignedWithBatch) {
GemmTest(3, 1, 63, 127);
GemmTest(3, 2, 63, 127);
GemmTest(3, 3, 63, 127);
GemmTest(3, 4, 63, 127);
GemmTest(3, 5, 63, 127);
GemmTest(3, 6, 63, 127);
GemmTest(3, 7, 63, 127);
GemmTest(3, 17, 63, 127);
GemmTest(3, 1, 63, 127, false, false);
GemmTest(3, 2, 63, 127, false, true);
GemmTest(3, 3, 63, 127, true, false);
GemmTest(3, 4, 63, 127, true, true);
GemmTest(3, 5, 63, 127, false, false);
GemmTest(3, 6, 63, 127, false, true);
GemmTest(3, 7, 63, 127, true, false);
GemmTest(3, 17, 63, 127, true, true);
}
TEST(GEMMTest, gemv) {
......
......@@ -20,6 +20,8 @@
#endif
#include <algorithm>
#include <utility>
#include <functional>
#include <memory>
#include <string>
#include <vector>
......@@ -36,14 +38,39 @@
namespace mace {
namespace kernels {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct MatMulFunctor {
MaceStatus operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
index_t batch;
index_t height;
index_t K;
index_t width;
index_t rank = A->dim_size();
height = A->dim(rank - 2);
K = A->dim(rank - 1);
if (transpose_a) {
std::swap(height, K);
}
if (transpose_b) {
width = B->dim(rank - 2);
} else {
width = B->dim(rank - 1);
}
batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
MACE_RETURN_IF_ERROR(C->Resize(c_shape));
Tensor::MappingGuard guarda(A);
......@@ -53,28 +80,27 @@ struct MatMulFunctor {
const T *b_ptr_base = B->data<T>();
T *c_ptr_base = C->mutable_data<T>();
const index_t batch = C->dim(0);
const index_t height = C->dim(1);
const index_t width = C->dim(2);
const index_t K = A->dim(2);
// It is better to use large block size if it fits for fast cache.
// Assume l1 cache size is 32k, we load three blocks at a time (A, B, C),
// the block size should be sqrt(32k / sizeof(T) / 3).
memset(c_ptr_base, 0, batch * height * width * sizeof(T));
Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base);
Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base,
transpose_a, transpose_b);
return MACE_SUCCESS;
}
};
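
The functor treats the last two dimensions as the matrix and folds all leading dimensions into the batch. A small self-contained sketch of the same shape inference, under assumed shapes:

    #include <cstdint>
    #include <functional>
    #include <numeric>
    #include <utility>
    #include <vector>

    int main() {
      // Assumed shapes: A = {2, 3, 32, 64} with transpose_a = true,
      //                 B = {2, 3, 32, 128} with transpose_b = false.
      std::vector<int64_t> a_shape = {2, 3, 32, 64};
      std::vector<int64_t> b_shape = {2, 3, 32, 128};
      const int64_t rank = a_shape.size();
      int64_t height = a_shape[rank - 2];       // 32
      int64_t K = a_shape[rank - 1];            // 64
      std::swap(height, K);                     // transpose_a: height = 64, K = 32
      const int64_t width = b_shape[rank - 1];  // 128
      const int64_t batch =
          std::accumulate(a_shape.begin(), a_shape.end() - 2, int64_t{1},
                          std::multiplies<int64_t>());  // 2 * 3 = 6
      // C keeps the leading dims: C = {2, 3, 64, 128}.
      return 0;
    }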
#ifdef MACE_ENABLE_OPENCL
template<typename T>
template <typename T>
struct MatMulFunctor<DeviceType::GPU, T> {
MaceStatus operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future);
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
......
......@@ -134,7 +134,11 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
if (buffer->dim_size() < 4) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(1));
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
}
}
b2f_kernel.setArg(idx++, *(image->opencl_image()));
......
......@@ -76,19 +76,27 @@ void CalWinogradFilterImageShape(
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
std::vector<index_t> padded_shape = shape;
while (padded_shape.size() < 4) {
padded_shape.push_back(1);
}
MACE_CHECK(padded_shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[2] * shape[3];
(*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
(*image_shape)[0] = padded_shape[2] * padded_shape[3];
(*image_shape)[1] = padded_shape[0] * RoundUpDiv4(padded_shape[1]);
}
// [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
std::vector<index_t> padded_shape = shape;
while (padded_shape.size() < 4) {
padded_shape.push_back(1);
}
MACE_CHECK(padded_shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
(*image_shape)[1] = shape[0] * shape[1];
(*image_shape)[0] = RoundUpDiv4(padded_shape[2]) * padded_shape[3];
(*image_shape)[1] = padded_shape[0] * padded_shape[1];
}
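
Padding rank-3 shapes with trailing 1s keeps the original formulas intact. A worked example: a rank-3 MatMul operand {16, 128, 3969} is padded to {16, 128, 3969, 1}, so CalInOutHeightImageShape produces image_shape = [3969 * 1, 16 * RoundUpDiv4(128)] = [3969, 512], matching the [W * C, N * RoundUp<4>(H)] layout in the comment above.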
// [Ic * H * W, (Oc + 3) / 4]
......@@ -150,10 +158,10 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type) {
if (type == WINOGRAD_FILTER) {
return {16, shape[0], shape[1], 1};
return {16, shape[0], shape[1]};
} else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
return {16, shape[3], out_width, 1};
return {16, shape[3], out_width};
} else {
LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>();
......
......@@ -122,7 +122,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
if (buffer->dim_size() < 4) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(1));
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
}
}
b2f_kernel.setArg(idx++, *(image->opencl_image()));
......
......@@ -24,17 +24,27 @@ template <typename T>
MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
MACE_CHECK(!transpose_a && !transpose_b,
"GPU does not support transpose matmul");
index_t rank = A->dim_size();
index_t height = A->dim(rank - 2);
index_t K = A->dim(rank - 1);
index_t width = B->dim(rank - 1);
index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t batch = C->dim(0);
const index_t height = C->dim(1);
const index_t width = C->dim(2);
const index_t height_blocks = RoundUpDiv4(height);
const index_t width_blocks = RoundUpDiv4(width);
const uint32_t gws[2] = {
......@@ -82,13 +92,12 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
kernel_.setArg(idx++, static_cast<int>(K));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", C->dim(0), C->dim(1),
C->dim(2), C->dim(3));
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -74,7 +74,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width, 1};
output_shape = {16, input_tensor->dim(3), out_width};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
......@@ -104,7 +104,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
output_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -25,24 +25,37 @@ template <DeviceType D, class T>
class MatMulOp : public Operator<D, T> {
public:
MatMulOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
: Operator<D, T>(operator_def, ws),
transpose_a_(OperatorBase::GetOptionalArg<bool>("transpose_a", false)),
transpose_b_(OperatorBase::GetOptionalArg<bool>("transpose_b", false)) {
}
MaceStatus Run(StatsFuture *future) override {
const Tensor *A = this->Input(0);
const Tensor *B = this->Input(1);
Tensor *C = this->Output(0);
MACE_CHECK(A->dim_size() == 4 && 4 == B->dim_size())
<< "The dimension of A and B should be 4";
MACE_CHECK(A->dim(0) == B->dim(0)) << "A and B must have same batch size";
MACE_CHECK(A->dim(2) == B->dim(1))
<< "the number of A's column " << A->dim(2)
<< " must be equal to B's row " << B->dim(1);
return functor_(A, B, C, future);
const Tensor *A = this->Input(INPUT_A);
const Tensor *B = this->Input(INPUT_B);
Tensor *C = this->Output(OUTPUT);
MACE_CHECK(A->dim_size() == B->dim_size() && A->dim_size() >= 2,
"rank(A) should be equal to rank(B), rank should be greater "
"than or equal to 2");
index_t rank = A->dim_size();
for (index_t i = 0; i < rank - 2; ++i) {
MACE_CHECK(A->dim(i) == B->dim(i), "batch dimensions are not equal");
}
index_t ak = transpose_a_ ? A->dim(rank - 2) : A->dim(rank - 1);
index_t bk = transpose_b_ ? B->dim(rank - 1) : B->dim(rank - 2);
MACE_CHECK(ak == bk, "the number of A's column ", ak,
" must be equal to B's row ", bk);
return functor_(A, B, C, transpose_a_, transpose_b_, future);
}
private:
MACE_OP_INPUT_TAGS(INPUT_A, INPUT_B);
MACE_OP_OUTPUT_TAGS(OUTPUT);
kernels::MatMulFunctor<D, T> functor_;
bool transpose_a_;
bool transpose_b_;
};
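
To make the check concrete, a worked example under assumed shapes: with A = {4, 5, 7} and transpose_a set, ak = A->dim(1) = 5; with B = {4, 6, 5} and transpose_b set, bk = B->dim(2) = 5, so the product is valid and the functor produces C = {4, 7, 6} (op(A) is 7x5, op(B) is 5x6).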
} // namespace ops
......
......@@ -31,8 +31,8 @@ void MatMulBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("A", {batch, height, channels, 1});
net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1});
net.AddRandomInput<D, float>("A", {batch, height, channels});
net.AddRandomInput<D, float>("B", {batch, channels, out_width});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
......@@ -65,6 +65,41 @@ void MatMulBenchmark(
}
net.Sync();
}
template <DeviceType D, typename T>
void MatMulTransposeBenchmark(
int iters, int batch, int height, int channels, int out_width) {
mace::testing::StopTiming();
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("A", {batch, height, channels});
net.AddRandomInput<D, float>("B", {batch, out_width, channels});
if (D == DeviceType::CPU) {
OpDefBuilder("MatMul", "MatMulBM")
.Input("A")
.Input("B")
.AddIntArg("transpose_b", 1)
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
}
net.Sync();
}
} // namespace
#define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
......@@ -83,6 +118,20 @@ void MatMulBenchmark(
MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU); \
MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU);
#define MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE) \
static void MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
MatMulTransposeBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
MACE_BENCHMARK(MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
#define MACE_BM_MATMUL_TRANSPOSE(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);
MACE_BM_MATMUL(16, 32, 128, 49);
MACE_BM_MATMUL(16, 32, 128, 961);
MACE_BM_MATMUL(16, 32, 128, 3969);
......@@ -90,6 +139,13 @@ MACE_BM_MATMUL(16, 128, 128, 49);
MACE_BM_MATMUL(16, 128, 128, 961);
MACE_BM_MATMUL(16, 128, 128, 3969);
MACE_BM_MATMUL_TRANSPOSE(16, 32, 128, 49);
MACE_BM_MATMUL_TRANSPOSE(16, 32, 128, 961);
MACE_BM_MATMUL_TRANSPOSE(16, 32, 128, 3969);
MACE_BM_MATMUL_TRANSPOSE(16, 128, 128, 49);
MACE_BM_MATMUL_TRANSPOSE(16, 128, 128, 961);
MACE_BM_MATMUL_TRANSPOSE(16, 128, 128, 3969);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -72,46 +72,46 @@ void Simple(const std::vector<index_t> &A_shape,
} // namespace
TEST_F(MatMulOpTest, SimpleCPU) {
Simple<DeviceType::CPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64});
Simple<DeviceType::CPU>({1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 3, 2},
{1, 2, 3, 4, 5, 6}, {1, 2, 2}, {22, 28, 49, 64});
Simple<DeviceType::CPU>(
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
}
TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
Simple<DeviceType::CPU>({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 2, 2, 1}, {22, 28, 49, 64, 22, 28, 49, 64});
Simple<DeviceType::CPU>({2, 2, 3}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 3, 2}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 2, 2}, {22, 28, 49, 64, 22, 28, 49, 64});
}
TEST_F(MatMulOpTest, SimpleOPENCL) {
Simple<DeviceType::GPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64});
Simple<DeviceType::GPU>({1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 3, 2},
{1, 2, 3, 4, 5, 6}, {1, 2, 2}, {22, 28, 49, 64});
Simple<DeviceType::GPU>(
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
}
TEST_F(MatMulOpTest, SimpleGPUWithBatch) {
Simple<DeviceType::CPU>({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 2, 2, 1}, {22, 28, 49, 64, 22, 28, 49, 64});
Simple<DeviceType::CPU>({2, 2, 3}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 3, 2}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 2, 2}, {22, 28, 49, 64, 22, 28, 49, 64});
}
namespace {
template <typename T>
void Complex(const index_t batch,
void Complex(const std::vector<index_t> &batch,
const index_t height,
const index_t channels,
const index_t out_width) {
......@@ -119,23 +119,14 @@ void Complex(const index_t batch,
// Construct graph
OpsTestNet net;
OpDefBuilder("MatMul", "MatMulTest")
.Input("A")
.Input("B")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("A", {batch, height, channels, 1});
net.AddRandomInput<DeviceType::GPU, float>("B",
{batch, channels, out_width, 1});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
index_t batch_count = std::accumulate(batch.begin(), batch.end(), 1,
std::multiplies<index_t>());
net.AddRandomInput<DeviceType::GPU, float>("A",
{batch_count, height, channels});
net.AddRandomInput<DeviceType::GPU, float>(
"B", {batch_count, channels, out_width});
// Run on opencl
BufferToImage<DeviceType::GPU, T>(&net, "A", "AImage",
......@@ -150,11 +141,40 @@ void Complex(const index_t batch,
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_HEIGHT);
// run cpu
std::vector<index_t> shape_a = batch;
shape_a.push_back(height);
shape_a.push_back(channels);
std::vector<index_t> shape_b = batch;
shape_b.push_back(channels);
shape_b.push_back(out_width);
std::vector<index_t> expected_output_shape = batch;
expected_output_shape.push_back(height);
expected_output_shape.push_back(out_width);
net.GetTensor("A")->Reshape(shape_a);
net.GetTensor("B")->Reshape(shape_b);
OpDefBuilder("MatMul", "MatMulTest")
.Input("A")
.Input("B")
.Output("Output")
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape());
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
expected.Reshape({batch_count, height, out_width});
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1);
......@@ -166,28 +186,36 @@ void Complex(const index_t batch,
} // namespace
TEST_F(MatMulOpTest, OPENCLAlignedWithoutBatch) {
Complex<float>(1, 64, 128, 32);
Complex<float>(1, 64, 32, 128);
Complex<float>({1}, 64, 128, 32);
Complex<float>({1}, 64, 32, 128);
Complex<float>({2, 3}, 64, 32, 128);
}
TEST_F(MatMulOpTest, OPENCLUnAlignedWithoutBatch) {
Complex<float>(1, 31, 113, 61);
Complex<float>(1, 113, 31, 73);
Complex<float>({1}, 31, 113, 61);
Complex<float>({1}, 113, 31, 73);
Complex<float>({2, 3}, 113, 31, 73);
}
TEST_F(MatMulOpTest, OPENCLUnAlignedWithBatch) {
Complex<float>(2, 3, 3, 3);
Complex<float>(16, 31, 61, 67);
Complex<float>(31, 31, 61, 67);
Complex<float>({2}, 3, 3, 3);
Complex<float>({16}, 31, 61, 67);
Complex<float>({31}, 31, 61, 67);
Complex<float>({2, 3}, 31, 61, 67);
}
TEST_F(MatMulOpTest, OPENCLHalfAlignedWithoutBatch) {
Complex<half>(1, 64, 128, 32);
Complex<half>(1, 64, 32, 128);
Complex<half>({1}, 64, 128, 32);
Complex<half>({1}, 64, 32, 128);
Complex<half>({2, 3}, 64, 32, 128);
}
TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex<half>(2, 31, 113, 61);
Complex<half>(16, 32, 64, 64);
Complex<half>(31, 31, 61, 67);
Complex<half>({2}, 31, 113, 61);
Complex<half>({16}, 32, 64, 64);
Complex<half>({31}, 31, 61, 67);
Complex<half>({2, 3}, 31, 61, 67);
}
// TODO(liyin): test transpose after implementing gpu runtime
// now transpose test is in kernels_test
} // namespace test
} // namespace ops
} // namespace mace
......@@ -518,7 +518,7 @@ class Transformer(base_converter.ConverterInterface):
wt_output_width = batch * (
(out_height + 1) / 2) * ((out_width + 1) / 2)
wt_output_shape.dims.extend(
[16, in_channels, wt_output_width, 1])
[16, in_channels, wt_output_width])
if ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str) \
......@@ -543,7 +543,7 @@ class Transformer(base_converter.ConverterInterface):
matmul_op.output.extend([matmul_op.name])
matmul_output_shape = matmul_op.output_shape.add()
matmul_output_shape.dims.extend(
[16, out_channels, wt_output_width, 1])
[16, out_channels, wt_output_width])
arg = matmul_op.arg.add()
arg.name = MaceKeyword.mace_winograd_filter_transformed
......
......@@ -167,7 +167,7 @@ class GPUMemoryOptimizer(MemoryOptimizer):
def get_op_mem_block(self, op_type, output_shape):
mem_block = [0, 0]
if op_type == 'WinogradTransform' or op_type == 'MatMul':
mem_block[0] = output_shape[2] * output_shape[3]
mem_block[0] = output_shape[2]
mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
else:
mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
......
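
A worked example of the updated GPU memory block: for a 3-D MatMul output shape [16, 128, 3969], mem_block = [3969, 16 * int((128 + 3) / 4)] = [3969, 512], which matches the image shape that CalInOutHeightImageShape computes for the same tensor after padding.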