提交 57a3298d 编写于 作者: 李滨

Merge branch 'pack' into 'master'

Pack matmul to improve performance

See merge request !789
......@@ -85,7 +85,7 @@ ndk_versions_compatible_tests:
- DEFAULT_NDK_PATH=$ANDROID_NDK_HOME
- prefix_path=${DEFAULT_NDK_PATH%android-ndk-*}
- >
for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b;
for ndk in android-ndk-r15c android-ndk-r16 android-ndk-r17b;
do
new_ndk_path=${prefix_path}${ndk};
if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then
......
......@@ -399,6 +399,10 @@ class Tensor {
zero_point_ = zero_point;
}
inline void SetIsWeight(bool is_weight) {
is_weight_ = is_weight;
}
private:
Allocator *allocator_;
DataType dtype_;
......@@ -409,7 +413,7 @@ class Tensor {
bool is_buffer_owner_;
bool unused_;
std::string name_;
const bool is_weight_;
bool is_weight_;
float scale_;
int32_t zero_point_;
......
......@@ -33,7 +33,8 @@ int main(int argc, char **argv) {
// config runtime
mace::MaceStatus status = mace::SetOpenMPThreadsAndAffinityPolicy(
FLAGS_omp_num_threads,
static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
true);
if (status != mace::MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed.";
}
......
......@@ -13,11 +13,13 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <vector>
#include <memory>
#include <random>
#include "mace/core/types.h"
#include "mace/kernels/gemm.h"
#include "mace/kernels/sgemm.h"
namespace mace {
......@@ -72,6 +74,74 @@ void GemvTest(index_t batch, index_t N, index_t M) {
}
}
// Runs kernels::SGemm on random inputs and compares the result with
// kernels::GemmRef.
//
// A is allocated with batch * N * K floats and B with batch * K * M floats;
// both are filled from a normal distribution (mean 0, stddev 1). When a
// transpose flag is set, the matching buffer is mapped with its row/col
// extents swapped and the map is transposed before being handed to SGemm;
// GemmRef receives the same flags directly.
//
// The comparison covers all batch * N * M outputs with an absolute tolerance
// of 0.1.
// NOTE(review): this assumes GemmRef fills every batch of C_ref (it is given
// `batch`) - confirm against its implementation.
void SGemmTest(index_t batch,
               index_t N,
               index_t K,
               index_t M,
               bool transpose_a,
               bool transpose_b) {
  const index_t lhs_size = batch * N * K;
  const index_t rhs_size = batch * K * M;
  const index_t out_size = batch * N * M;

  std::unique_ptr<float[]> A(new float[lhs_size]);
  std::unique_ptr<float[]> B(new float[rhs_size]);
  std::unique_ptr<float[]> C(new float[out_size]);
  std::unique_ptr<float[]> C_ref(new float[out_size]);

  std::random_device rd;
  std::mt19937 gen(rd());
  std::normal_distribution<float> nd(0, 1);
  std::generate(A.get(), A.get() + lhs_size, [&gen, &nd] { return nd(gen); });
  std::generate(B.get(), B.get() + rhs_size, [&gen, &nd] { return nd(gen); });

  kernels::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get(), transpose_a,
                   transpose_b);

  // Build each operand map in one step so no default-constructed (and thus
  // uninitialized) MatrixMap is ever created.
  typedef kernels::MatrixMap<const float> ConstMap;
  const ConstMap matrix_a =
      transpose_a
          ? ConstMap(batch, K, N, kernels::RowMajor, A.get()).transpose()
          : ConstMap(batch, N, K, kernels::RowMajor, A.get());
  const ConstMap matrix_b =
      transpose_b
          ? ConstMap(batch, M, K, kernels::RowMajor, B.get()).transpose()
          : ConstMap(batch, K, M, kernels::RowMajor, B.get());
  kernels::MatrixMap<float> matrix_c(batch, N, M, kernels::RowMajor, C.get());

  kernels::SGemm sgemm;
  sgemm(matrix_a, matrix_b, &matrix_c);

  // Verify the outputs of every batch, i.e. all batch * N * M elements.
  for (index_t i = 0; i < out_size; ++i) {
    EXPECT_NEAR(C_ref[i], C[i], 0.1);
  }
}
} // namespace
TEST(GEMMTest, AlignedWithoutBatch) {
......@@ -114,4 +184,25 @@ TEST(GEMMTest, gemv) {
GemvTest(3, 17, 63);
}
namespace {
// Runs SGemmTest for all four transpose_a / transpose_b combinations of one
// (batch, N, K, M) problem size.
void TestSGemmTranspose(index_t batch, index_t N, index_t K, index_t M) {
  const bool flags[] = {false, true};
  // transpose_b is the outer loop so the calls are issued in the order
  // (transpose_a, transpose_b) = (F, F), (T, F), (F, T), (T, T).
  for (const bool transpose_b : flags) {
    for (const bool transpose_a : flags) {
      SGemmTest(batch, N, K, M, transpose_a, transpose_b);
    }
  }
}
}
// Sweeps N, K and M independently over {1, 5, 14, 31, 47} and, for every
// combination, runs all transpose variants with batch 1 and then batch 16.
TEST(SGEMMTest, UnalignedWithoutBatch) {
  const std::vector<index_t> sizes{1, 5, 14, 31, 47};
  for (const index_t rows : sizes) {
    for (const index_t depth : sizes) {
      for (const index_t cols : sizes) {
        TestSGemmTranspose(1, rows, depth, cols);
        TestSGemmTranspose(16, rows, depth, cols);
      }
    }
  }
}
} // namespace mace
......@@ -32,6 +32,7 @@
#include "mace/kernels/kernel.h"
#include "mace/utils/utils.h"
#include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/sgemm.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -83,39 +84,34 @@ struct MatMulFunctor : OpKernel {
const T *b_ptr_base = B->data<T>();
T *c_ptr_base = C->mutable_data<T>();
memset(c_ptr_base, 0, batch * height * width * sizeof(T));
if (height == 1 && width > 1 && B->is_weight()) {
// A * B = (B^T * A^T)^T
if (!transpose_b) {
if (B_transpose_.get() == nullptr) {
B_transpose_.reset(new Tensor(context_->device()->allocator(),
DataTypeToEnum<T>::v()));
B_transpose_->Resize({batch, width, K});
Tensor::MappingGuard guardbt(B_transpose_.get());
T *bt_ptr_base = B_transpose_->mutable_data<T>();
Transpose(b_ptr_base, K, width, width, bt_ptr_base);
}
Tensor::MappingGuard guardbt(B_transpose_.get());
T *bt_ptr_base = B_transpose_->mutable_data<T>();
Gemv(bt_ptr_base, a_ptr_base, batch, K, width, c_ptr_base);
} else {
Gemv(b_ptr_base, a_ptr_base, batch, K, width, c_ptr_base);
}
} else {
Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base,
transpose_a, transpose_b);
}
const index_t height_a = A->dim(rank - 2);
const index_t width_a = A->dim(rank - 1);
const index_t height_b = B->dim(rank - 2);
const index_t width_b = B->dim(rank - 1);
sgemm_.Run(a_ptr_base,
b_ptr_base,
batch,
height_a,
width_a,
height_b,
width_b,
transpose_a,
transpose_b,
A->is_weight(),
B->is_weight(),
c_ptr_base,
context_->workspace()->GetScratchBuffer(D));
return MACE_SUCCESS;
}
std::unique_ptr<Tensor> B_transpose_;
SGemm sgemm_;
};
template <>
struct MatMulFunctor<CPU, uint8_t> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
template<gemmlowp::MapOrder AOrder, gemmlowp::MapOrder BOrder>
void MatMulImpl(const Tensor *A,
const Tensor *B,
......@@ -213,6 +209,7 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
template <typename T>
struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
......
......@@ -22,6 +22,7 @@
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/gemm.h"
#include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/sgemm.h"
namespace gemmlowp {
......@@ -107,6 +108,28 @@ void MatmulBenchmark_Mace(int iters, int m, int k, int n) {
}
}
void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) {
mace::testing::StopTiming();
std::vector<float> lhs(m * k);
std::vector<float> rhs(k * n);
std::vector<float> result(m * n);
kernels::MatrixMap<const float> matrix_lhs(1, m, k, RowMajor, lhs.data(),
true);
kernels::MatrixMap<const float> matrix_rhs(1, k, n, RowMajor, rhs.data(),
true);
kernels::MatrixMap<float> matrix_result(1, m, n, RowMajor, result.data());
kernels::SGemm sgemm;
sgemm(matrix_lhs, matrix_rhs, &matrix_result);
mace::testing::StartTiming();
while (iters--) {
sgemm(matrix_lhs, matrix_rhs, &matrix_result);
}
}
void MatmulBenchmark_Eigen(int iters, int m, int k, int n) {
mace::testing::StopTiming();
Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(m, k);
......@@ -202,6 +225,7 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
// Expands MACE_BM_MATMUL_FUNC once per benchmarked implementation for a single
// (M, K, N) problem size: Mace, Mace_SGemm and Eigen with float, and
// gemmlowp_uint8 / gemmlowp_int32 with uint8_t.
#define MACE_BM_MATMUL(M, K, N) \
MACE_BM_MATMUL_FUNC(M, K, N, Mace, float); \
MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float); \
MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float); \
MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \
MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_int32, uint8_t);
......@@ -215,15 +239,43 @@ MACE_BM_MATMUL(15, 384, 384);
MACE_BM_MATMUL(15, 384, 1536);
MACE_BM_MATMUL(15, 1536, 384);
MACE_BM_MATMUL(1, 384, 384);
MACE_BM_MATMUL(1, 384, 1536);
MACE_BM_MATMUL(1, 1536, 384);
MACE_BM_MATMUL(1, 384, 44678);
MACE_BM_MATMUL(1, 256, 256);
MACE_BM_MATMUL(1, 256, 1536);
MACE_BM_MATMUL(1, 1536, 256);
MACE_BM_MATMUL(256, 256, 1);
MACE_BM_MATMUL(1536, 256, 1);
MACE_BM_MATMUL(256, 1536, 1);
MACE_BM_MATMUL(29792, 256, 1);
MACE_BM_MATMUL(1, 256, 29792);
MACE_BM_MATMUL(2, 256, 256);
MACE_BM_MATMUL(2, 256, 1536);
MACE_BM_MATMUL(2, 1536, 256);
MACE_BM_MATMUL(3, 256, 256);
MACE_BM_MATMUL(3, 256, 1536);
MACE_BM_MATMUL(3, 1536, 256);
MACE_BM_MATMUL(4, 256, 256);
MACE_BM_MATMUL(4, 256, 1536);
MACE_BM_MATMUL(4, 1536, 256);
MACE_BM_MATMUL(8, 256, 256);
MACE_BM_MATMUL(8, 256, 1536);
MACE_BM_MATMUL(8, 1536, 256);
MACE_BM_MATMUL(10, 256, 256);
MACE_BM_MATMUL(10, 256, 1536);
MACE_BM_MATMUL(10, 1536, 256);
MACE_BM_MATMUL(15, 256, 256);
MACE_BM_MATMUL(15, 256, 1536);
MACE_BM_MATMUL(15, 1536, 256);
// Embedding size 128
MACE_BM_MATMUL(1, 128, 1536);
MACE_BM_MATMUL(1, 128, 44678);
// MobileNet
MACE_BM_MATMUL(128, 128, 3136);
MACE_BM_MATMUL(256, 256, 784);
MACE_BM_MATMUL(512, 512, 196);
MACE_BM_MATMUL(1024, 1024, 49);
} // namespace test
} // namespace kernels
} // namespace mace
此差异已折叠。
......@@ -15,6 +15,9 @@
#ifndef MACE_KERNELS_SGEMM_H_
#define MACE_KERNELS_SGEMM_H_
#include <memory>
#include <utility>
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
......@@ -34,22 +37,29 @@ enum Major {
template<typename T>
class MatrixMap {
public:
MatrixMap(const index_t row,
MatrixMap() {}
MatrixMap(const index_t batch,
const index_t row,
const index_t col,
const Major major,
T *data) :
T *data,
const bool is_const = false) :
batch_(batch),
row_(row),
col_(col),
stride_(major == RowMajor ? col : row),
major_(major),
data_(data) {}
MatrixMap<T> transpose(const MatrixMap<T> &matrix_map) {
Major transpose_major = matrix_map.major_ == RowMajor ? ColMajor : RowMajor;
return MatrixMap<T>(matrix_map.col_,
matrix_map.row_,
transpose_major,
matrix_map.data_);
data_(data),
is_const_(is_const) {}
MatrixMap transpose() const {
Major transpose_major = major_ == RowMajor ? ColMajor : RowMajor;
return MatrixMap(batch_, col_, row_, transpose_major, data_, is_const_);
}
index_t batch() const {
return batch_;
}
index_t row() const {
......@@ -72,66 +82,100 @@ class MatrixMap {
return data_;
}
T *data(int row, int col) const {
return data_ + row * stride_ + col;
T *batch_data(index_t batch) const {
return data_ + batch * row_ * col_;
}
index_t size() const {
return batch_ * row_ * col_;
}
bool is_const() const {
return is_const_;
}
private:
index_t batch_;
index_t row_;
index_t col_;
index_t stride_;
Major major_;
T *data_;
bool is_const_;
};
typedef Major PackOrder;
template<typename T>
class PackedBlock {
public:
PackedBlock() : data_tensor_(GetCPUAllocator(),
DataTypeToEnum<T>::v()) {}
const T *data() {
return data_tensor_.data<T>();
}
T *mutable_data() {
return data_tensor_.mutable_data<T>();
}
Tensor *tensor() {
return &data_tensor_;
}
private:
Tensor data_tensor_;
};
typedef Tensor PackedBlock;
class SGemm {
public:
void operator()(const MatrixMap<float> &lhs,
const MatrixMap<float> &rhs,
MatrixMap<float> *result);
void operator()(const PackedBlock<float> &lhs,
const PackedBlock<float> &rhs,
const index_t height,
const index_t depth,
const index_t width,
PackedBlock<float> *result);
void PackLhs(const MatrixMap<float> &lhs, PackedBlock<float> *packed_block);
void PackRhs(const MatrixMap<float> &rhs, PackedBlock<float> *packed_block);
void UnPack(const PackedBlock<float> &packed_result,
SGemm()
: packed_lhs_(nullptr),
packed_rhs_(nullptr),
packed_(false) {}
void operator()(const MatrixMap<const float> &lhs,
const MatrixMap<const float> &rhs,
MatrixMap<float> *result,
ScratchBuffer *scratch_buffer = nullptr);
void Run(const float *A,
const float *B,
const index_t batch,
const index_t height_a,
const index_t width_a,
const index_t height_b,
const index_t width_b,
const bool transpose_a,
const bool transpose_b,
const bool is_a_weight,
const bool is_b_weight,
float *C,
ScratchBuffer *scratch_buffer = nullptr);
void PackLhs(const MatrixMap<const float> &lhs,
PackedBlock *packed_block);
void PackRhs(const MatrixMap<const float> &rhs,
PackedBlock *packed_block);
void UnPack(const PackedBlock &packed_result,
MatrixMap<float> *matrix_map);
private:
void Pack(const MatrixMap<float> &src,
void Pack(const MatrixMap<const float> &src,
const PackOrder order,
PackedBlock<float> *packed_block);
PackedBlock *packed_block);
void PackPerBatch(const MatrixMap<const float> &src,
const PackOrder order,
const index_t batch_index,
float *packed_data);
void UnPackPerBatch(const float *packed_data,
const index_t batch_index,
MatrixMap<float> *matrix_map);
void RunInternal(const PackedBlock &lhs,
const PackedBlock &rhs,
const index_t batch,
const index_t height,
const index_t depth,
const index_t width,
PackedBlock *result);
void RunPerBatch(const float *lhs,
const float *rhs,
const index_t height,
const index_t depth,
const index_t width,
float *result);
std::unique_ptr<Tensor> packed_lhs_;
std::unique_ptr<Tensor> packed_rhs_;
std::unique_ptr<Tensor> packed_result_;
bool packed_;
};
} // namespace kernels
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <random>
#include <vector>
#include "mace/kernels/sgemm.h"
namespace mace {
namespace kernels {
namespace test {
namespace {
void TestPack(const std::vector<float> &data,
const std::vector<float> &expected_data,
const index_t height,
const index_t width,
Major src_order,
PackOrder pack_order) {
SGemm sg;
MatrixMap<const float> src_matrix(1, height, width, src_order, data.data());
PackedBlock packed;
packed.Resize({height, width});
if (pack_order == PackOrder::ColMajor) {
sg.PackLhs(src_matrix, &packed);
} else {
sg.PackRhs(src_matrix, &packed);
}
auto packed_data = packed.data<float>();
for (index_t i = 0; i < packed.size(); ++i) {
EXPECT_EQ(expected_data[i], packed_data[i]);
}
}
// Round-trip check: fills a single-batch height x width matrix with random
// values, packs it, unpacks it again and expects the original values back
// exactly. PackOrder::ColMajor packs through SGemm::PackLhs; any other order
// packs through SGemm::PackRhs. The unpacked map uses the same `src_order` as
// the source map.
void TestUnPack(const index_t height,
                const index_t width,
                Major src_order,
                PackOrder pack_order) {
  const index_t matrix_size = height * width;

  // Random input comes from <random>, which this file already includes, so
  // the test does not depend on POSIX-only rand_r() or on time().
  std::random_device rd;
  std::mt19937 gen(rd());
  std::normal_distribution<float> nd(0, 1);
  std::vector<float> data(matrix_size);
  std::generate(data.begin(), data.end(), [&gen, &nd] { return nd(gen); });

  MatrixMap<const float> src_matrix(1, height, width, src_order, data.data());
  PackedBlock packed;
  packed.Resize({height, width});
  SGemm sg;
  if (pack_order == PackOrder::ColMajor) {
    sg.PackLhs(src_matrix, &packed);
  } else {
    sg.PackRhs(src_matrix, &packed);
  }

  std::vector<float> unpacked(matrix_size);
  MatrixMap<float>
      unpacked_matrix(1, height, width, src_order, unpacked.data());
  sg.UnPack(packed, &unpacked_matrix);

  // Both vectors hold exactly matrix_size elements, so bound the comparison
  // by that size.
  for (index_t i = 0; i < matrix_size; ++i) {
    EXPECT_EQ(data[i], unpacked[i]);
  }
}
} // namespace
TEST(SGemmPackTest, Pack) {
std::vector<float> data =
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36};
// For no-transpose lhs
TestPack(data,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
3, 4, Major::RowMajor, PackOrder::ColMajor);
#if defined(MACE_ENABLE_NEON)
TestPack(data,
{1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16},
4, 4, Major::RowMajor, PackOrder::ColMajor);
TestPack(data,
{1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19,
20},
5, 4, Major::RowMajor, PackOrder::ColMajor);
#if defined(__aarch64__)
TestPack(data,
{1, 5, 9, 13, 17, 21, 25, 29, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11,
15, 19, 23, 27, 31, 4, 8, 12, 16, 20, 24, 28, 32, 33, 34, 35, 36},
9, 4, Major::RowMajor, PackOrder::ColMajor);
#endif
#endif
// For transpose-needed lhs
TestPack(data,
{1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12},
3, 4, Major::ColMajor, PackOrder::ColMajor);
#if defined(MACE_ENABLE_NEON)
TestPack(data,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
4, 4, Major::ColMajor, PackOrder::ColMajor);
TestPack(data,
{1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15,
20},
5, 4, Major::ColMajor, PackOrder::ColMajor);
#if defined(__aarch64__)
TestPack(data,
{1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21,
22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 9, 18, 27, 36},
9, 4, Major::ColMajor, PackOrder::ColMajor);
#endif
#endif
// For no-transpose rhs
TestPack(data,
{1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12},
4, 3, Major::RowMajor, PackOrder::RowMajor);
#if defined(MACE_ENABLE_NEON)
TestPack(data,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
4, 4, Major::RowMajor, PackOrder::RowMajor);
TestPack(data,
{1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15,
20},
4, 5, Major::RowMajor, PackOrder::RowMajor);
#endif
// For transpose-needed rhs
TestPack(data,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
4, 3, Major::ColMajor, PackOrder::RowMajor);
#if defined(MACE_ENABLE_NEON)
TestPack(data,
{1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16},
4, 4, Major::ColMajor, PackOrder::RowMajor);
TestPack(data,
{1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19,
20},
4, 5, Major::ColMajor, PackOrder::RowMajor);
#endif
}
TEST(SGemmPackTest, UnPack) {
TestUnPack(4, 3, Major::RowMajor, PackOrder::RowMajor);
TestUnPack(4, 4, Major::RowMajor, PackOrder::RowMajor);
TestUnPack(4, 5, Major::RowMajor, PackOrder::RowMajor);
TestUnPack(4, 100, Major::RowMajor, PackOrder::RowMajor);
TestUnPack(4, 3, Major::ColMajor, PackOrder::RowMajor);
TestUnPack(4, 4, Major::ColMajor, PackOrder::RowMajor);
TestUnPack(4, 5, Major::ColMajor, PackOrder::RowMajor);
TestUnPack(4, 100, Major::ColMajor, PackOrder::RowMajor);
}
} // namespace test
} // namespace kernels
} // namespace mace
......@@ -40,7 +40,11 @@ class MatMulOp : public Operator<D, T> {
"than or equal to 2");
index_t rank = A->dim_size();
for (index_t i = 0; i < rank - 2; ++i) {
MACE_CHECK(A->dim(i) == B->dim(i), "batch dimensions are not equal");
MACE_CHECK(A->dim(i) == B->dim(i),
"batch dimensions are not equal: ",
A->dim(i),
" vs. ",
B->dim(i));
}
index_t ak = transpose_a_ ? A->dim(rank - 2) : A->dim(rank - 1);
index_t bk = transpose_b_ ? B->dim(rank - 1) : B->dim(rank - 2);
......
......@@ -33,13 +33,15 @@ void MatMulBenchmark(
// Add input data
net.AddRandomInput<D, T>("A", {batch, height, channels});
net.AddRandomInput<D, T>("B", {batch, channels, out_width});
net.GetTensor("A")->SetIsWeight(true);
net.GetTensor("B")->SetIsWeight(true);
if (DataTypeToEnum<T>::value == DT_UINT8) {
net.GetTensor("A")->SetScale(0.1);
net.GetTensor("B")->SetScale(0.1);
}
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<D, T>(&net, "A", "AImage",
kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<D, T>(&net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT);
......@@ -71,7 +73,7 @@ void MatMulBenchmark(
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Run();
}
net.Sync();
}
......@@ -86,6 +88,8 @@ void MatMulTransposeBenchmark(
// Add input data
net.AddRandomInput<D, T>("A", {batch, height, channels});
net.AddRandomInput<D, T>("B", {batch, out_width, channels});
net.GetTensor("A")->SetIsWeight(true);
net.GetTensor("B")->SetIsWeight(true);
if (DataTypeToEnum<T>::value == DT_UINT8) {
net.GetTensor("A")->SetScale(0.1);
net.GetTensor("B")->SetScale(0.1);
......@@ -116,7 +120,7 @@ void MatMulTransposeBenchmark(
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Run();
}
net.Sync();
}
......@@ -154,10 +158,15 @@ void MatMulTransposeBenchmark(
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU); \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
MACE_BM_MATMUL(1, 128, 128, 49);
MACE_BM_MATMUL(2, 128, 128, 49);
MACE_BM_MATMUL(3, 128, 128, 49);
MACE_BM_MATMUL(4, 128, 128, 49);
MACE_BM_MATMUL(16, 32, 128, 49);
MACE_BM_MATMUL(16, 32, 128, 961);
MACE_BM_MATMUL(16, 32, 128, 3969);
MACE_BM_MATMUL(16, 128, 128, 49);
MACE_BM_MATMUL(16, 49, 128, 128);
MACE_BM_MATMUL(16, 128, 128, 961);
MACE_BM_MATMUL(16, 128, 128, 3969);
......
......@@ -211,8 +211,8 @@ void WinoMatMulBenchmark(
const index_t round_w = (width + block_size - 1) / block_size;
const index_t out_width = round_h * round_w;
// Add input data
net.AddRandomInput<D, float>("A", {batch, out_channels, in_channels, 1});
net.AddRandomInput<D, float>("B", {batch, in_channels, out_width, 1});
net.AddRandomInput<D, float>("A", {batch, out_channels, in_channels});
net.AddRandomInput<D, float>("B", {batch, in_channels, out_width});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册