Merge branch 'pack' into 'master'

Pack matmul to improve performance. See merge request !789

Merge branch 'pack' into 'master'
Pack matmul to improve performance. See merge request !789
57a3298d · 李滨 · b50a6635 · 26592a86 · 57a3298d · 57a3298d
12 changed files
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -85,7 +85,7 @@ ndk_versions_compatible_tests:
    - DEFAULT_NDK_PATH=$ANDROID_NDK_HOME
    - prefix_path=${DEFAULT_NDK_PATH%android-ndk-*}
    - >
-      for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b;
+      for ndk in android-ndk-r15c android-ndk-r16 android-ndk-r17b;
      do
      new_ndk_path=${prefix_path}${ndk};
      if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then

--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -399,6 +399,10 @@ class Tensor {
    zero_point_ = zero_point;
  }

+  // Overwrites the is_weight_ flag of this tensor.
+  inline void SetIsWeight(const bool is_weight) { is_weight_ = is_weight; }
+
 private:
  Allocator *allocator_;
  DataType dtype_;
@@ -409,7 +413,7 @@ class Tensor {
  bool is_buffer_owner_;
  bool unused_;
  std::string name_;
-  const bool is_weight_;
+  bool is_weight_;
  float scale_;
  int32_t zero_point_;


--- a/mace/core/testing/test_benchmark_main.cc
+++ b/mace/core/testing/test_benchmark_main.cc
@@ -33,7 +33,8 @@ int main(int argc, char **argv) {
  // config runtime
  mace::MaceStatus status = mace::SetOpenMPThreadsAndAffinityPolicy(
      FLAGS_omp_num_threads,
-      static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
+      static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
+      true);
  if (status != mace::MACE_SUCCESS) {
    LOG(WARNING) << "Set openmp or cpu affinity failed.";
  }

--- a/mace/kernels/gemm_test.cc
+++ b/mace/kernels/gemm_test.cc
@@ -13,11 +13,13 @@
 // limitations under the License.

 #include <gtest/gtest.h>
+#include <vector>
 #include <memory>
 #include <random>

 #include "mace/core/types.h"
 #include "mace/kernels/gemm.h"
+#include "mace/kernels/sgemm.h"

 namespace mace {

@@ -72,6 +74,74 @@ void GemvTest(index_t batch, index_t N, index_t M) {
  }
 }

+// Runs kernels::SGemm on random inputs and compares every output element,
+// across all batches, with the result written by kernels::GemmRef.
+//
+// A is filled with batch * N * K floats and B with batch * K * M floats. When
+// a transpose flag is set, the matching buffer is wrapped as a row-major
+// (K x N) or (M x K) map and then transposed, so both operands always reach
+// SGemm as (N x K) and (K x M) views.
+void SGemmTest(index_t batch,
+               index_t N,
+               index_t K,
+               index_t M,
+               bool transpose_a,
+               bool transpose_b) {
+  const index_t a_size = batch * N * K;
+  const index_t b_size = batch * K * M;
+  const index_t c_size = batch * N * M;
+
+  std::unique_ptr<float[]> A(new float[a_size]);
+  std::unique_ptr<float[]> B(new float[b_size]);
+  std::unique_ptr<float[]> C(new float[c_size]);
+  std::unique_ptr<float[]> C_ref(new float[c_size]);
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::normal_distribution<float> nd(0, 1);
+
+  std::generate(A.get(), A.get() + a_size,
+                [&gen, &nd] { return nd(gen); });
+  std::generate(B.get(), B.get() + b_size,
+                [&gen, &nd] { return nd(gen); });
+  kernels::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get(), transpose_a,
+                   transpose_b);
+
+  kernels::MatrixMap<const float> matrix_a;
+  if (!transpose_a) {
+    matrix_a = kernels::MatrixMap<const float>(
+        batch, N, K, kernels::RowMajor, A.get());
+  } else {
+    // The buffer is laid out as (K x N); expose it as its transpose.
+    matrix_a = kernels::MatrixMap<const float>(
+        batch, K, N, kernels::RowMajor, A.get()).transpose();
+  }
+
+  kernels::MatrixMap<const float> matrix_b;
+  if (!transpose_b) {
+    matrix_b = kernels::MatrixMap<const float>(
+        batch, K, M, kernels::RowMajor, B.get());
+  } else {
+    // The buffer is laid out as (M x K); expose it as its transpose.
+    matrix_b = kernels::MatrixMap<const float>(
+        batch, M, K, kernels::RowMajor, B.get()).transpose();
+  }
+  kernels::MatrixMap<float> matrix_c(batch, N, M, kernels::RowMajor, C.get());
+
+  kernels::SGemm sgemm;
+  sgemm(matrix_a, matrix_b, &matrix_c);
+
+  // Compare the whole output. Stopping at N * M would only cover the first
+  // batch and leave the batched cases effectively unverified.
+  for (index_t i = 0; i < c_size; ++i) {
+    EXPECT_NEAR(C_ref[i], C[i], 0.1);
+  }
+}
+
 }  // namespace

 TEST(GEMMTest, AlignedWithoutBatch) {
@@ -114,4 +184,25 @@ TEST(GEMMTest, gemv) {
  GemvTest(3, 17, 63);
 }

+namespace {
+// Runs SGemmTest for all four (transpose_a, transpose_b) combinations, in the
+// order (F,F), (T,F), (F,T), (T,T).
+void TestSGemmTranspose(index_t batch, index_t N, index_t K, index_t M) {
+  for (int combo = 0; combo < 4; ++combo) {
+    // Bit 0 selects transpose_a, bit 1 selects transpose_b.
+    const bool transpose_a = (combo & 1) != 0;
+    const bool transpose_b = (combo & 2) != 0;
+    SGemmTest(batch, N, K, M, transpose_a, transpose_b);
+  }
+}
+}  // namespace
+
+TEST(SGEMMTest, UnalignedWithoutBatch) {
+  // Every (N, K, M) triple drawn from this list is run with batch 1 and then
+  // with batch 16.
+  const std::vector<index_t> sizes{1, 5, 14, 31, 47};
+  for (const index_t N : sizes) {
+    for (const index_t K : sizes) {
+      for (const index_t M : sizes) {
+        TestSGemmTranspose(1, N, K, M);
+        TestSGemmTranspose(16, N, K, M);
+      }
+    }
+  }
+}
+
 }  // namespace mace
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -32,6 +32,7 @@
 #include "mace/kernels/kernel.h"
 #include "mace/utils/utils.h"
 #include "mace/kernels/gemmlowp_util.h"
+#include "mace/kernels/sgemm.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -83,39 +84,34 @@ struct MatMulFunctor : OpKernel {
    const T *b_ptr_base = B->data<T>();
    T *c_ptr_base = C->mutable_data<T>();

-    memset(c_ptr_base, 0, batch * height * width * sizeof(T));
-
-    if (height == 1 && width > 1 && B->is_weight()) {
-      // A * B = (B^T * A^T)^T
-      if (!transpose_b) {
-        if (B_transpose_.get() == nullptr) {
-          B_transpose_.reset(new Tensor(context_->device()->allocator(),
-                                        DataTypeToEnum<T>::v()));
-          B_transpose_->Resize({batch, width, K});
-          Tensor::MappingGuard guardbt(B_transpose_.get());
-          T *bt_ptr_base = B_transpose_->mutable_data<T>();
-          Transpose(b_ptr_base, K, width, width, bt_ptr_base);
-        }
-        Tensor::MappingGuard guardbt(B_transpose_.get());
-        T *bt_ptr_base = B_transpose_->mutable_data<T>();
-        Gemv(bt_ptr_base, a_ptr_base, batch, K, width, c_ptr_base);
-      } else {
-        Gemv(b_ptr_base, a_ptr_base, batch, K, width, c_ptr_base);
-      }
-    } else {
-      Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base,
-           transpose_a, transpose_b);
-    }
-
+    const index_t height_a = A->dim(rank - 2);
+    const index_t width_a = A->dim(rank - 1);
+    const index_t height_b = B->dim(rank - 2);
+    const index_t width_b = B->dim(rank - 1);
+
+    sgemm_.Run(a_ptr_base,
+               b_ptr_base,
+               batch,
+               height_a,
+               width_a,
+               height_b,
+               width_b,
+               transpose_a,
+               transpose_b,
+               A->is_weight(),
+               B->is_weight(),
+               c_ptr_base,
+               context_->workspace()->GetScratchBuffer(D));
    return MACE_SUCCESS;
  }

-  std::unique_ptr<Tensor> B_transpose_;
+  SGemm sgemm_;
 };

 template <>
 struct MatMulFunctor<CPU, uint8_t> : OpKernel {
  explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
+
  template<gemmlowp::MapOrder AOrder, gemmlowp::MapOrder BOrder>
  void MatMulImpl(const Tensor *A,
                  const Tensor *B,
@@ -213,6 +209,7 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
 template <typename T>
 struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
  explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
+
  MaceStatus operator()(const Tensor *A,
                        const Tensor *B,
                        Tensor *C,

--- a/mace/kernels/matmul_benchmark.cc
+++ b/mace/kernels/matmul_benchmark.cc
@@ -22,6 +22,7 @@
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/kernels/gemm.h"
 #include "mace/kernels/gemmlowp_util.h"
+#include "mace/kernels/sgemm.h"

 namespace gemmlowp {

@@ -107,6 +108,28 @@ void MatmulBenchmark_Mace(int iters, int m, int k, int n) {
  }
 }

+// Times kernels::SGemm on a single-batch (m x k) * (k x n) product.
+void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) {
+  mace::testing::StopTiming();
+
+  // Value-initialized operand and output storage.
+  std::vector<float> lhs_data(m * k);
+  std::vector<float> rhs_data(k * n);
+  std::vector<float> out_data(m * n);
+
+  // Both inputs are wrapped with is_const = true.
+  kernels::MatrixMap<const float> lhs_map(
+      1, m, k, RowMajor, lhs_data.data(), true);
+  kernels::MatrixMap<const float> rhs_map(
+      1, k, n, RowMajor, rhs_data.data(), true);
+  kernels::MatrixMap<float> out_map(1, m, n, RowMajor, out_data.data());
+
+  kernels::SGemm gemm;
+
+  // One untimed call before the clock is started.
+  gemm(lhs_map, rhs_map, &out_map);
+
+  mace::testing::StartTiming();
+  for (; iters != 0; --iters) {
+    gemm(lhs_map, rhs_map, &out_map);
+  }
+}
+
 void MatmulBenchmark_Eigen(int iters, int m, int k, int n) {
  mace::testing::StopTiming();
  Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(m, k);
@@ -202,6 +225,7 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {

 #define MACE_BM_MATMUL(M, K, N)                          \
  MACE_BM_MATMUL_FUNC(M, K, N, Mace, float);             \
+  MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float);       \
  MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float);            \
  MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \
  MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_int32, uint8_t);
@@ -215,15 +239,43 @@ MACE_BM_MATMUL(15, 384, 384);
 MACE_BM_MATMUL(15, 384, 1536);
 MACE_BM_MATMUL(15, 1536, 384);

-MACE_BM_MATMUL(1, 384, 384);
-MACE_BM_MATMUL(1, 384, 1536);
-MACE_BM_MATMUL(1, 1536, 384);
-MACE_BM_MATMUL(1, 384, 44678);
+MACE_BM_MATMUL(1, 256, 256);
+MACE_BM_MATMUL(1, 256, 1536);
+MACE_BM_MATMUL(1, 1536, 256);
+MACE_BM_MATMUL(256, 256, 1);
+MACE_BM_MATMUL(1536, 256, 1);
+MACE_BM_MATMUL(256, 1536, 1);
+MACE_BM_MATMUL(29792, 256, 1);
+MACE_BM_MATMUL(1, 256, 29792);
+MACE_BM_MATMUL(2, 256, 256);
+MACE_BM_MATMUL(2, 256, 1536);
+MACE_BM_MATMUL(2, 1536, 256);
+MACE_BM_MATMUL(3, 256, 256);
+MACE_BM_MATMUL(3, 256, 1536);
+MACE_BM_MATMUL(3, 1536, 256);
+MACE_BM_MATMUL(4, 256, 256);
+MACE_BM_MATMUL(4, 256, 1536);
+MACE_BM_MATMUL(4, 1536, 256);
+MACE_BM_MATMUL(8, 256, 256);
+MACE_BM_MATMUL(8, 256, 1536);
+MACE_BM_MATMUL(8, 1536, 256);
+MACE_BM_MATMUL(10, 256, 256);
+MACE_BM_MATMUL(10, 256, 1536);
+MACE_BM_MATMUL(10, 1536, 256);
+MACE_BM_MATMUL(15, 256, 256);
+MACE_BM_MATMUL(15, 256, 1536);
+MACE_BM_MATMUL(15, 1536, 256);

 // Embedding size 128
 MACE_BM_MATMUL(1, 128, 1536);
 MACE_BM_MATMUL(1, 128, 44678);

+// MobileNet
+MACE_BM_MATMUL(128, 128, 3136);
+MACE_BM_MATMUL(256, 256, 784);
+MACE_BM_MATMUL(512, 512, 196);
+MACE_BM_MATMUL(1024, 1024, 49);
+
 }  // namespace test
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/sgemm.cc
+++ b/mace/kernels/sgemm.cc
--- a/mace/kernels/sgemm.h
+++ b/mace/kernels/sgemm.h
@@ -15,6 +15,9 @@
 #ifndef MACE_KERNELS_SGEMM_H_
 #define MACE_KERNELS_SGEMM_H_

+#include <memory>
+#include <utility>
+
 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
 #endif
@@ -34,22 +37,29 @@ enum Major {
 template<typename T>
 class MatrixMap {
 public:
-  MatrixMap(const index_t row,
+  MatrixMap() {}
+
+  MatrixMap(const index_t batch,
+            const index_t row,
            const index_t col,
            const Major major,
-            T *data) :
+            T *data,
+            const bool is_const = false) :
+      batch_(batch),
      row_(row),
      col_(col),
      stride_(major == RowMajor ? col : row),
      major_(major),
-      data_(data) {}
-
-  MatrixMap<T> transpose(const MatrixMap<T> &matrix_map) {
-    Major transpose_major = matrix_map.major_ == RowMajor ? ColMajor : RowMajor;
-    return MatrixMap<T>(matrix_map.col_,
-                        matrix_map.row_,
-                        transpose_major,
-                        matrix_map.data_);
+      data_(data),
+      is_const_(is_const) {}
+
+  MatrixMap transpose() const {
+    Major transpose_major = major_ == RowMajor ? ColMajor : RowMajor;
+    return MatrixMap(batch_, col_, row_, transpose_major, data_, is_const_);
+  }
+
+  index_t batch() const {
+    return batch_;
  }

  index_t row() const {
@@ -72,66 +82,100 @@ class MatrixMap {
    return data_;
  }

-  T *data(int row, int col) const {
-    return data_ + row * stride_ + col;
+  T *batch_data(index_t batch) const {
+    return data_ + batch * row_ * col_;
+  }
+
+  index_t size() const {
+    return batch_ * row_ * col_;
+  }
+
+  bool is_const() const {
+    return is_const_;
  }

 private:
+  index_t batch_;
  index_t row_;
  index_t col_;
  index_t stride_;
  Major major_;
  T *data_;
+  bool is_const_;
 };

 typedef Major PackOrder;
-
-template<typename T>
-class PackedBlock {
- public:
-  PackedBlock() : data_tensor_(GetCPUAllocator(),
-                               DataTypeToEnum<T>::v()) {}
-
-  const T *data() {
-    return data_tensor_.data<T>();
-  }
-
-  T *mutable_data() {
-    return data_tensor_.mutable_data<T>();
-  }
-
-  Tensor *tensor() {
-    return &data_tensor_;
-  }
-
- private:
-  Tensor data_tensor_;
-};
+typedef Tensor PackedBlock;

 class SGemm {
 public:
-  void operator()(const MatrixMap<float> &lhs,
-                  const MatrixMap<float> &rhs,
-                  MatrixMap<float> *result);
-
-  void operator()(const PackedBlock<float> &lhs,
-                  const PackedBlock<float> &rhs,
-                  const index_t height,
-                  const index_t depth,
-                  const index_t width,
-                  PackedBlock<float> *result);
-
-  void PackLhs(const MatrixMap<float> &lhs, PackedBlock<float> *packed_block);
-
-  void PackRhs(const MatrixMap<float> &rhs, PackedBlock<float> *packed_block);
-
-  void UnPack(const PackedBlock<float> &packed_result,
+  SGemm()
+      : packed_lhs_(nullptr),
+        packed_rhs_(nullptr),
+        packed_(false) {}
+
+  void operator()(const MatrixMap<const float> &lhs,
+                  const MatrixMap<const float> &rhs,
+                  MatrixMap<float> *result,
+                  ScratchBuffer *scratch_buffer = nullptr);
+
+  void Run(const float *A,
+           const float *B,
+           const index_t batch,
+           const index_t height_a,
+           const index_t width_a,
+           const index_t height_b,
+           const index_t width_b,
+           const bool transpose_a,
+           const bool transpose_b,
+           const bool is_a_weight,
+           const bool is_b_weight,
+           float *C,
+           ScratchBuffer *scratch_buffer = nullptr);
+
+  void PackLhs(const MatrixMap<const float> &lhs,
+               PackedBlock *packed_block);
+
+  void PackRhs(const MatrixMap<const float> &rhs,
+               PackedBlock *packed_block);
+
+  void UnPack(const PackedBlock &packed_result,
              MatrixMap<float> *matrix_map);

 private:
-  void Pack(const MatrixMap<float> &src,
+  void Pack(const MatrixMap<const float> &src,
            const PackOrder order,
-            PackedBlock<float> *packed_block);
+            PackedBlock *packed_block);
+
+  void PackPerBatch(const MatrixMap<const float> &src,
+                    const PackOrder order,
+                    const index_t batch_index,
+                    float *packed_data);
+
+  void UnPackPerBatch(const float *packed_data,
+                      const index_t batch_index,
+                      MatrixMap<float> *matrix_map);
+
+  void RunInternal(const PackedBlock &lhs,
+                   const PackedBlock &rhs,
+                   const index_t batch,
+                   const index_t height,
+                   const index_t depth,
+                   const index_t width,
+                   PackedBlock *result);
+
+  void RunPerBatch(const float *lhs,
+                   const float *rhs,
+                   const index_t height,
+                   const index_t depth,
+                   const index_t width,
+                   float *result);
+
+  std::unique_ptr<Tensor> packed_lhs_;
+  std::unique_ptr<Tensor> packed_rhs_;
+  std::unique_ptr<Tensor> packed_result_;
+
+  bool packed_;
 };

 }  // namespace kernels

--- a/mace/kernels/sgemm_pack_test.cc
+++ b/mace/kernels/sgemm_pack_test.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <random>
+#include <vector>
+
+#include "mace/kernels/sgemm.h"
+
+namespace mace {
+namespace kernels {
+namespace test {
+
+namespace {
+// Packs the leading height * width values of `data` and expects the packed
+// buffer to equal `expected_data` element by element. PackLhs is used when
+// pack_order is ColMajor, PackRhs otherwise.
+void TestPack(const std::vector<float> &data,
+              const std::vector<float> &expected_data,
+              const index_t height,
+              const index_t width,
+              Major src_order,
+              PackOrder pack_order) {
+  SGemm sgemm;
+  MatrixMap<const float> source(1, height, width, src_order, data.data());
+
+  PackedBlock packed_block;
+  packed_block.Resize({height, width});
+
+  if (pack_order != PackOrder::ColMajor) {
+    sgemm.PackRhs(source, &packed_block);
+  } else {
+    sgemm.PackLhs(source, &packed_block);
+  }
+
+  auto actual = packed_block.data<float>();
+  for (index_t idx = 0; idx < packed_block.size(); ++idx) {
+    EXPECT_EQ(expected_data[idx], actual[idx]);
+  }
+}
+
+// Round-trip check: packs a random (height x width) matrix with PackLhs
+// (pack_order == ColMajor) or PackRhs (otherwise), unpacks it with
+// SGemm::UnPack into a map of the same shape and order, and expects the
+// original values back exactly.
+void TestUnPack(const index_t height,
+                const index_t width,
+                Major src_order,
+                PackOrder pack_order) {
+  // A single generator shared by all calls, so successive cases get different
+  // data. <random> is used instead of the POSIX-only rand_r()/time() pair.
+  static std::mt19937 gen((std::random_device())());
+  std::normal_distribution<float> nd(0, 1);
+
+  const index_t matrix_size = height * width;
+  std::vector<float> data(matrix_size);
+  std::generate(data.begin(), data.end(), [&nd] { return nd(gen); });
+
+  MatrixMap<const float> src_matrix(1, height, width, src_order, data.data());
+  PackedBlock packed;
+  packed.Resize({height, width});
+  SGemm sg;
+  if (pack_order == PackOrder::ColMajor) {
+    sg.PackLhs(src_matrix, &packed);
+  } else {
+    sg.PackRhs(src_matrix, &packed);
+  }
+
+  std::vector<float> unpacked(matrix_size);
+  MatrixMap<float>
+      unpacked_matrix(1, height, width, src_order, unpacked.data());
+  sg.UnPack(packed, &unpacked_matrix);
+
+  // Bound the comparison by the size of the vectors being indexed.
+  for (index_t i = 0; i < matrix_size; ++i) {
+    EXPECT_EQ(data[i], unpacked[i]);
+  }
+}
+}  // namespace
+
+
+TEST(SGemmPackTest, Pack) {
+  std::vector<float> data =  // values 1..36, shared by every case below
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+       21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36};
+
+  // LHS, no transpose needed: row-major source packed through PackLhs.
+  TestPack(data,
+           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+           3, 4, Major::RowMajor, PackOrder::ColMajor);
+#if defined(MACE_ENABLE_NEON)
+  TestPack(data,
+           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16},
+           4, 4, Major::RowMajor, PackOrder::ColMajor);
+  TestPack(data,
+           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19,
+            20},
+           5, 4, Major::RowMajor, PackOrder::ColMajor);
+#if defined(__aarch64__)
+  TestPack(data,
+           {1, 5, 9, 13, 17, 21, 25, 29, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11,
+            15, 19, 23, 27, 31, 4, 8, 12, 16, 20, 24, 28, 32, 33, 34, 35, 36},
+           9, 4, Major::RowMajor, PackOrder::ColMajor);
+#endif  // __aarch64__
+#endif  // MACE_ENABLE_NEON
+  // LHS, transpose needed: column-major source packed through PackLhs.
+  TestPack(data,
+           {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12},
+           3, 4, Major::ColMajor, PackOrder::ColMajor);
+#if defined(MACE_ENABLE_NEON)
+  TestPack(data,
+           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+           4, 4, Major::ColMajor, PackOrder::ColMajor);
+  TestPack(data,
+           {1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15,
+            20},
+           5, 4, Major::ColMajor, PackOrder::ColMajor);
+#if defined(__aarch64__)
+  TestPack(data,
+           {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21,
+            22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 9, 18, 27, 36},
+           9, 4, Major::ColMajor, PackOrder::ColMajor);
+#endif  // __aarch64__
+#endif  // MACE_ENABLE_NEON
+  // RHS, no transpose needed: row-major source packed through PackRhs.
+  TestPack(data,
+           {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12},
+           4, 3, Major::RowMajor, PackOrder::RowMajor);
+#if defined(MACE_ENABLE_NEON)
+  TestPack(data,
+           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+           4, 4, Major::RowMajor, PackOrder::RowMajor);
+  TestPack(data,
+           {1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15,
+            20},
+           4, 5, Major::RowMajor, PackOrder::RowMajor);
+#endif  // MACE_ENABLE_NEON
+  // RHS, transpose needed: column-major source packed through PackRhs.
+  TestPack(data,
+           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+           4, 3, Major::ColMajor, PackOrder::RowMajor);
+#if defined(MACE_ENABLE_NEON)
+  TestPack(data,
+           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16},
+           4, 4, Major::ColMajor, PackOrder::RowMajor);
+  TestPack(data,
+           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19,
+            20},
+           4, 5, Major::ColMajor, PackOrder::RowMajor);
+#endif  // MACE_ENABLE_NEON
+}
+
+TEST(SGemmPackTest, UnPack) {
+  // Height is fixed at 4 and packing always goes through the RowMajor pack
+  // order; each source order is run against every width in turn.
+  const Major src_orders[] = {Major::RowMajor, Major::ColMajor};
+  const index_t widths[] = {3, 4, 5, 100};
+  for (const Major src_order : src_orders) {
+    for (const index_t width : widths) {
+      TestUnPack(4, width, src_order, PackOrder::RowMajor);
+    }
+  }
+}
+
+}  // namespace test
+}  // namespace kernels
+}  // namespace mace
+
--- a/mace/ops/matmul.h
+++ b/mace/ops/matmul.h
@@ -40,7 +40,11 @@ class MatMulOp : public Operator<D, T> {
               "than or equal to 2");
    index_t rank = A->dim_size();
    for (index_t i = 0; i < rank - 2; ++i) {
-      MACE_CHECK(A->dim(i) == B->dim(i), "batch dimensions are not equal");
+      MACE_CHECK(A->dim(i) == B->dim(i),
+                 "batch dimensions are not equal: ",
+                 A->dim(i),
+                 " vs. ",
+                 B->dim(i));
    }
    index_t ak = transpose_a_ ? A->dim(rank - 2) : A->dim(rank - 1);
    index_t bk = transpose_b_ ? B->dim(rank - 1) : B->dim(rank - 2);

--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -33,13 +33,15 @@ void MatMulBenchmark(
  // Add input data
  net.AddRandomInput<D, T>("A", {batch, height, channels});
  net.AddRandomInput<D, T>("B", {batch, channels, out_width});
+  net.GetTensor("A")->SetIsWeight(true);
+  net.GetTensor("B")->SetIsWeight(true);
  if (DataTypeToEnum<T>::value == DT_UINT8) {
    net.GetTensor("A")->SetScale(0.1);
    net.GetTensor("B")->SetScale(0.1);
  }
-
  if (D == DeviceType::GPU) {
-    BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
+    BufferToImage<D, T>(&net, "A", "AImage",
+                        kernels::BufferType::IN_OUT_WIDTH);
    BufferToImage<D, T>(&net, "B", "BImage",
                        kernels::BufferType::IN_OUT_HEIGHT);

@@ -71,7 +73,7 @@ void MatMulBenchmark(

  mace::testing::StartTiming();
  while (iters--) {
-    net.RunOp(D);
+    net.Run();
  }
  net.Sync();
 }
@@ -86,6 +88,8 @@ void MatMulTransposeBenchmark(
  // Add input data
  net.AddRandomInput<D, T>("A", {batch, height, channels});
  net.AddRandomInput<D, T>("B", {batch, out_width, channels});
+  net.GetTensor("A")->SetIsWeight(true);
+  net.GetTensor("B")->SetIsWeight(true);
  if (DataTypeToEnum<T>::value == DT_UINT8) {
    net.GetTensor("A")->SetScale(0.1);
    net.GetTensor("B")->SetScale(0.1);
@@ -116,7 +120,7 @@ void MatMulTransposeBenchmark(

  mace::testing::StartTiming();
  while (iters--) {
-    net.RunOp(D);
+    net.Run();
  }
  net.Sync();
 }
@@ -154,10 +158,15 @@ void MatMulTransposeBenchmark(
  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);     \
  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);

+MACE_BM_MATMUL(1, 128, 128, 49);
+MACE_BM_MATMUL(2, 128, 128, 49);
+MACE_BM_MATMUL(3, 128, 128, 49);
+MACE_BM_MATMUL(4, 128, 128, 49);
 MACE_BM_MATMUL(16, 32, 128, 49);
 MACE_BM_MATMUL(16, 32, 128, 961);
 MACE_BM_MATMUL(16, 32, 128, 3969);
 MACE_BM_MATMUL(16, 128, 128, 49);
+MACE_BM_MATMUL(16, 49, 128, 128);
 MACE_BM_MATMUL(16, 128, 128, 961);
 MACE_BM_MATMUL(16, 128, 128, 3969);


--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -211,8 +211,8 @@ void WinoMatMulBenchmark(
  const index_t round_w = (width + block_size - 1) / block_size;
  const index_t out_width = round_h * round_w;
  // Add input data
-  net.AddRandomInput<D, float>("A", {batch, out_channels, in_channels, 1});
-  net.AddRandomInput<D, float>("B", {batch, in_channels, out_width, 1});
+  net.AddRandomInput<D, float>("A", {batch, out_channels, in_channels});
+  net.AddRandomInput<D, float>("B", {batch, in_channels, out_width});

  if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);