Commit cd506756 authored by 李寅

Gemm transpose

Parent c0c0dfe5
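
This commit threads optional transpose_a / transpose_b flags through the CPU Gemm/GemmRef kernels, the MatMul functors and op, and generalizes MatMul inputs from fixed 4-D {batch, H, W, 1} tensors to rank >= 2 tensors whose leading dimensions are folded into the batch. The flags follow the usual BLAS convention: per batch, C = op(A) * op(B), where op(X) is X or its transpose. A minimal naive sketch of these semantics (illustrative only, not the project's actual GemmRef body; height, K, and width describe op(A) and op(B)):

    // Sketch of transpose-aware batched GEMM semantics (illustrative only).
    // op(A) is height x K, op(B) is K x width, so C is height x width per batch.
    void NaiveGemm(const float *A, const float *B, int batch,
                   int height, int K, int width, float *C,
                   bool transpose_a = false, bool transpose_b = false) {
      for (int b = 0; b < batch; ++b) {
        const float *a = A + b * height * K;   // stored A block
        const float *bm = B + b * K * width;   // stored B block
        float *c = C + b * height * width;
        for (int i = 0; i < height; ++i) {
          for (int j = 0; j < width; ++j) {
            float sum = 0.f;
            for (int k = 0; k < K; ++k) {
              // A set flag means the stored matrix has its two inner
              // dimensions swapped, so the index order flips.
              const float av = transpose_a ? a[k * height + i] : a[i * K + k];
              const float bv = transpose_b ? bm[j * K + k] : bm[k * width + j];
              sum += av * bv;
            }
            c[i * width + j] = sum;
          }
        }
      }
    }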
This diff has been collapsed.
......@@ -30,7 +30,9 @@ void Gemm(const float *A,
const index_t height,
const index_t K,
const index_t width,
float *C);
float *C,
const bool transpose_a = false,
const bool transpose_b = false);
void GemmRef(const float *A,
const float *B,
......@@ -38,7 +40,9 @@ void GemmRef(const float *A,
const index_t height,
const index_t K,
const index_t width,
float *C);
float *C,
const bool transpose_a = false,
const bool transpose_b = false);
void Gemv(const float *m_ptr,
const float *v_ptr,
......
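
Because both flags default to false, existing call sites compile unchanged. A hedged usage sketch (buffer names are hypothetical):

    // C = A * B, exactly as before this change (defaults apply)
    kernels::Gemm(a_ptr, b_ptr, batch, height, K, width, c_ptr);
    // C = A^T * B, using the new parameters
    kernels::Gemm(a_ptr, b_ptr, batch, height, K, width, c_ptr, true, false);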
......@@ -13,17 +13,22 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include <memory>
#include <random>
#include "mace/kernels/gemm.h"
#include "mace/core/types.h"
#include "mace/kernels/gemm.h"
namespace mace {
namespace {
void GemmTest(index_t batch, index_t N, index_t K, index_t M) {
void GemmTest(index_t batch,
index_t N,
index_t K,
index_t M,
bool transpose_a,
bool transpose_b) {
std::unique_ptr<float[]> A(new float[batch * N * K]);
std::unique_ptr<float[]> B(new float[batch * K * M]);
std::unique_ptr<float[]> C(new float[batch * N * M]);
......@@ -34,15 +39,13 @@ void GemmTest(index_t batch, index_t N, index_t K, index_t M) {
std::normal_distribution<float> nd(0, 1);
std::generate(A.get(), A.get() + batch * N * K,
[&gen, &nd] {
return nd(gen);
});
[&gen, &nd] { return nd(gen); });
std::generate(B.get(), B.get() + batch * K * M,
[&gen, &nd] {
return nd(gen);
});
kernels::Gemm(A.get(), B.get(), batch, N, K, M, C.get());
kernels::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get());
[&gen, &nd] { return nd(gen); });
kernels::Gemm(A.get(), B.get(), batch, N, K, M, C.get(), transpose_a,
transpose_b);
kernels::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get(), transpose_a,
transpose_b);
for (int i = 0; i < batch * N * M; ++i) {
EXPECT_NEAR(C_ref[i], C[i], 0.1);
......@@ -59,14 +62,8 @@ void GemvTest(index_t batch, index_t N, index_t M) {
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
std::generate(A.get(), A.get() + N * M,
[&gen, &nd] {
return nd(gen);
});
std::generate(B.get(), B.get() + batch * M,
[&gen, &nd] {
return nd(gen);
});
std::generate(A.get(), A.get() + N * M, [&gen, &nd] { return nd(gen); });
std::generate(B.get(), B.get() + batch * M, [&gen, &nd] { return nd(gen); });
kernels::Gemv(A.get(), B.get(), batch, M, N, C.get());
kernels::GemvRef(A.get(), B.get(), batch, M, N, C_ref.get());
......@@ -78,36 +75,36 @@ void GemvTest(index_t batch, index_t N, index_t M) {
} // namespace
TEST(GEMMTest, AlignedWithoutBatch) {
GemmTest(1, 1, 64, 128);
GemmTest(1, 2, 64, 128);
GemmTest(1, 3, 64, 128);
GemmTest(1, 4, 64, 128);
GemmTest(1, 5, 64, 128);
GemmTest(1, 6, 64, 128);
GemmTest(1, 7, 64, 128);
GemmTest(1, 17, 64, 128);
GemmTest(1, 1, 64, 128, false, false);
GemmTest(1, 2, 64, 128, false, true);
GemmTest(1, 3, 64, 128, true, false);
GemmTest(1, 4, 64, 128, true, true);
GemmTest(1, 5, 64, 128, false, false);
GemmTest(1, 6, 64, 128, false, true);
GemmTest(1, 7, 64, 128, true, false);
GemmTest(1, 17, 64, 128, true, true);
}
TEST(GEMMTest, UnalignedWithoutBatch) {
GemmTest(1, 1, 63, 127);
GemmTest(1, 2, 63, 127);
GemmTest(1, 3, 63, 127);
GemmTest(1, 4, 63, 127);
GemmTest(1, 5, 63, 127);
GemmTest(1, 6, 63, 127);
GemmTest(1, 7, 63, 127);
GemmTest(1, 17, 63, 127);
GemmTest(1, 1, 63, 127, false, false);
GemmTest(1, 2, 63, 127, false, true);
GemmTest(1, 3, 63, 127, true, false);
GemmTest(1, 4, 63, 127, true, true);
GemmTest(1, 5, 63, 127, false, false);
GemmTest(1, 6, 63, 127, false, true);
GemmTest(1, 7, 63, 127, true, false);
GemmTest(1, 17, 63, 127, true, true);
}
TEST(GEMMTest, UnalignedWithBatch) {
GemmTest(3, 1, 63, 127);
GemmTest(3, 2, 63, 127);
GemmTest(3, 3, 63, 127);
GemmTest(3, 4, 63, 127);
GemmTest(3, 5, 63, 127);
GemmTest(3, 6, 63, 127);
GemmTest(3, 7, 63, 127);
GemmTest(3, 17, 63, 127);
GemmTest(3, 1, 63, 127, false, false);
GemmTest(3, 2, 63, 127, false, true);
GemmTest(3, 3, 63, 127, true, false);
GemmTest(3, 4, 63, 127, true, true);
GemmTest(3, 5, 63, 127, false, false);
GemmTest(3, 6, 63, 127, false, true);
GemmTest(3, 7, 63, 127, true, false);
GemmTest(3, 17, 63, 127, true, true);
}
TEST(GEMMTest, gemv) {
......
......@@ -20,6 +20,8 @@
#endif
#include <algorithm>
#include <utility>
#include <functional>
#include <memory>
#include <string>
#include <vector>
......@@ -36,14 +38,39 @@
namespace mace {
namespace kernels {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct MatMulFunctor {
MaceStatus operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
index_t batch;
index_t height;
index_t K;
index_t width;
index_t rank = A->dim_size();
height = A->dim(rank - 2);
K = A->dim(rank - 1);
if (transpose_a) {
std::swap(height, K);
}
if (transpose_b) {
width = B->dim(rank - 2);
} else {
width = B->dim(rank - 1);
}
batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
MACE_RETURN_IF_ERROR(C->Resize(c_shape));
Tensor::MappingGuard guarda(A);
......@@ -53,28 +80,27 @@ struct MatMulFunctor {
const T *b_ptr_base = B->data<T>();
T *c_ptr_base = C->mutable_data<T>();
const index_t batch = C->dim(0);
const index_t height = C->dim(1);
const index_t width = C->dim(2);
const index_t K = A->dim(2);
// It is better to use large block size if it fits for fast cache.
// Assume l1 cache size is 32k, we load three blocks at a time (A, B, C),
// the block size should be sqrt(32k / sizeof(T) / 3).
memset(c_ptr_base, 0, batch * height * width * sizeof(T));
Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base);
Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base,
transpose_a, transpose_b);
return MACE_SUCCESS;
}
};
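
The functor treats the last two dimensions as the matrix and folds all leading dimensions into the batch. A small self-contained sketch of the same shape inference, under assumed shapes:

    #include <cstdint>
    #include <functional>
    #include <numeric>
    #include <utility>
    #include <vector>

    int main() {
      // Assumed shapes: A = {2, 3, 32, 64} with transpose_a = true,
      //                 B = {2, 3, 32, 128} with transpose_b = false.
      std::vector<int64_t> a_shape = {2, 3, 32, 64};
      std::vector<int64_t> b_shape = {2, 3, 32, 128};
      const int64_t rank = a_shape.size();
      int64_t height = a_shape[rank - 2];       // 32
      int64_t K = a_shape[rank - 1];            // 64
      std::swap(height, K);                     // transpose_a: height = 64, K = 32
      const int64_t width = b_shape[rank - 1];  // 128
      const int64_t batch =
          std::accumulate(a_shape.begin(), a_shape.end() - 2, int64_t{1},
                          std::multiplies<int64_t>());  // 2 * 3 = 6
      // C keeps the leading dims: C = {2, 3, 64, 128}.
      return 0;
    }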
#ifdef MACE_ENABLE_OPENCL
template<typename T>
template <typename T>
struct MatMulFunctor<DeviceType::GPU, T> {
MaceStatus operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future);
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
......
......@@ -134,7 +134,11 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
if (buffer->dim_size() < 4) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(1));
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
}
}
b2f_kernel.setArg(idx++, *(image->opencl_image()));
......
......@@ -76,19 +76,27 @@ void CalWinogradFilterImageShape(
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
std::vector<index_t> padded_shape = shape;
while (padded_shape.size() < 4) {
padded_shape.push_back(1);
}
MACE_CHECK(padded_shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[2] * shape[3];
(*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
(*image_shape)[0] = padded_shape[2] * padded_shape[3];
(*image_shape)[1] = padded_shape[0] * RoundUpDiv4(padded_shape[1]);
}
// [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
std::vector<index_t> padded_shape = shape;
while (padded_shape.size() < 4) {
padded_shape.push_back(1);
}
MACE_CHECK(padded_shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
(*image_shape)[1] = shape[0] * shape[1];
(*image_shape)[0] = RoundUpDiv4(padded_shape[2]) * padded_shape[3];
(*image_shape)[1] = padded_shape[0] * padded_shape[1];
}
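
Padding rank-3 shapes with trailing 1s keeps the original formulas intact. A worked example: a rank-3 MatMul operand {16, 128, 3969} is padded to {16, 128, 3969, 1}, so CalInOutHeightImageShape produces image_shape = [3969 * 1, 16 * RoundUpDiv4(128)] = [3969, 512], matching the [W * C, N * RoundUp<4>(H)] layout in the comment above.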
// [Ic * H * W, (Oc + 3) / 4]
......@@ -150,10 +158,10 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type) {
if (type == WINOGRAD_FILTER) {
return {16, shape[0], shape[1], 1};
return {16, shape[0], shape[1]};
} else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
return {16, shape[3], out_width, 1};
return {16, shape[3], out_width};
} else {
LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>();
......
......@@ -122,7 +122,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
if (buffer->dim_size() < 4) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(1));
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
}
}
b2f_kernel.setArg(idx++, *(image->opencl_image()));
......
......@@ -24,17 +24,27 @@ template <typename T>
MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
MACE_CHECK(!transpose_a && !transpose_b,
"GPU does not support transpose matmul");
index_t rank = A->dim_size();
index_t height = A->dim(rank - 2);
index_t K = A->dim(rank - 1);
index_t width = B->dim(rank - 1);
index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t batch = C->dim(0);
const index_t height = C->dim(1);
const index_t width = C->dim(2);
const index_t height_blocks = RoundUpDiv4(height);
const index_t width_blocks = RoundUpDiv4(width);
const uint32_t gws[2] = {
......@@ -82,13 +92,12 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
kernel_.setArg(idx++, static_cast<int>(K));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", C->dim(0), C->dim(1),
C->dim(2), C->dim(3));
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -74,7 +74,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width, 1};
output_shape = {16, input_tensor->dim(3), out_width};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
......@@ -104,7 +104,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
output_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -25,24 +25,37 @@ template <DeviceType D, class T>
class MatMulOp : public Operator<D, T> {
public:
MatMulOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
: Operator<D, T>(operator_def, ws),
transpose_a_(OperatorBase::GetOptionalArg<bool>("transpose_a", false)),
transpose_b_(OperatorBase::GetOptionalArg<bool>("transpose_b", false)) {
}
MaceStatus Run(StatsFuture *future) override {
const Tensor *A = this->Input(0);
const Tensor *B = this->Input(1);
Tensor *C = this->Output(0);
MACE_CHECK(A->dim_size() == 4 && 4 == B->dim_size())
<< "The dimension of A and B should be 4";
MACE_CHECK(A->dim(0) == B->dim(0)) << "A and B must have same batch size";
MACE_CHECK(A->dim(2) == B->dim(1))
<< "the number of A's column " << A->dim(2)
<< " must be equal to B's row " << B->dim(1);
return functor_(A, B, C, future);
const Tensor *A = this->Input(INPUT_A);
const Tensor *B = this->Input(INPUT_B);
Tensor *C = this->Output(OUTPUT);
MACE_CHECK(A->dim_size() == B->dim_size() && A->dim_size() >= 2,
"rank(A) should be equal to rank(B), rank should be greater "
"than or equal to 2");
index_t rank = A->dim_size();
for (index_t i = 0; i < rank - 2; ++i) {
MACE_CHECK(A->dim(i) == B->dim(i), "batch dimensions are not equal");
}
index_t ak = transpose_a_ ? A->dim(rank - 2) : A->dim(rank - 1);
index_t bk = transpose_b_ ? B->dim(rank - 1) : B->dim(rank - 2);
MACE_CHECK(ak == bk, "the number of A's column ", ak,
" must be equal to B's row ", bk);
return functor_(A, B, C, transpose_a_, transpose_b_, future);
}
private:
MACE_OP_INPUT_TAGS(INPUT_A, INPUT_B);
MACE_OP_OUTPUT_TAGS(OUTPUT);
kernels::MatMulFunctor<D, T> functor_;
bool transpose_a_;
bool transpose_b_;
};
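
To make the check concrete, a worked example under assumed shapes: with A = {4, 5, 7} and transpose_a set, ak = A->dim(1) = 5; with B = {4, 6, 5} and transpose_b set, bk = B->dim(2) = 5, so the product is valid and the functor produces C = {4, 7, 6} (op(A) is 7x5, op(B) is 5x6).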
} // namespace ops
......
......@@ -31,8 +31,8 @@ void MatMulBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("A", {batch, height, channels, 1});
net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1});
net.AddRandomInput<D, float>("A", {batch, height, channels});
net.AddRandomInput<D, float>("B", {batch, channels, out_width});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
......@@ -65,6 +65,41 @@ void MatMulBenchmark(
}
net.Sync();
}
template <DeviceType D, typename T>
void MatMulTransposeBenchmark(
int iters, int batch, int height, int channels, int out_width) {
mace::testing::StopTiming();
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("A", {batch, height, channels});
net.AddRandomInput<D, float>("B", {batch, out_width, channels});
if (D == DeviceType::CPU) {
OpDefBuilder("MatMul", "MatMulBM")
.Input("A")
.Input("B")
.AddIntArg("transpose_b", 1)
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
}
net.Sync();
}
} // namespace
#define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
......@@ -83,6 +118,20 @@ void MatMulBenchmark(
MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU); \
MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU);
#define MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE) \
static void MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
MatMulTransposeBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
MACE_BENCHMARK(MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
#define MACE_BM_MATMUL_TRANSPOSE(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);
MACE_BM_MATMUL(16, 32, 128, 49);
MACE_BM_MATMUL(16, 32, 128, 961);
MACE_BM_MATMUL(16, 32, 128, 3969);
......@@ -90,6 +139,13 @@ MACE_BM_MATMUL(16, 128, 128, 49);
MACE_BM_MATMUL(16, 128, 128, 961);
MACE_BM_MATMUL(16, 128, 128, 3969);
MACE_BM_MATMUL_TRANSPOSE(16, 32, 128, 49);
MACE_BM_MATMUL_TRANSPOSE(16, 32, 128, 961);
MACE_BM_MATMUL_TRANSPOSE(16, 32, 128, 3969);
MACE_BM_MATMUL_TRANSPOSE(16, 128, 128, 49);
MACE_BM_MATMUL_TRANSPOSE(16, 128, 128, 961);
MACE_BM_MATMUL_TRANSPOSE(16, 128, 128, 3969);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -72,46 +72,46 @@ void Simple(const std::vector<index_t> &A_shape,
} // namespace
TEST_F(MatMulOpTest, SimpleCPU) {
Simple<DeviceType::CPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64});
Simple<DeviceType::CPU>({1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 3, 2},
{1, 2, 3, 4, 5, 6}, {1, 2, 2}, {22, 28, 49, 64});
Simple<DeviceType::CPU>(
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
}
TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
Simple<DeviceType::CPU>({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 2, 2, 1}, {22, 28, 49, 64, 22, 28, 49, 64});
Simple<DeviceType::CPU>({2, 2, 3}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 3, 2}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 2, 2}, {22, 28, 49, 64, 22, 28, 49, 64});
}
TEST_F(MatMulOpTest, SimpleOPENCL) {
Simple<DeviceType::GPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64});
Simple<DeviceType::GPU>({1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 3, 2},
{1, 2, 3, 4, 5, 6}, {1, 2, 2}, {22, 28, 49, 64});
Simple<DeviceType::GPU>(
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
}
TEST_F(MatMulOpTest, SimpleGPUWithBatch) {
Simple<DeviceType::CPU>({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 2, 2, 1}, {22, 28, 49, 64, 22, 28, 49, 64});
Simple<DeviceType::CPU>({2, 2, 3}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 3, 2}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 2, 2}, {22, 28, 49, 64, 22, 28, 49, 64});
}
namespace {
template <typename T>
void Complex(const index_t batch,
void Complex(const std::vector<index_t> &batch,
const index_t height,
const index_t channels,
const index_t out_width) {
......@@ -119,23 +119,14 @@ void Complex(const index_t batch,
// Construct graph
OpsTestNet net;
OpDefBuilder("MatMul", "MatMulTest")
.Input("A")
.Input("B")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("A", {batch, height, channels, 1});
net.AddRandomInput<DeviceType::GPU, float>("B",
{batch, channels, out_width, 1});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
index_t batch_count = std::accumulate(batch.begin(), batch.end(), 1,
std::multiplies<index_t>());
net.AddRandomInput<DeviceType::GPU, float>("A",
{batch_count, height, channels});
net.AddRandomInput<DeviceType::GPU, float>(
"B", {batch_count, channels, out_width});
// Run on opencl
BufferToImage<DeviceType::GPU, T>(&net, "A", "AImage",
......@@ -150,11 +141,40 @@ void Complex(const index_t batch,
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_HEIGHT);
// run cpu
std::vector<index_t> shape_a = batch;
shape_a.push_back(height);
shape_a.push_back(channels);
std::vector<index_t> shape_b = batch;
shape_b.push_back(channels);
shape_b.push_back(out_width);
std::vector<index_t> expected_output_shape = batch;
expected_output_shape.push_back(height);
expected_output_shape.push_back(out_width);
net.GetTensor("A")->Reshape(shape_a);
net.GetTensor("B")->Reshape(shape_b);
OpDefBuilder("MatMul", "MatMulTest")
.Input("A")
.Input("B")
.Output("Output")
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape());
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
expected.Reshape({batch_count, height, out_width});
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1);
......@@ -166,28 +186,36 @@ void Complex(const index_t batch,
} // namespace
TEST_F(MatMulOpTest, OPENCLAlignedWithoutBatch) {
Complex<float>(1, 64, 128, 32);
Complex<float>(1, 64, 32, 128);
Complex<float>({1}, 64, 128, 32);
Complex<float>({1}, 64, 32, 128);
Complex<float>({2, 3}, 64, 32, 128);
}
TEST_F(MatMulOpTest, OPENCLUnAlignedWithoutBatch) {
Complex<float>(1, 31, 113, 61);
Complex<float>(1, 113, 31, 73);
Complex<float>({1}, 31, 113, 61);
Complex<float>({1}, 113, 31, 73);
Complex<float>({2, 3}, 113, 31, 73);
}
TEST_F(MatMulOpTest, OPENCLUnAlignedWithBatch) {
Complex<float>(2, 3, 3, 3);
Complex<float>(16, 31, 61, 67);
Complex<float>(31, 31, 61, 67);
Complex<float>({2}, 3, 3, 3);
Complex<float>({16}, 31, 61, 67);
Complex<float>({31}, 31, 61, 67);
Complex<float>({2, 3}, 31, 61, 67);
}
TEST_F(MatMulOpTest, OPENCLHalfAlignedWithoutBatch) {
Complex<half>(1, 64, 128, 32);
Complex<half>(1, 64, 32, 128);
Complex<half>({1}, 64, 128, 32);
Complex<half>({1}, 64, 32, 128);
Complex<half>({2, 3}, 64, 32, 128);
}
TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex<half>(2, 31, 113, 61);
Complex<half>(16, 32, 64, 64);
Complex<half>(31, 31, 61, 67);
Complex<half>({2}, 31, 113, 61);
Complex<half>({16}, 32, 64, 64);
Complex<half>({31}, 31, 61, 67);
Complex<half>({2, 3}, 31, 61, 67);
}
// TODO(liyin): test transpose after implementing gpu runtime
// now transpose test is in kernels_test
} // namespace test
} // namespace ops
} // namespace mace
......@@ -518,7 +518,7 @@ class Transformer(base_converter.ConverterInterface):
wt_output_width = batch * (
(out_height + 1) / 2) * ((out_width + 1) / 2)
wt_output_shape.dims.extend(
[16, in_channels, wt_output_width, 1])
[16, in_channels, wt_output_width])
if ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str) \
......@@ -543,7 +543,7 @@ class Transformer(base_converter.ConverterInterface):
matmul_op.output.extend([matmul_op.name])
matmul_output_shape = matmul_op.output_shape.add()
matmul_output_shape.dims.extend(
[16, out_channels, wt_output_width, 1])
[16, out_channels, wt_output_width])
arg = matmul_op.arg.add()
arg.name = MaceKeyword.mace_winograd_filter_transformed
......
......@@ -167,7 +167,7 @@ class GPUMemoryOptimizer(MemoryOptimizer):
def get_op_mem_block(self, op_type, output_shape):
mem_block = [0, 0]
if op_type == 'WinogradTransform' or op_type == 'MatMul':
mem_block[0] = output_shape[2] * output_shape[3]
mem_block[0] = output_shape[2]
mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
else:
mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
......
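
A worked example of the updated GPU memory block: for a 3-D MatMul output shape [16, 128, 3969], mem_block = [3969, 16 * int((128 + 3) / 4)] = [3969, 512], which matches the image shape that CalInOutHeightImageShape computes for the same tensor after padding.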