Refactor winograd

c764ba25 · liyin · 3d88cf68 · c764ba25 · c764ba25 · c764ba25
16 changed files
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -434,16 +434,11 @@ class BufferSlice : public BufferBase {
  }

  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
-    MACE_UNUSED(offset);
-    MACE_UNUSED(length);
-    MACE_UNUSED(pitch);
-    MACE_NOT_IMPLEMENTED;
-    return nullptr;
+    return buffer_->Map(offset_ + offset, length, pitch);
  }

  void UnMap(void *mapped_ptr) const {
-    MACE_UNUSED(mapped_ptr);
-    MACE_NOT_IMPLEMENTED;
+    buffer_->UnMap(mapped_ptr);
  }

  void Map(std::vector<size_t> *pitch) {

--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -304,10 +304,14 @@ class Tensor {
    if (buffer_ != nullptr) {
      MACE_CHECK(!has_opencl_image(),
                 name_, ": Cannot resize image, use ResizeImage.");
-      if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
+      const index_t apply_size = raw_size()
+          + ((buffer_ != &buffer_slice_) ? MACE_EXTRA_BUFFER_PAD_SIZE : 0);
+      if (apply_size > buffer_->size()) {
        LOG(WARNING) << name_ << ": Resize buffer from size " << buffer_->size()
-                     << " to " << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
-        return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
+                     << " to " << apply_size;
+        MACE_CHECK(buffer_ != &buffer_slice_,
+                   ": Cannot resize tensor with buffer slice");
+        return buffer_->Resize(apply_size);
      }
      return MaceStatus::MACE_SUCCESS;
    } else {

--- a/mace/ops/arm/conv_2d_neon.h
+++ b/mace/ops/arm/conv_2d_neon.h
@@ -16,22 +16,10 @@
 #define MACE_OPS_ARM_CONV_2D_NEON_H_

 #include "mace/core/types.h"
-#include "mace/ops/sgemm.h"

 namespace mace {
 namespace ops {

-void Conv2dNeonK1x1S1(const float *input,
-                      const float *filter,
-                      const index_t batch,
-                      const index_t height,
-                      const index_t width,
-                      const index_t in_channels,
-                      const index_t out_channels,
-                      float *output,
-                      SGemm *sgemm,
-                      ScratchBuffer *scratch_buffer);
-
 void Conv2dNeonK3x3S1(const float *input,
                      const float *filter,
                      const index_t *in_shape,

--- a/mace/ops/arm/conv_2d_neon_1x1.cc
+++ b/mace/ops/arm/conv_2d_neon_1x1.cc
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/ops/arm/conv_2d_neon.h"
-
-namespace mace {
-namespace ops {
-
-void Conv2dNeonK1x1S1(const float *input,
-                      const float *filter,
-                      const index_t batch,
-                      const index_t height,
-                      const index_t width,
-                      const index_t in_channels,
-                      const index_t out_channels,
-                      float *output,
-                      SGemm *sgemm,
-                      ScratchBuffer *scratch_buffer) {
-  for (index_t b = 0; b < batch; ++b) {
-    sgemm->Run(filter,
-               input + b * in_channels * height * width,
-               1,
-               out_channels,
-               in_channels,
-               in_channels,
-               height * width,
-               false,
-               false,
-               true,
-               false,
-               output + b * out_channels * height * width,
-               scratch_buffer);
-  }
-}
-
-}  // namespace ops
-}  // namespace mace
--- a/mace/ops/arm/conv_winograd.h
+++ b/mace/ops/arm/conv_winograd.h
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_OPS_ARM_CONV_WINOGRAD_H_
-#define MACE_OPS_ARM_CONV_WINOGRAD_H_
-
-#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
-#include <arm_neon.h>
-#endif
-
-#include "mace/core/types.h"
-#include "mace/ops/sgemm.h"
-
-namespace mace {
-namespace ops {
-
-void TransformFilter4x4(const float *filter,
-                        const index_t in_channels,
-                        const index_t out_channels,
-                        float *output);
-
-void TransformFilter8x8(const float *filter,
-                        const index_t in_channels,
-                        const index_t out_channels,
-                        float *output);
-
-void WinogradConv3x3s1(const float *input,
-                       const float *filter,
-                       const index_t batch,
-                       const index_t in_height,
-                       const index_t in_width,
-                       const index_t in_channels,
-                       const index_t out_channels,
-                       const int out_tile_size,
-                       float *output,
-                       SGemm *sgemm,
-                       ScratchBuffer *scratch_buffer);
-
-void WinogradConv3x3s1(const float *input,
-                       const float *transformed_filter,
-                       const index_t batch,
-                       const index_t in_height,
-                       const index_t in_width,
-                       const index_t in_channels,
-                       const index_t out_channels,
-                       const int out_tile_size,
-                       float *transformed_input,
-                       float *transformed_output,
-                       float *output,
-                       SGemm *sgemm,
-                       ScratchBuffer *scratch_buffer);
-
-void ConvRef3x3s1(const float *input,
-                  const float *filter,
-                  const index_t batch,
-                  const index_t in_height,
-                  const index_t in_width,
-                  const index_t in_channels,
-                  const index_t out_channels,
-                  float *output);
-
-}  // namespace ops
-}  // namespace mace
-
-#endif  // MACE_OPS_ARM_CONV_WINOGRAD_H_
--- a/mace/ops/arm/conv_winograd_test.cc
+++ b/mace/ops/arm/conv_winograd_test.cc
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <memory>
-#include <random>
-
-#include "mace/core/tensor.h"
-#include "mace/core/types.h"
-#include "mace/ops/arm/conv_winograd.h"
-
-namespace mace {
-namespace ops {
-
-TEST(ConvWinogradTest, winograd) {
-  index_t batch = 1;
-  index_t in_height = 32;
-  index_t in_width = 32;
-  index_t in_channels = 64;
-  index_t out_channels = 128;
-
-  index_t out_height = in_height - 2;
-  index_t out_width = in_width - 2;
-  index_t input_size = batch * in_channels * in_height * in_width;
-  index_t filter_size = 3 * 3 * in_channels * out_channels;
-  index_t output_size = batch * out_channels * out_height * out_width;
-
-  Tensor input(GetCPUAllocator(), DataType::DT_FLOAT);
-  Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT);
-  Tensor output(GetCPUAllocator(), DataType::DT_FLOAT);
-  Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT);
-
-  input.Resize({batch, in_channels, in_height, in_width});
-  filter.Resize({out_channels, in_channels, 3, 3});
-  output.Resize({batch, out_channels, out_height, out_width});
-  output_ref.Resize({batch, out_channels, out_height, out_width});
-
-  float *input_data = input.mutable_data<float>();
-  float *filter_data = filter.mutable_data<float>();
-  float *output_data = output.mutable_data<float>();
-  float *output_data_ref = output.mutable_data<float>();
-
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::normal_distribution<float> nd(0, 1);
-  std::generate(input_data, input_data + input_size, [&gen, &nd] {
-    return std::max(-1.0f, std::min(1.0f, nd(gen)));
-  });
-  std::generate(filter_data, filter_data + filter_size, [&gen, &nd] {
-    return std::max(-1.0f, std::min(1.0f, nd(gen)));
-  });
-
-  ops::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width,
-                        in_channels, out_channels, output_data_ref);
-
-  SGemm sgemm;
-  ops::WinogradConv3x3s1(input_data, filter_data, batch, in_height,
-                             in_width, in_channels, out_channels, 6,
-                             output_data, &sgemm, nullptr);
-
-  // test
-  for (index_t i = 0; i < output_size; ++i) {
-    EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1) << " with index " << i;
-  }
-}
-
-}  // namespace ops
-}  // namespace mace
--- a/mace/ops/arm/conv_winograd.cc
+++ b/mace/ops/arm/conv_winograd.cc
--- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h
+++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
+
+#include <memory>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+// Conv2d delegator for 3x3 filters on the ARM fp32 Winograd path.
+//
+// Derives from Conv2dBase, so callers can own it through a Conv2dBase
+// pointer and call Compute() on it. The four per-side padding amounts are
+// fixed at construction time.
+// NOTE(review): transformed_filter_ starts null and out_tile_size_ starts
+// at 0; presumably Compute() fills both in on first use so that the filter
+// is transformed only once -- confirm against the .cc file.
+class Conv2dK3x3Winograd : public Conv2dBase {
+ public:
+  // Records the four padding amounts. Nothing is allocated here:
+  // transformed_filter_ is null and out_tile_size_ is 0 after construction.
+  Conv2dK3x3Winograd(int pad_top, int pad_bottom, int pad_left, int pad_right)
+      : gemm_(),
+        pad_top_(pad_top),
+        pad_bottom_(pad_bottom),
+        pad_left_(pad_left),
+        pad_right_(pad_right),
+        transformed_filter_(nullptr),
+        out_tile_size_(0) {}
+
+  virtual ~Conv2dK3x3Winograd() {}
+
+  // Runs the convolution of `input` with `filter`, writing `output`, and
+  // returns the resulting status.
+  // Declared `override` so that any drift from the Conv2dBase::Compute
+  // signature becomes a compile error instead of a silently unused
+  // overload that is never reached through a Conv2dBase pointer.
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output) override;
+
+ private:
+  // NOTE(review): presumably copies the valid region of `padded_output`
+  // into `output` -- confirm against the definition.
+  void UnPackOutput(const Tensor &padded_output,
+                    Tensor *output);
+
+  // Filter transforms for the 4x4 and 8x8 tile variants; results are
+  // written to `output`.
+  // NOTE(review): buffer layouts are defined in the .cc file -- confirm.
+  void TransformFilter4x4(const float *filter,
+                          const index_t in_channels,
+                          const index_t out_channels,
+                          float *output);
+
+  void TransformFilter8x8(const float *filter,
+                          const index_t in_channels,
+                          const index_t out_channels,
+                          float *output);
+
+  // Input transforms for the 4x4 and 8x8 tile variants; results are
+  // written to `output`.
+  // NOTE(review): presumably `tile_count` is the number of tiles per
+  // image -- confirm against the caller.
+  void TransformInput4x4(const float *input,
+                         const index_t batch,
+                         const index_t in_height,
+                         const index_t in_width,
+                         const index_t in_channels,
+                         const index_t tile_count,
+                         float *output);
+
+  void TransformInput8x8(const float *input,
+                         const index_t batch,
+                         const index_t in_height,
+                         const index_t in_width,
+                         const index_t in_channels,
+                         const index_t tile_count,
+                         float *output);
+
+  // Output transforms for the 4x4 and 8x8 tile variants; `input` is read
+  // and `output` is written.
+  // NOTE(review): presumably `input` holds the transformed-domain result
+  // of the GEMM step -- confirm against the .cc file.
+  void TransformOutput4x4(const float *input,
+                          index_t batch,
+                          index_t out_height,
+                          index_t out_width,
+                          index_t out_channels,
+                          index_t tile_count,
+                          float *output);
+
+  void TransformOutput8x8(const float *input,
+                          index_t batch,
+                          index_t out_height,
+                          index_t out_width,
+                          index_t out_channels,
+                          index_t tile_count,
+                          float *output);
+
+  Gemm gemm_;
+  // Padding amounts supplied at construction.
+  int pad_top_;
+  int pad_bottom_;
+  int pad_left_;
+  int pad_right_;
+  // Null until assigned; owned by this object.
+  std::unique_ptr<Tensor> transformed_filter_;
+  // 0 until assigned.
+  index_t out_tile_size_;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+#if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
 #endif
 #include <algorithm>
@@ -28,7 +28,6 @@
 #include "mace/core/tensor.h"
 #include "mace/ops/activation.h"
 #include "mace/ops/arm/conv_2d_neon.h"
-#include "mace/ops/arm/conv_winograd.h"
 #include "mace/ops/conv_pool_2d_base.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
 #include "mace/utils/memory.h"
@@ -37,6 +36,7 @@
 #ifdef MACE_ENABLE_NEON
 #include "mace/ops/arm/fp32/conv_2d.h"
 #include "mace/ops/arm/fp32/conv_2d_1x1.h"
+#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h"
 #else
 #include "mace/ops/ref/conv_2d.h"
 #endif  // MACE_ENABLE_NEON
@@ -55,21 +55,20 @@
 namespace mace {
 namespace ops {

-template <DeviceType D, class T>
+template<DeviceType D, class T>
 class Conv2dOp;

-template <>
+template<>
 class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
 public:
  explicit Conv2dOp(OpConstructContext *context)
      : ConvPool2dOpBase(context),
        activation_(ops::StringToActivationType(
            Operation::GetOptionalArg<std::string>("activation",
-                                                  "NOOP"))),
+                                                   "NOOP"))),
        relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
        leakyrelu_coefficient_(Operation::GetOptionalArg<float>(
-              "leakyrelu_coefficient", 0.0f)),
-        is_filter_transformed_(false),
+            "leakyrelu_coefficient", 0.0f)),
        conv2d_delegator_(nullptr) {}

  MaceStatus Run(OpContext *context) override {
@@ -127,12 +126,26 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
    index_t filter_h = filter->dim(2);
    index_t filter_w = filter->dim(3);

+    int pad_top = paddings[0] >> 1;
+    int pad_bottom = paddings[0] - pad_top;
+    int pad_left = paddings[1] >> 1;
+    int pad_right = paddings[1] - pad_left;
+
    if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1
        && dilation_h == 1 && dilation_w == 1) {
      if (conv2d_delegator_.get() == nullptr) {
        conv2d_delegator_ = make_unique<arm::fp32::Conv2dK1x1>();
      }
      conv2d_delegator_->Compute(context, input, filter, output);
+    } else if (filter_h == 3 && filter_w == 3
+        && stride_h == 1 && stride_w == 1 && dilation_h == 1
+        && dilation_w == 1
+        && input_channels >= 8 && channels >= 8) {
+      if (conv2d_delegator_.get() == nullptr) {
+        conv2d_delegator_ = make_unique<arm::fp32::Conv2dK3x3Winograd>(
+            pad_top, pad_bottom, pad_left, pad_right);
+      }
+      conv2d_delegator_->Compute(context, input, filter, output);
    } else {
      // TODO(liyin): the code below needs to be refactored.
      // delegate to each of kernels instead of ruling them all
@@ -157,11 +170,6 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {

      std::function<void(const float *input, float *output)> conv_func;

-      bool
-          use_winograd = filter_h == 3 && filter_w == 3
-          && stride_h == 1 && stride_w == 1 && dilation_h == 1
-          && dilation_w == 1
-          && input_channels >= 8 && channels >= 8;
      bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3
          && stride_h == 1 && stride_w == 1 && dilation_h == 1
          && dilation_w == 1;
@@ -193,122 +201,58 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
          && stride_h == 1 && stride_w == 1 && dilation_h == 1
          && dilation_w == 1;

-      std::vector<index_t> transformed_input_shape;
-      std::vector<index_t> transformed_output_shape;
-      std::vector<index_t> transformed_filter_shape;
-
-      // When size of input feature map is bigger than 16x16,
-      // set winograd out tile size to 6 to get higher performance.
-      index_t winograd_out_tile_size = 2;
-      if (input_height > 16 && input_width > 16) {
-        winograd_out_tile_size = 6;
-      }
-
-      if (use_winograd) {
-        extra_output_height = RoundUp<index_t>(height, winograd_out_tile_size);
-        extra_input_height =
-            std::max(padded_input_height, extra_output_height + 2);
-        extra_output_width = RoundUp<index_t>(width, winograd_out_tile_size);
-        extra_input_width =
-            std::max(padded_input_width, extra_output_width + 2);
-        if (extra_input_height != padded_input_height) {
-          pad_bottom += (extra_input_height - padded_input_height);
-        }
-        if (extra_input_width != padded_input_width) {
-          pad_right += (extra_input_width - padded_input_width);
-        }
-
-        index_t
-            tile_height_count = extra_output_height / winograd_out_tile_size;
-        index_t tile_width_count = extra_output_width / winograd_out_tile_size;
-        index_t tile_count = tile_height_count * tile_width_count;
-        index_t in_tile_area =
-            (winograd_out_tile_size + 2) * (winograd_out_tile_size + 2);
-
-        transformed_input_shape.insert(transformed_input_shape.end(),
-                                       {in_tile_area, batch, input_channels,
-                                        tile_count});
-        transformed_output_shape.insert(transformed_output_shape.end(),
-                                        {in_tile_area, batch, channels,
-                                         tile_count});
-        transformed_filter_shape.insert(transformed_filter_shape.end(),
-                                        {in_tile_area, channels,
-                                         input_channels});
+      index_t tile_h, tile_w;
+      if (use_neon_3x3_s1) {
+        tile_h = 2;
+        tile_w = 4;
+      } else if (use_neon_7x1_s1 || use_neon_15x1_s1) {
+        tile_h = 4;
+        tile_w = 1;
      } else {
-        index_t tile_h, tile_w;
-        if (use_neon_3x3_s1) {
-          tile_h = 2;
-          tile_w = 4;
-        } else if (use_neon_7x1_s1 || use_neon_15x1_s1) {
-          tile_h = 4;
-          tile_w = 1;
-        } else {
-          tile_h = 1;
-          tile_w = 4;
-        }
-        extra_output_height = RoundUp<index_t>(height, tile_h);
-        extra_input_height =
-            std::max(padded_input_height, (extra_output_height - 1) * stride_h
-                + (filter_h - 1) * dilation_h + 1);
-        extra_output_width = RoundUp<index_t>(width, tile_w);
-        extra_input_width =
-            std::max(padded_input_width, (extra_output_width - 1) * stride_w
-                + (filter_w - 1) * dilation_w + 1);
-        if (extra_input_height != padded_input_height) {
-          pad_bottom += (extra_input_height - padded_input_height);
-        }
-        if (extra_input_width != padded_input_width) {
-          pad_right += (extra_input_width - padded_input_width);
-        }
+        tile_h = 1;
+        tile_w = 4;
+      }
+      extra_output_height = RoundUp<index_t>(height, tile_h);
+      extra_input_height =
+          std::max(padded_input_height, (extra_output_height - 1) * stride_h
+              + (filter_h - 1) * dilation_h + 1);
+      extra_output_width = RoundUp<index_t>(width, tile_w);
+      extra_input_width =
+          std::max(padded_input_width, (extra_output_width - 1) * stride_w
+              + (filter_w - 1) * dilation_w + 1);
+      if (extra_input_height != padded_input_height) {
+        pad_bottom += (extra_input_height - padded_input_height);
+      }
+      if (extra_input_width != padded_input_width) {
+        pad_right += (extra_input_width - padded_input_width);
      }

      // decide scratch size before allocate it
      index_t total_scratch_size = 0;
-      index_t transformed_input_size = 0;
-      index_t transformed_output_size = 0;
      index_t padded_input_size = 0;
      index_t padded_output_size = 0;
-      if (use_winograd) {
-        transformed_input_size =
-            std::accumulate(transformed_input_shape.begin(),
-                            transformed_input_shape.end(),
-                            1,
-                            std::multiplies<index_t>()) * sizeof(float);
-        transformed_output_size =
-            std::accumulate(transformed_output_shape.begin(),
-                            transformed_output_shape.end(),
-                            1,
-                            std::multiplies<index_t>()) * sizeof(float);
-        total_scratch_size += transformed_input_size + transformed_output_size;
-      }
+
      if (extra_input_height != input_height
          || extra_input_width != input_width) {
        padded_input_size =
-            batch * input_channels * (input_height + pad_top + pad_bottom)
-                * (input_width + pad_left + pad_right) * sizeof(float) +
-                MACE_EXTRA_BUFFER_PAD_SIZE;
+            PadAlignSize(
+                batch * input_channels * (input_height + pad_top + pad_bottom)
+                    * (input_width + pad_left + pad_right) * sizeof(float) +
+                    MACE_EXTRA_BUFFER_PAD_SIZE);
        total_scratch_size += padded_input_size;
      }
      if (extra_output_height != height || extra_output_width != width) {
        padded_output_size =
-            batch * channels * extra_output_height * extra_output_width
-                * sizeof(float);
+            PadAlignSize(
+                batch * channels * extra_output_height * extra_output_width
+                    * sizeof(float) + MACE_EXTRA_BUFFER_PAD_SIZE);
        total_scratch_size += padded_output_size;
      }

-      if (use_winograd) {
-        total_scratch_size += transformed_input_size + transformed_output_size;
-      }
-
      // Init scratch buffer
      ScratchBuffer *scratch = context->device()->scratch_buffer();
      scratch->Rewind();
      scratch->GrowSize(total_scratch_size);
-      Tensor
-          transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT);
-      Tensor
-          transformed_output
-          (scratch->Scratch(transformed_output_size), DT_FLOAT);
      Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT);
      Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT);
      const index_t extra_input_shape[4] =
@@ -320,56 +264,8 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
      MACE_UNUSED(extra_input_shape);
      MACE_UNUSED(extra_output_shape);

-      Tensor transformed_filter;
-
      // decide which convolution function to call
-      if (use_winograd) {
-        transformed_input.Reshape(transformed_input_shape);
-        transformed_output.Reshape(transformed_output_shape);
-        const float *transformed_filter_data = nullptr;
-        // filter only needs to be transformed once, set transformed_filter_data
-        // to null after the first run.
-        if (!is_filter_transformed_) {
-          transformed_filter.Resize(transformed_filter_shape);
-          switch (winograd_out_tile_size) {
-            case 2:
-              TransformFilter4x4(filter_data,
-                                 filter_shape[1],
-                                 filter_shape[0],
-                                 transformed_filter.mutable_data<float>());
-              break;
-            case 6:
-              TransformFilter8x8(filter_data,
-                                 filter_shape[1],
-                                 filter_shape[0],
-                                 transformed_filter.mutable_data<float>());
-              break;
-            default:MACE_NOT_IMPLEMENTED;
-          }
-          transformed_filter_data = transformed_filter.data<float>();
-          is_filter_transformed_ = true;
-        }
-
-        float *transformed_input_data = transformed_input.mutable_data<float>();
-        float
-            *transformed_output_data = transformed_output.mutable_data<float>();
-
-        conv_func = [=](const float *pad_input, float *pad_output) {
-          WinogradConv3x3s1(pad_input,
-                            transformed_filter_data,
-                            batch,
-                            extra_input_height,
-                            extra_input_width,
-                            input_channels,
-                            channels,
-                            winograd_out_tile_size,
-                            transformed_input_data,
-                            transformed_output_data,
-                            pad_output,
-                            &sgemm_,
-                            scratch);
-        };
-      } else if (use_neon_3x3_s1) {
+      if (use_neon_3x3_s1) {
        conv_func = [=](const float *pad_input, float *pad_output) {
          Conv2dNeonK3x3S1(pad_input,
                           filter_data,
@@ -732,8 +628,6 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
  const ActivationType activation_;
  const float relux_max_limit_;
  const float leakyrelu_coefficient_;
-  bool is_filter_transformed_;
-  SGemm sgemm_;
 #ifdef MACE_ENABLE_NEON
  std::unique_ptr<arm::fp32::Conv2dBase> conv2d_delegator_;
 #else
@@ -745,7 +639,6 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };

-
 #ifdef MACE_ENABLE_QUANTIZE
 template <>
 class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
@@ -1052,7 +945,6 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
 };
 #endif  // MACE_ENABLE_OPENCL

-
 void RegisterConv2D(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
                   DeviceType::CPU, float);

--- a/mace/ops/depthwise_deconv2d.cc
+++ b/mace/ops/depthwise_deconv2d.cc
@@ -37,7 +37,7 @@
 namespace mace {
 namespace ops {

-template <DeviceType D, class T>
+template<DeviceType D, class T>
 class DepthwiseDeconv2dOp;

 template<>
@@ -92,10 +92,11 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
    const index_t pad_top = out_paddings[1] / 2;

    index_t padded_out_size =
-        std::accumulate(padded_out_shape.begin(),
-                        padded_out_shape.end(),
-                        1,
-                        std::multiplies<index_t>()) * sizeof(float);
+        PadAlignSize(std::accumulate(padded_out_shape.begin(),
+                                     padded_out_shape.end(),
+                                     1,
+                                     std::multiplies<index_t>())
+                         * sizeof(float) + MACE_EXTRA_BUFFER_PAD_SIZE);
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(padded_out_size);
@@ -253,7 +254,6 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
                padded_out_shape.data(),
                out_data);

-
    if (!no_pad) {
      CropPadOut<float>(out_data,
                        padded_out_shape.data(),
@@ -384,7 +384,7 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
              const index_t out_offset =
                  i * strides[0] * out_width + j * strides[1];
              for (int q = 0; q < in_channels_g; ++q) {
-                const  index_t in_base =
+                const index_t in_base =
                    ((b * group + g) * in_channels_g + q) * in_img_size;
                const index_t in_offset =
                    in_base + i * in_width + j;

--- a/mace/ops/matmul.cc
+++ b/mace/ops/matmul.cc
@@ -21,7 +21,6 @@

 #include "mace/core/operator.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/sgemm.h"
 #include "mace/utils/utils.h"

 #ifdef MACE_ENABLE_NEON

--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -21,7 +21,6 @@
 #include "public/gemmlowp.h"
 #include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/ops/sgemm.h"
 #include "mace/ops/ops_test_util.h"

 namespace gemmlowp {
@@ -94,32 +93,6 @@ namespace test {

 namespace {

-// Matmul with (m, k) x (k, n)
-void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) {
-  mace::testing::StopTiming();
-  std::vector<float> lhs(m * k);
-  std::vector<float> rhs(k * n);
-  std::vector<float> result(m * n);
-
-  ops::SGemmMatrixMap<const float>
-      matrix_lhs(1, m, k, SGemmRowMajor, lhs.data(),
-                 true);
-  ops::SGemmMatrixMap<const float>
-      matrix_rhs(1, k, n, SGemmRowMajor, rhs.data(),
-                 true);
-  ops::SGemmMatrixMap<float>
-      matrix_result(1, m, n, SGemmRowMajor, result.data());
-
-  ops::SGemm sgemm;
-
-  sgemm(matrix_lhs, matrix_rhs, &matrix_result);
-
-  mace::testing::StartTiming();
-  while (iters--) {
-    sgemm(matrix_lhs, matrix_rhs, &matrix_result);
-  }
-}
-
 void MatmulBenchmark_Eigen(int iters, int m, int k, int n) {
  mace::testing::StopTiming();
  Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(m, k);
@@ -223,7 +196,6 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
  MACE_BENCHMARK(MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC)

 #define MACE_BM_MATMUL(M, K, N)                          \
-  MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float);       \
  MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float);            \
  MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \
  MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_int32, uint8_t);

--- a/mace/ops/sgemm.cc
+++ b/mace/ops/sgemm.cc
--- a/mace/ops/sgemm.h
+++ b/mace/ops/sgemm.h
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This implementation is deprecated. use mace/ops/arm/fp32/gemm.h instead.
-
-#ifndef MACE_OPS_SGEMM_H_
-#define MACE_OPS_SGEMM_H_
-
-#include <memory>
-#include <utility>
-
-#if defined(MACE_ENABLE_NEON)
-#include <arm_neon.h>
-#endif
-
-#include "mace/core/types.h"
-#include "mace/core/allocator.h"
-#include "mace/core/tensor.h"
-
-namespace mace {
-namespace ops {
-
-enum Major {
-  SGemmRowMajor,
-  SGemmColMajor
-};
-
-template<typename T>
-class SGemmMatrixMap {
- public:
-  SGemmMatrixMap() {}
-
-  SGemmMatrixMap(const index_t batch,
-            const index_t row,
-            const index_t col,
-            const Major major,
-            T *data,
-            const bool is_const = false) :
-      batch_(batch),
-      row_(row),
-      col_(col),
-      stride_(major == SGemmRowMajor ? col : row),
-      major_(major),
-      data_(data),
-      is_const_(is_const) {}
-
-  SGemmMatrixMap transpose() const {
-    Major transpose_major =
-        major_ == SGemmRowMajor ? SGemmColMajor : SGemmRowMajor;
-    return SGemmMatrixMap(batch_,
-                          col_,
-                          row_,
-                          transpose_major,
-                          data_,
-                          is_const_);
-  }
-
-  index_t batch() const {
-    return batch_;
-  }
-
-  index_t row() const {
-    return row_;
-  }
-
-  index_t col() const {
-    return col_;
-  }
-
-  index_t stride() const {
-    return stride_;
-  }
-
-  Major map_major() const {
-    return major_;
-  }
-
-  T *data() const {
-    return data_;
-  }
-
-  T *batch_data(index_t batch) const {
-    return data_ + batch * row_ * col_;
-  }
-
-  index_t size() const {
-    return batch_ * row_ * col_;
-  }
-
-  bool is_const() const {
-    return is_const_;
-  }
-
- private:
-  index_t batch_;
-  index_t row_;
-  index_t col_;
-  index_t stride_;
-  Major major_;
-  T *data_;
-  bool is_const_;
-};
-
-typedef Major PackOrder;
-typedef Tensor PackedBlock;
-
-class SGemm {
- public:
-  SGemm()
-      : packed_lhs_(nullptr),
-        packed_rhs_(nullptr),
-        packed_(false) {}
-
-  void operator()(const SGemmMatrixMap<const float> &lhs,
-                  const SGemmMatrixMap<const float> &rhs,
-                  SGemmMatrixMap<float> *result,
-                  ScratchBuffer *scratch_buffer = nullptr);
-
-  void Run(const float *A,
-           const float *B,
-           const index_t batch,
-           const index_t height_a,
-           const index_t width_a,
-           const index_t height_b,
-           const index_t width_b,
-           const bool transpose_a,
-           const bool transpose_b,
-           const bool is_a_weight,
-           const bool is_b_weight,
-           float *C,
-           ScratchBuffer *scratch_buffer = nullptr);
-
-  void PackLhs(const SGemmMatrixMap<const float> &lhs,
-               PackedBlock *packed_block);
-
-  void PackRhs(const SGemmMatrixMap<const float> &rhs,
-               PackedBlock *packed_block);
-
-  void UnPack(const PackedBlock &packed_result,
-              SGemmMatrixMap<float> *matrix_map);
-
- private:
-  void Pack(const SGemmMatrixMap<const float> &src,
-            const PackOrder order,
-            PackedBlock *packed_block);
-
-  void PackPerBatch(const SGemmMatrixMap<const float> &src,
-                    const PackOrder order,
-                    const index_t batch_index,
-                    float *packed_data);
-
-  void UnPackPerBatch(const float *packed_data,
-                      const index_t batch_index,
-                      SGemmMatrixMap<float> *matrix_map);
-
-  void RunInternal(const PackedBlock &lhs,
-                   const PackedBlock &rhs,
-                   const index_t batch,
-                   const index_t height,
-                   const index_t depth,
-                   const index_t width,
-                   PackedBlock *result);
-
-  void RunPerBatch(const float *lhs,
-                   const float *rhs,
-                   const index_t height,
-                   const index_t depth,
-                   const index_t width,
-                   float *result);
-
-  std::unique_ptr<Tensor> packed_lhs_;
-  std::unique_ptr<Tensor> packed_rhs_;
-  std::unique_ptr<Tensor> packed_result_;
-
-  bool packed_;
-};
-
-}  // namespace ops
-}  // namespace mace
-
-#endif  // MACE_OPS_SGEMM_H_
--- a/mace/ops/sgemm_pack_test.cc
+++ b/mace/ops/sgemm_pack_test.cc
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <random>
-#include <vector>
-
-#include "mace/ops/sgemm.h"
-
-namespace mace {
-namespace ops {
-namespace test {
-
-namespace {
-void TestPack(const std::vector<float> &data,
-              const std::vector<float> &expected_data,
-              const index_t height,
-              const index_t width,
-              Major src_order,
-              PackOrder pack_order) {
-  SGemm sg;
-  SGemmMatrixMap<const float>
-      src_matrix(1, height, width, src_order, data.data());
-  PackedBlock packed;
-  packed.Resize({height, width});
-  if (pack_order == PackOrder::SGemmColMajor) {
-    sg.PackLhs(src_matrix, &packed);
-  } else {
-    sg.PackRhs(src_matrix, &packed);
-  }
-
-  auto packed_data = packed.data<float>();
-  for (index_t i = 0; i < packed.size(); ++i) {
-    EXPECT_EQ(expected_data[i], packed_data[i]);
-  }
-}
-
-void TestUnPack(const index_t height,
-                const index_t width,
-                Major src_order,
-                PackOrder pack_order) {
-  static auto seed = static_cast<unsigned int>(time(nullptr));
-  const index_t matrix_size = height * width;
-  std::vector<float> data(matrix_size);
-  for (int i = 0; i < matrix_size; ++i) {
-    data[i] = rand_r(&seed);
-  }
-
-  SGemmMatrixMap<const float>
-      src_matrix(1, height, width, src_order, data.data());
-  PackedBlock packed;
-  packed.Resize({height, width});
-  SGemm sg;
-  if (pack_order == PackOrder::SGemmColMajor) {
-    sg.PackLhs(src_matrix, &packed);
-  } else {
-    sg.PackRhs(src_matrix, &packed);
-  }
-
-  std::vector<float> unpacked(matrix_size);
-  SGemmMatrixMap<float>
-      unpacked_matrix(1, height, width, src_order, unpacked.data());
-  sg.UnPack(packed, &unpacked_matrix);
-  auto unpacked_data = unpacked.data();
-  for (index_t i = 0; i < packed.size(); ++i) {
-    EXPECT_EQ(data[i], unpacked_data[i]);
-  }
-}
-}  // namespace
-
-
-TEST(SGemmPackTest, Pack) {
-  std::vector<float> data =
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-       21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36};
-
-  // For no-transpose lhs
-  TestPack(data,
-           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
-           3, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor);
-#if defined(MACE_ENABLE_NEON)
-  TestPack(data,
-           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16},
-           4, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor);
-  TestPack(data,
-           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19,
-            20},
-           5, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor);
-#if defined(__aarch64__)
-  TestPack(data,
-           {1, 5, 9, 13, 17, 21, 25, 29, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11,
-            15, 19, 23, 27, 31, 4, 8, 12, 16, 20, 24, 28, 32, 33, 34, 35, 36},
-           9, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor);
-#endif
-#endif
-  // For transpose-needed lhs
-  TestPack(data,
-           {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12},
-           3, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor);
-#if defined(MACE_ENABLE_NEON)
-  TestPack(data,
-           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-           4, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor);
-  TestPack(data,
-           {1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15,
-            20},
-           5, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor);
-#if defined(__aarch64__)
-  TestPack(data,
-           {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21,
-            22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 9, 18, 27, 36},
-           9, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor);
-#endif
-#endif
-  // For no-transpose rhs
-  TestPack(data,
-           {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12},
-           4, 3, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-#if defined(MACE_ENABLE_NEON)
-  TestPack(data,
-           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-           4, 4, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestPack(data,
-           {1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15,
-            20},
-           4, 5, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-#endif
-  // For transpose-needed rhs
-  TestPack(data,
-           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
-           4, 3, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-#if defined(MACE_ENABLE_NEON)
-  TestPack(data,
-           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16},
-           4, 4, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestPack(data,
-           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19,
-            20},
-           4, 5, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-#endif
-}
-
-TEST(SGemmPackTest, UnPack) {
-  TestUnPack(4, 3, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 4, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 5, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 100, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 3, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 4, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 5, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 100, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-}
-
-}  // namespace test
-}  // namespace ops
-}  // namespace mace
-
--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
@@ -124,12 +124,11 @@ TEST_F(MaceAPITest, MultipleInputOutput) {
 }

 TEST_F(MaceAPITest, VariableInputShape) {
-  // TODO(liyin): there is a bug of cpu convolution
-//  MaceRun<CPU, float>(1,
-//                      {1, 32, 64, 16},
-//                      {{1, 16, 32, 16}, {1, 32, 64, 16}},
-//                      {{1, 16, 32, 16}, {1, 32, 64, 16}},
-//                      {16, 16, 3, 3});
+  MaceRun<CPU, float>(1,
+                      {1, 32, 64, 16},
+                      {{1, 16, 32, 16}, {1, 32, 64, 16}},
+                      {{1, 16, 32, 16}, {1, 32, 64, 16}},
+                      {16, 16, 3, 3});
  MaceRun<GPU, float>(1,
                      {1, 32, 64, 16},
                      {{1, 16, 32, 16}, {1, 32, 64, 16}},