提交 85258d86 编写于 作者: W wuchenghui

fix invalid memory read

上级 c9490fd3
......@@ -28,6 +28,13 @@
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_NEON
// Pad buffers so NEON loads that read past the logical end stay in bounds
// (avoids out-of-bounds memory access)
#define EXTRA_BUFFER_PAD_SIZE 64
#else
#define EXTRA_BUFFER_PAD_SIZE 0
#endif
namespace mace {
#define SINGLE_ARG(...) __VA_ARGS__
......@@ -212,10 +219,11 @@ class Tensor {
image_shape_.clear();
if (buffer_ != nullptr) {
MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
if (raw_size() > buffer_->size()) buffer_->Resize(raw_size());
if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size())
buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE);
} else {
MACE_CHECK(is_buffer_owner_);
buffer_ = new Buffer(allocator_, raw_size());
buffer_ = new Buffer(allocator_, raw_size() + EXTRA_BUFFER_PAD_SIZE);
}
}
......
......@@ -334,7 +334,7 @@ void Conv2dNeonK3x3S1(const float *input,
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3);
vf02 = vld1q_f32(filter_ptr0 + 6);
vf02 = vld1q_f32(filter_ptr0 + 5);
for (index_t h = 0; h + 1 < out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
......@@ -377,9 +377,9 @@ void Conv2dNeonK3x3S1(const float *input,
vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0);
vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1);
vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2);
vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0);
vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1);
vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2);
vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1);
vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2);
vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3);
// outch 0, height 1
vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0);
......@@ -388,9 +388,9 @@ void Conv2dNeonK3x3S1(const float *input,
vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0);
vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1);
vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2);
vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0);
vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1);
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2);
vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1);
vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2);
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3);
vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01);
......@@ -413,12 +413,12 @@ void Conv2dNeonK3x3S1(const float *input,
} // h
#elif defined(MACE_ENABLE_NEON) // arm v7
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x2_t vf01, vf23, vf45, vf67, vf89;
float32x2_t vf01, vf23, vf45, vf67, vf78;
vf01 = vld1_f32(filter_ptr0);
vf23 = vld1_f32(filter_ptr0 + 2);
vf45 = vld1_f32(filter_ptr0 + 4);
vf67 = vld1_f32(filter_ptr0 + 6);
vf89 = vld1_f32(filter_ptr0 + 8);
vf78 = vld1_f32(filter_ptr0 + 7);
for (index_t h = 0; h + 1 < out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
......@@ -463,7 +463,7 @@ void Conv2dNeonK3x3S1(const float *input,
vo00 = vmlaq_lane_f32(vo00, vi12, vf45, 1);
vo00 = vmlaq_lane_f32(vo00, vi20, vf67, 0);
vo00 = vmlaq_lane_f32(vo00, vi21, vf67, 1);
vo00 = vmlaq_lane_f32(vo00, vi22, vf89, 0);
vo00 = vmlaq_lane_f32(vo00, vi22, vf78, 1);
// outch 0, height 1
vo01 = vmlaq_lane_f32(vo01, vi10, vf01, 0);
......@@ -474,7 +474,7 @@ void Conv2dNeonK3x3S1(const float *input,
vo01 = vmlaq_lane_f32(vo01, vi22, vf45, 1);
vo01 = vmlaq_lane_f32(vo01, vi30, vf67, 0);
vo01 = vmlaq_lane_f32(vo01, vi31, vf67, 1);
vo01 = vmlaq_lane_f32(vo01, vi32, vf89, 0);
vo01 = vmlaq_lane_f32(vo01, vi32, vf78, 1);
vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01);
......@@ -544,7 +544,7 @@ void Conv2dNeonK3x3S2(const float *input,
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr);
vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 6);
vf02 = vld1q_f32(filter_ptr + 5);
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
......@@ -592,21 +592,21 @@ void Conv2dNeonK3x3S2(const float *input,
vo = vfmaq_laneq_f32(vo, vi10, vf01, 0);
vo = vfmaq_laneq_f32(vo, vi11, vf01, 1);
vo = vfmaq_laneq_f32(vo, vi12, vf01, 2);
vo = vfmaq_laneq_f32(vo, vi20, vf02, 0);
vo = vfmaq_laneq_f32(vo, vi21, vf02, 1);
vo = vfmaq_laneq_f32(vo, vi22, vf02, 2);
vo = vfmaq_laneq_f32(vo, vi20, vf02, 1);
vo = vfmaq_laneq_f32(vo, vi21, vf02, 2);
vo = vfmaq_laneq_f32(vo, vi22, vf02, 3);
vst1q_f32(out_base + out_offset, vo);
} // w
} // h
#elif defined(MACE_ENABLE_NEON) // arm v7
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x2_t vf01, vf23, vf45, vf67, vf89;
float32x2_t vf01, vf23, vf45, vf67, vf78;
vf01 = vld1_f32(filter_ptr);
vf23 = vld1_f32(filter_ptr + 2);
vf45 = vld1_f32(filter_ptr + 4);
vf67 = vld1_f32(filter_ptr + 6);
vf89 = vld1_f32(filter_ptr + 8);
vf78 = vld1_f32(filter_ptr + 7);
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
......@@ -656,7 +656,7 @@ void Conv2dNeonK3x3S2(const float *input,
vo = vmlaq_lane_f32(vo, vi12, vf45, 1);
vo = vmlaq_lane_f32(vo, vi20, vf67, 0);
vo = vmlaq_lane_f32(vo, vi21, vf67, 1);
vo = vmlaq_lane_f32(vo, vi22, vf89, 0);
vo = vmlaq_lane_f32(vo, vi22, vf78, 1);
vst1q_f32(out_base + out_offset, vo);
} // w
......
......@@ -104,7 +104,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr);
vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 6);
vf02 = vld1q_f32(filter_ptr + 5);
for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) {
// left
......@@ -180,9 +180,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0);
vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1);
vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2);
vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0);
vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1);
vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2);
vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1);
vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2);
vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3);
// outch 0, height 1
vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0);
......@@ -191,9 +191,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0);
vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1);
vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2);
vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0);
vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1);
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2);
vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1);
vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2);
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3);
#else
// outch 0, height 0
vo00 = vmlaq_lane_f32(vo00, vi00, vget_low_f32(vf00), 0);
......@@ -202,9 +202,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
vo00 = vmlaq_lane_f32(vo00, vi10, vget_low_f32(vf01), 0);
vo00 = vmlaq_lane_f32(vo00, vi11, vget_low_f32(vf01), 1);
vo00 = vmlaq_lane_f32(vo00, vi12, vget_high_f32(vf01), 0);
vo00 = vmlaq_lane_f32(vo00, vi20, vget_low_f32(vf02), 0);
vo00 = vmlaq_lane_f32(vo00, vi21, vget_low_f32(vf02), 1);
vo00 = vmlaq_lane_f32(vo00, vi22, vget_high_f32(vf02), 0);
vo00 = vmlaq_lane_f32(vo00, vi20, vget_low_f32(vf02), 1);
vo00 = vmlaq_lane_f32(vo00, vi21, vget_high_f32(vf02), 0);
vo00 = vmlaq_lane_f32(vo00, vi22, vget_high_f32(vf02), 1);
// outch 0, height 1
vo01 = vmlaq_lane_f32(vo01, vi10, vget_low_f32(vf00), 0);
......@@ -213,9 +213,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
vo01 = vmlaq_lane_f32(vo01, vi20, vget_low_f32(vf01), 0);
vo01 = vmlaq_lane_f32(vo01, vi21, vget_low_f32(vf01), 1);
vo01 = vmlaq_lane_f32(vo01, vi22, vget_high_f32(vf01), 0);
vo01 = vmlaq_lane_f32(vo01, vi30, vget_low_f32(vf02), 0);
vo01 = vmlaq_lane_f32(vo01, vi31, vget_low_f32(vf02), 1);
vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 0);
vo01 = vmlaq_lane_f32(vo01, vi30, vget_low_f32(vf02), 1);
vo01 = vmlaq_lane_f32(vo01, vi31, vget_high_f32(vf02), 0);
vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1);
#endif
vst1q_f32(out_base + out_offset, vo00);
vst1q_f32(out_base + out_offset + out_width, vo01);
......@@ -344,7 +344,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr);
vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 6);
vf02 = vld1q_f32(filter_ptr + 5);
for (h = valid_h_start; h < valid_h_stop; ++h) {
// left
......@@ -409,9 +409,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
vo = vfmaq_laneq_f32(vo, vi10, vf01, 0);
vo = vfmaq_laneq_f32(vo, vi11, vf01, 1);
vo = vfmaq_laneq_f32(vo, vi12, vf01, 2);
vo = vfmaq_laneq_f32(vo, vi20, vf02, 0);
vo = vfmaq_laneq_f32(vo, vi21, vf02, 1);
vo = vfmaq_laneq_f32(vo, vi22, vf02, 2);
vo = vfmaq_laneq_f32(vo, vi20, vf02, 1);
vo = vfmaq_laneq_f32(vo, vi21, vf02, 2);
vo = vfmaq_laneq_f32(vo, vi22, vf02, 3);
#else
// outch 0, height 0
vo = vmlaq_lane_f32(vo, vi00, vget_low_f32(vf00), 0);
......@@ -420,9 +420,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
vo = vmlaq_lane_f32(vo, vi10, vget_low_f32(vf01), 0);
vo = vmlaq_lane_f32(vo, vi11, vget_low_f32(vf01), 1);
vo = vmlaq_lane_f32(vo, vi12, vget_high_f32(vf01), 0);
vo = vmlaq_lane_f32(vo, vi20, vget_low_f32(vf02), 0);
vo = vmlaq_lane_f32(vo, vi21, vget_low_f32(vf02), 1);
vo = vmlaq_lane_f32(vo, vi22, vget_high_f32(vf02), 0);
vo = vmlaq_lane_f32(vo, vi20, vget_low_f32(vf02), 1);
vo = vmlaq_lane_f32(vo, vi21, vget_high_f32(vf02), 0);
vo = vmlaq_lane_f32(vo, vi22, vget_high_f32(vf02), 1);
#endif
vst1q_f32(out_base + out_offset, vo);
} // w
......
......@@ -461,7 +461,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
|| extra_input_width != input_width) {
padded_input_size =
batch * input_channels * (input_height + pad_top + pad_bottom)
* (input_width + pad_left + pad_right) * sizeof(float);
* (input_width + pad_left + pad_right) * sizeof(float) +
EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
}
if (extra_output_height != height || extra_output_width != width) {
......@@ -482,8 +483,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
// decide which convolution function to call
if (use_winograd) {
transformed_input.Resize(transformed_input_shape);
transformed_output.Resize(transformed_output_shape);
transformed_input.Reshape(transformed_input_shape);
transformed_output.Reshape(transformed_output_shape);
const float *transformed_filter_ptr;
if (transformed_filter_.dim_size() == 0) {
if (is_filter_transformed_) {
......@@ -652,7 +653,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
Tensor *pad_output_ptr = output;
if (extra_output_height != height || extra_output_width != width) {
padded_output.Resize({batch, channels, extra_output_height,
padded_output.Reshape({batch, channels, extra_output_height,
extra_output_width});
padded_output.Clear();
pad_output_ptr = &padded_output;
......
......@@ -114,50 +114,28 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
namespace {
template<DeviceType D, typename T>
void ComplexValidTest() {
void ComplexValidTest(index_t batch, index_t channel, index_t height,
index_t width, index_t kernel, index_t multiplier,
int stride) {
testing::internal::LogToStderr();
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 10, 10, 3},
{0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11,
0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23,
0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35,
0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47,
0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59,
0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71,
0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83,
0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95,
0.96, 0.97, 0.98, 0.99, 1.0, 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07,
1.08, 1.09, 1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19,
1.2, 1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3, 1.31,
1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4, 1.41, 1.42, 1.43,
1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.5, 1.51, 1.52, 1.53, 1.54, 1.55,
1.56, 1.57, 1.58, 1.59, 1.6, 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67,
1.68, 1.69, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79,
1.8, 1.81, 1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9, 1.91,
1.92, 1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2.0, 2.01, 2.02, 2.03,
2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 2.11, 2.12, 2.13, 2.14, 2.15,
2.16, 2.17, 2.18, 2.19, 2.2, 2.21, 2.22, 2.23, 2.24, 2.25, 2.26, 2.27,
2.28, 2.29, 2.3, 2.31, 2.32, 2.33, 2.34, 2.35, 2.36, 2.37, 2.38, 2.39,
2.4, 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47, 2.48, 2.49, 2.5, 2.51,
2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58, 2.59, 2.6, 2.61, 2.62, 2.63,
2.64, 2.65, 2.66, 2.67, 2.68, 2.69, 2.7, 2.71, 2.72, 2.73, 2.74, 2.75,
2.76, 2.77, 2.78, 2.79, 2.8, 2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87,
2.88, 2.89, 2.9, 2.91, 2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99});
net.AddInputFromArray<D, float>(
"Filter", {5, 5, 3, 1},
{0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1,
0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21,
0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32,
0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43,
0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54,
0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65,
0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74});
net.AddInputFromArray<D, float>("Bias", {6},
{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
std::vector<float> input_data(batch * height * width * channel);
GenerateRandomRealTypeData({batch, height, width, channel}, &input_data);
net.AddInputFromArray<D, float>("Input", {batch, height, width, channel},
input_data);
std::vector<float> filter_data(kernel * kernel * channel * multiplier);
GenerateRandomRealTypeData({kernel, kernel, channel, multiplier},
&filter_data);
net.AddInputFromArray<D, float>("Filter",
{kernel, kernel, channel, multiplier},
filter_data);
std::vector<float> bias_data(channel * multiplier);
GenerateRandomRealTypeData({channel * multiplier}, &bias_data);
net.AddInputFromArray<D, float>("Bias", {channel * multiplier},
bias_data);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
......@@ -173,7 +151,7 @@ void ComplexValidTest() {
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {2, 2})
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
......@@ -196,7 +174,7 @@ void ComplexValidTest() {
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {2, 2})
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
......@@ -212,25 +190,43 @@ void ComplexValidTest() {
MACE_NOT_IMPLEMENTED;
}
// Check
// expect
index_t out_height = (height - 1) / stride + 1;
index_t out_width = (width - 1) / stride + 1;
index_t pad_top = ((out_height - 1) * stride + kernel - height) >> 1;
index_t pad_left = ((out_width - 1) * stride + kernel - width) >> 1;
index_t out_channels = channel * multiplier;
std::vector<T> expect(batch * out_height * out_width * out_channels);
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
for (index_t m = 0; m < out_channels; ++m) {
index_t out_offset =
((b * out_height + h) * out_width + w) * out_channels + m;
index_t c = m / multiplier;
index_t o = m % multiplier;
float sum = 0;
for (index_t kh = 0; kh < kernel; ++kh) {
for (index_t kw = 0; kw < kernel; ++kw) {
index_t ih = h * stride - pad_top + kh;
index_t iw = w * stride - pad_left + kw;
if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
index_t in_offset =
((b * height + ih) * width + iw) * channel + c;
index_t filter_offset =
(((kh * kernel) + kw) * channel + c) * multiplier + o;
sum += input_data[in_offset] * filter_data[filter_offset];
}
}
}
expect[out_offset] = sum + bias_data[m];
}
}
}
}
auto expected = CreateTensor<T>(
{1, 5, 5, 3},
VectorStaticCast<T>(
{4.48200035, 4.63479996, 4.79079962, 5.85899973, 6.05599976,
6.25699997, 6.38100004, 6.59000015, 6.80300045, 6.90299988,
7.1239996, 7.34899998, 4.03559971, 4.16820002, 4.30319977,
8.90999985, 9.1760006, 9.44599915, 11.20499992, 11.54500103,
11.89000034, 11.74499989, 12.09999943, 12.46000004, 12.28499985,
12.65500069, 13.03000069, 7.00200033, 7.22399998, 7.44900036,
13.4100008, 13.79599953, 14.18599987, 16.60500145, 17.09499741,
17.59000015, 17.14500046, 17.65000153, 18.15999794, 17.68499947,
18.20499992, 18.72999954, 9.97200012, 10.28399944, 10.59899998,
17.90999985, 18.41600037, 18.92599869, 22.00500107, 22.64500046,
23.28999901, 22.54500008, 23.19999886, 23.8599987, 23.0850029,
23.75500107, 24.43000031, 12.94200039, 13.34400082, 13.7489996,
6.97500038, 7.29659986, 7.62060022, 8.32049942, 8.72700024,
9.13650036, 8.5095005, 8.92500019, 9.34349918, 8.69849968,
9.12300014, 9.55049992, 4.55220032, 4.80690002, 5.06340027}));
{1, out_height, out_width, out_channels}, expect);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
......@@ -241,15 +237,23 @@ void ComplexValidTest() {
} // namespace
TEST_F(DepthwiseConv2dOpTest, ComplexCPU) {
ComplexValidTest<DeviceType::CPU, float>();
ComplexValidTest<DeviceType::CPU, float>(1, 3, 10, 10, 5, 1, 2);
}
TEST_F(DepthwiseConv2dOpTest, ComplexCPU3x3s1) {
ComplexValidTest<DeviceType::CPU, float>(1, 3, 10, 10, 3, 1, 1);
}
TEST_F(DepthwiseConv2dOpTest, ComplexCPU3x3s2) {
ComplexValidTest<DeviceType::CPU, float>(1, 3, 10, 10, 3, 1, 2);
}
TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) {
ComplexValidTest<DeviceType::GPU, float>();
ComplexValidTest<DeviceType::GPU, float>(1, 3, 10, 10, 5, 1, 2);
}
TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
ComplexValidTest<DeviceType::GPU, half>();
ComplexValidTest<DeviceType::GPU, half>(1, 3, 10, 10, 5, 1, 2);
}
namespace {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册