Commit e9d1103b authored by liutuo

fold shape_stridedslice_stack

Parent 50666a81
@@ -90,20 +90,18 @@ void Deconv2dNCHW(const T *input,
} // namespace deconv
struct Deconv2dFunctorBase {
Deconv2dFunctorBase(const int *strides,
Deconv2dFunctorBase(const std::vector<int> &strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const std::vector<index_t> &output_shape,
const ActivationType activation,
const float relux_max_limit,
const bool from_caffe)
const float relux_max_limit)
: strides_(strides),
padding_type_(padding_type),
paddings_(paddings),
output_shape_(output_shape),
activation_(activation),
relux_max_limit_(relux_max_limit),
from_caffe_(from_caffe) {}
relux_max_limit_(relux_max_limit) {}
static void CalcDeconvOutputSize(
const index_t *input_shape, // NHWC
@@ -202,31 +200,28 @@ struct Deconv2dFunctorBase {
padding_size[1] = std::max<int>(0, p_w);
}
const int *strides_; // [stride_h, stride_w]
std::vector<int> strides_; // [stride_h, stride_w]
const Padding padding_type_;
std::vector<int> paddings_;
std::vector<index_t> output_shape_;
const ActivationType activation_;
const float relux_max_limit_;
const bool from_caffe_;
};
template <DeviceType D, typename T>
struct Deconv2dFunctor : Deconv2dFunctorBase {
Deconv2dFunctor(const int *strides,
Deconv2dFunctor(const std::vector<int> &strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const std::vector<index_t> &output_shape,
const ActivationType activation,
const float relux_max_limit,
const bool from_caffe)
const float relux_max_limit)
: Deconv2dFunctorBase(strides,
padding_type,
paddings,
output_shape,
activation,
relux_max_limit,
from_caffe) {}
relux_max_limit) {}
MaceStatus operator()(const Tensor *input, // NCHW
const Tensor *filter, // OIHW
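Note on the constructor change above: the from_caffe flag is gone, and the functor now infers the framework convention from which attributes the model actually supplies. A minimal sketch of the new dispatch rule, with a hypothetical helper name that is not part of the commit:

#include <vector>

// Hypothetical helper for illustration: TensorFlow-converted deconv ops carry
// an output_shape but no padding_values, while Caffe-converted ops carry
// padding_values. An empty paddings vector therefore selects the TensorFlow
// branch (derive paddings from the requested output shape); a non-empty one
// selects the Caffe branch (derive the output shape from the paddings).
inline bool UsesTensorFlowDeconvRule(const std::vector<int> &paddings) {
  return paddings.empty();
}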
@@ -239,13 +234,12 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
if (!from_caffe_) { // tensorflow
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> output_shape(4);
if (paddings_.empty()) { // tensorflow
paddings = std::vector<int>(2, 0);
if (output_shape_.size() == 4) {
output_shape[0] = output_shape_[0];
output_shape[1] = output_shape_[3];
output_shape[2] = output_shape_[1];
output_shape[3] = output_shape_[2];
output_shape = output_shape_;
} else {
MACE_CHECK_NOTNULL(output_shape_tensor);
MACE_CHECK(output_shape_tensor->size() == 4);
@@ -255,36 +249,38 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
output_shape =
std::vector<index_t>(output_shape_data, output_shape_data + 4);
}
paddings_.clear();
paddings_ = std::vector<int>(2, 0);
const index_t t = output_shape[1];
output_shape[1] = output_shape[3];
output_shape[3] = output_shape[2];
output_shape[2] = t;
CalcDeconvPaddingAndInputSize(
input->shape().data(),
filter->shape().data(),
strides_, padding_type_,
strides_.data(), padding_type_,
output_shape.data(),
paddings_.data(), true);
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
paddings.data(), true);
} else { // caffe
output_shape_.clear();
output_shape_ = std::vector<index_t>(4, 0);
paddings = paddings_;
output_shape = std::vector<index_t>(4, 0);
CalcDeconvOutputSize(input->shape().data(),
filter->shape().data(),
strides_,
output_shape_.data(),
paddings_.data(), true);
MACE_RETURN_IF_ERROR(output->Resize(output_shape_));
strides_.data(),
output_shape.data(),
paddings.data(), true);
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
const index_t *in_shape = input->shape().data();
const index_t *out_shape = output->shape().data();
const index_t kernel_hw[2] = {kernel_h, kernel_w};
MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ",
out_shape[1]);
MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ",
output_shape[1]);
MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ",
in_shape[1]);
MACE_CHECK(in_shape[0] == out_shape[0], "Input/Output batch size mismatch");
MACE_CHECK(in_shape[0] == output_shape[0],
"Input/Output batch size mismatch");
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
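On the shape juggling above: in the TensorFlow branch the requested output_shape arrives in NHWC order, while the CPU kernel computes in NCHW, hence the temporary-variable rotation before CalcDeconvPaddingAndInputSize. An illustrative standalone version (index_t is assumed here to mirror MACE's int64_t typedef):

#include <cstdint>
#include <vector>

using index_t = int64_t;  // assumption: matches MACE's index_t

// Illustration only: rotate an NHWC shape into the NCHW order the CPU
// deconv kernel expects, matching the swap done in the hunk above.
std::vector<index_t> NhwcToNchw(const std::vector<index_t> &s) {
  return {s[0], s[3], s[1], s[2]};  // {N, H, W, C} -> {N, C, H, W}
}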
@@ -294,15 +290,15 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
auto output_data = output->mutable_data<T>();
int padding[2];
padding[0] = (paddings_[0] + 1) >> 1;
padding[1] = (paddings_[1] + 1) >> 1;
padding[0] = (paddings[0] + 1) >> 1;
padding[1] = (paddings[1] + 1) >> 1;
deconv::Deconv2dNCHW(input_data,
filter_data,
bias_data,
in_shape,
out_shape,
output_shape.data(),
kernel_hw,
strides_,
strides_.data(),
padding,
output_data);
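The padding halving above, (p + 1) >> 1, converts the total padding stored in paddings into a per-side amount, rounding up so an odd total never under-pads. A worked example:

#include <cassert>

int main() {
  // (total + 1) >> 1 == ceil(total / 2): a total padding of 3 becomes 2 on
  // the side the kernel pads, matching the shift expressions above.
  const int total_padding = 3;
  const int per_side = (total_padding + 1) >> 1;
  assert(per_side == 2);
  return 0;
}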
@@ -319,20 +315,18 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
Deconv2dFunctor(const int *strides,
Deconv2dFunctor(const std::vector<int> &strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const std::vector<index_t> &output_shape,
const ActivationType activation,
const float relux_max_limit,
const bool from_caffe)
const float relux_max_limit)
: Deconv2dFunctorBase(strides,
padding_type,
paddings,
output_shape,
activation,
relux_max_limit,
from_caffe) {}
relux_max_limit) {}
MaceStatus operator()(const Tensor *input,
const Tensor *filter,
......
@@ -15,8 +15,10 @@ __kernel void deconv_2d(KERNEL_ERROR_PARAMS
__private const int out_height,
__private const int out_width,
__private const int out_channel,
__private const int stride,
__private const float stride_r,
__private const int stride_h,
__private const int stride_w,
__private const float stride_h_r,
__private const float stride_w_r,
__private const int align_h,
__private const int align_w,
__private const int padding_h,
@@ -53,18 +55,18 @@ __kernel void deconv_2d(KERNEL_ERROR_PARAMS
DATA_TYPE4 out4 = 0;
#endif
const int n_stride = mad(w_id, stride_r, 0);
const int mod_stride = w_id - mul24(n_stride, stride);
const int w = mad24(mul24(n_stride, 5), stride, mod_stride);
const int n_stride = mad(w_id, stride_w_r, 0);
const int mod_stride = w_id - mul24(n_stride, stride_w);
const int w = mad24(mul24(n_stride, 5), stride_w, mod_stride);
const int b = hb / out_height;
const int h = hb - mul24(b, out_height);
if (w < out_width) {
int start_x = floor((float) (w + align_w) * stride_r);
int start_y = (h + align_h) * stride_r;
int start_x = floor((float) (w + align_w) * stride_w_r);
int start_y = (h + align_h) * stride_h_r;
start_y = max(0, start_y);
int f_start_x = mad24(start_x, stride, padding_w) - w;
int f_start_y = mad24(start_y, stride, padding_h) - h;
int f_start_x = mad24(start_x, stride_w, padding_w) - w;
int f_start_y = mad24(start_y, stride_h, padding_h) - h;
f_start_x = kernel_w - 1 - f_start_x;
f_start_y = kernel_h - 1 - f_start_y;
@@ -79,10 +81,10 @@ __kernel void deconv_2d(KERNEL_ERROR_PARAMS
f_pos_x1 = f_pos_x0 + 1;
f_pos_x2 = f_pos_x0 + 2;
f_pos_x3 = f_pos_x0 + 3;
for (int f_y = f_start_y, idx_h = start_y ; f_y >= 0; f_y -= stride, ++idx_h) {
for (int f_y = f_start_y, idx_h = start_y ; f_y >= 0; f_y -= stride_h, ++idx_h) {
index_y = mad24(b, in_height, idx_h);
in_pos.y = select(index_y, -1, idx_h < 0 || idx_h >= in_height);
for (int f_x = f_start_x, idx_w = start_x; f_x >= 0; f_x -= stride, ++idx_w) {
for (int f_x = f_start_x, idx_w = start_x; f_x >= 0; f_x -= stride_w, ++idx_w) {
f_pos_y = mad24(f_y, kernel_w, f_x);
f_pos_y = mad24(c, kernel_size, f_pos_y);
weight0 = READ_IMAGET(weights, SAMPLER, (int2)(f_pos_x0, f_pos_y));
@@ -141,24 +143,24 @@ __kernel void deconv_2d(KERNEL_ERROR_PARAMS
out_pos.x = mad24(c, out_width, ow);
WRITE_IMAGET(output, out_pos, out0);
ow += stride;
ow += stride_w;
if (ow >= out_width) return;
out_pos.x += stride;
out_pos.x += stride_w;
WRITE_IMAGET(output, out_pos, out1);
ow += stride;
ow += stride_w;
if (ow >= out_width) return;
out_pos.x += stride;
out_pos.x += stride_w;
WRITE_IMAGET(output, out_pos, out2);
ow += stride;
ow += stride_w;
if (ow >= out_width) return;
out_pos.x += stride;
out_pos.x += stride_w;
WRITE_IMAGET(output, out_pos, out3);
ow += stride;
ow += stride_w;
if (ow >= out_width) return;
out_pos.x += stride;
out_pos.x += stride_w;
WRITE_IMAGET(output, out_pos, out4);
}
}
\ No newline at end of file
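On the kernel arithmetic above: dividing the work-item index by the stride is replaced by a multiply with the precomputed reciprocal (stride_w_r) plus mul24/mad24, which is typically cheaper on GPUs. A host-side sketch of the same arithmetic; it is exact for the small positive strides used here, though a float reciprocal is not a general substitute for integer division:

#include <cassert>

int main() {
  const int stride_w = 2;
  const float stride_w_r = 1.0f / static_cast<float>(stride_w);
  const int w_id = 7;
  // mad(w_id, stride_w_r, 0) plus truncation == integer division by stride_w.
  const int n_stride = static_cast<int>(w_id * stride_w_r);
  // w_id - n_stride * stride_w recovers the remainder without a modulo.
  const int mod_stride = w_id - n_stride * stride_w;
  assert(n_stride == 3 && mod_stride == 1);
  return 0;
}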
@@ -24,7 +24,7 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *strides,
const int *paddings,
const ActivationType activation,
const float relux_max_limit,
@@ -42,17 +42,20 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
MACE_CHECK(stride > 0, "stride should > 0.");
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
#define MACE_WIDTH_BLK 5
const index_t n_strides = (width + stride - 1) / stride;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride;
const float stride_r = 1.f / static_cast<float>(stride);
((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (paddings[0] + 1) >> 1;
const int padding_w = (paddings[0] + 1) >> 1;
const int padding_w = (paddings[1] + 1) >> 1;
const int align_h = stride - 1 - padding_h;
const int align_w = stride - 1 - padding_w;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = OpenCLRuntime::Global();
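Besides splitting the stride into stride_h/stride_w, the hunk above also fixes a copy-paste bug: padding_w previously read paddings[0] (the height total) instead of paddings[1]. Illustrative values (my own, not from the commit) showing the difference and the derived alignment offsets:

int main() {
  const int paddings[2] = {2, 4};  // {total_h, total_w}, illustrative only
  const int stride_h = 2, stride_w = 2;
  const int padding_h = (paddings[0] + 1) >> 1;  // 1
  const int padding_w = (paddings[1] + 1) >> 1;  // 2 (the old code yielded 1)
  const int align_h = stride_h - 1 - padding_h;  // 0
  const int align_w = stride_w - 1 - padding_w;  // -1
  (void)align_h; (void)align_w;
  return 0;
}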
@@ -113,8 +116,10 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int32_t>(height));
kernel->setArg(idx++, static_cast<int32_t>(width));
kernel->setArg(idx++, static_cast<int32_t>(channels));
kernel->setArg(idx++, static_cast<int32_t>(stride));
kernel->setArg(idx++, stride_r);
kernel->setArg(idx++, static_cast<int32_t>(stride_h));
kernel->setArg(idx++, static_cast<int32_t>(stride_w));
kernel->setArg(idx++, stride_h_r);
kernel->setArg(idx++, stride_w_r);
kernel->setArg(idx++, static_cast<int32_t>(align_h));
kernel->setArg(idx++, static_cast<int32_t>(align_w));
kernel->setArg(idx++, static_cast<int32_t>(padding_h));
@@ -152,34 +157,43 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
if (!from_caffe_) {
std::vector<int> paddings(2);
std::vector<index_t> output_shape(4);
if (paddings_.empty()) {
paddings = std::vector<int>(2, 0);
if (output_shape_.size() != 4) {
MACE_CHECK_NOTNULL(output_shape_tensor);
MACE_CHECK(output_shape_tensor->size() == 4);
Tensor::MappingGuard output_shape_mapper(output_shape_tensor);
auto output_shape_data =
output_shape_tensor->data<int32_t>();
output_shape_ =
output_shape =
std::vector<index_t>(output_shape_data, output_shape_data + 4);
} else {
output_shape = output_shape_;
}
paddings_.clear();
paddings_ = std::vector<int>(2, 0);
CalcDeconvPaddingAndInputSize(input->shape().data(), filter->shape().data(),
strides_, padding_type_, output_shape_.data(),
paddings_.data());
CalcDeconvPaddingAndInputSize(input->shape().data(),
filter->shape().data(),
strides_.data(),
padding_type_,
output_shape.data(),
paddings.data());
} else {
output_shape_.clear();
output_shape_ = std::vector<index_t>(4, 0);
CalcDeconvOutputSize(input->shape().data(), filter->shape().data(),
strides_, output_shape_.data(), paddings_.data());
paddings = paddings_;
output_shape = std::vector<index_t>(4, 0);
CalcDeconvOutputSize(input->shape().data(),
filter->shape().data(),
strides_.data(),
output_shape.data(),
paddings.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape_, BufferType::IN_OUT_CHANNEL,
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape_, output_image_shape));
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return Deconv2dOpencl(&kernel_, input, filter, bias, strides_[0],
paddings_.data(), activation_, relux_max_limit_,
return Deconv2dOpencl(&kernel_, input, filter, bias, strides_.data(),
paddings.data(), activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_, &kernel_error_);
}
......
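In both the CPU and GPU paths the computed paddings and output_shape now live in per-call locals instead of being written back into paddings_ and output_shape_. A plausible rationale (my reading, not stated in the commit): mutating the members made the branch selection state-dependent across runs, as sketched here:

#include <vector>

// Hypothetical reduction of the old pattern: after the first run filled
// paddings_, a TensorFlow-style op no longer satisfied paddings_.empty()
// and would take the Caffe branch on every subsequent run.
struct OldStyleFunctor {
  std::vector<int> paddings_;  // model attribute; empty for TensorFlow ops
  bool RanTensorFlowBranch() {
    if (paddings_.empty()) {
      paddings_ = std::vector<int>(2, 0);  // member mutation, as before
      return true;
    }
    return false;  // a second call lands here even for a TensorFlow op
  }
};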
@@ -19,23 +19,22 @@
#include "mace/core/operator.h"
#include "mace/kernels/deconv_2d.h"
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class Deconv2dOp : public ConvPool2dOpBase<D, T> {
class Deconv2dOp : public Operator<D, T> {
public:
Deconv2dOp(const OperatorDef &op_def, Workspace *ws)
: ConvPool2dOpBase<D, T>(op_def, ws),
functor_(this->strides_.data(),
this->padding_type_,
this->paddings_,
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetRepeatedArgs<int>("strides"),
static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(SAME))),
OperatorBase::GetRepeatedArgs<int>("padding_values"),
OperatorBase::GetRepeatedArgs<index_t>("output_shape"),
kernels::ActivationType::NOOP,
0.0f,
OperatorBase::GetOptionalArg<bool>("from_caffe", false)) {}
0.0f) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......
@@ -41,7 +41,6 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
net.TransformDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
bool from_caffe = output_shape.size() != 4;
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
@@ -55,7 +54,6 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.AddIntArg("padding", padding)
.AddIntsArg("padding_values", padding_size)
.AddIntsArg("output_shape", output_shape)
.AddIntArg("from_caffe", from_caffe)
.Finalize(net.NewOperatorDef());
net.RunOp(D);
@@ -74,7 +72,6 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.AddIntArg("padding", padding)
.AddIntsArg("padding_values", padding_size)
.AddIntsArg("output_shape", output_shape)
.AddIntArg("from_caffe", from_caffe)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
@@ -89,7 +86,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
template <DeviceType D>
void TestNHWCSimple3x3SAME_S1() {
RunTestSimple<D>({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 1, Padding::SAME,
{0, 0}, {1, 3, 3, 3}, {3, 3, 3, 1},
{}, {1, 3, 3, 3}, {3, 3, 3, 1},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 3, 3, 3}, {4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 6, 6, 9, 9,
@@ -101,7 +98,7 @@ void TestNHWCSimple3x3SAME_S1() {
{1, 3, 3, 3}, {4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 6, 6, 9, 9,
9, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 4, 4});
RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::SAME,
{0, 0}, {1, 3, 3, 3}, {3, 3, 3, 1},
{}, {1, 3, 3, 3}, {3, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
{1, 3, 3, 3}, {54, 66, 78, 126, 147, 168, 130, 146, 162,
@@ -119,7 +116,7 @@ void TestNHWCSimple3x3SAME_S1() {
template <DeviceType D>
void TestNHWCSimple3x3SAME_S2() {
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, Padding::SAME, {0, 0},
{1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, Padding::SAME, {},
{1, 6, 6, 3}, {3, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 6, 6, 3},
@@ -137,7 +134,7 @@ void TestNHWCSimple3x3SAME_S2() {
1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4,
2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1});
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 2, Padding::SAME, {0, 0},
{1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 2, Padding::SAME, {},
{1, 6, 6, 3}, {3, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
@@ -167,7 +164,7 @@ template <DeviceType D>
void TestNHWCSimple3x3SAME_S2_1() {
RunTestSimple<D>(
{1, 3, 3, 1}, {12, 18, 12, 18, 27, 18, 12, 18, 12}, 2, Padding::SAME,
{0, 0}, {1, 5, 5, 3}, {3, 3, 3, 1},
{}, {1, 5, 5, 3}, {3, 3, 3, 1},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 5, 5, 3},
@@ -181,7 +178,7 @@ void TestNHWCSimple3x3SAME_S2_1() {
template <DeviceType D>
void TestNHWCSimple3x3VALID_S2() {
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, Padding::VALID, {0, 0},
{1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, 2, Padding::VALID, {},
{1, 7, 7, 3}, {3, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 7, 7, 3},
@@ -197,7 +194,7 @@ void TestNHWCSimple3x3VALID_S2() {
template <DeviceType D>
void TestNHWCSimple3x3VALID_S1() {
RunTestSimple<D>(
{1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::VALID, {0, 0},
{1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, 1, Padding::VALID, {},
{1, 5, 5, 3}, {3, 3, 3, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
@@ -212,7 +209,7 @@ void TestNHWCSimple3x3VALID_S1() {
template <DeviceType D>
void TestNHWCSimple2x2SAME() {
RunTestSimple<D>({1, 2, 2, 1}, {1, 1, 1, 1}, 1, Padding::SAME, {0, 0},
RunTestSimple<D>({1, 2, 2, 1}, {1, 1, 1, 1}, 1, Padding::SAME, {},
{1, 2, 2, 1}, {3, 3, 1, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
{1, 2, 2, 1}, {4.f, 4.f, 4.f, 4.f});
@@ -221,7 +218,7 @@ void TestNHWCSimple2x2SAME() {
template <DeviceType D>
void TestNHWCSimple2x2VALID() {
RunTestSimple<D>(
{1, 2, 2, 1}, {1, 1, 1, 1}, 2, Padding::VALID, {0, 0}, {1, 5, 5, 1},
{1, 2, 2, 1}, {1, 1, 1, 1}, 2, Padding::VALID, {}, {1, 5, 5, 1},
{3, 3, 1, 1}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
{1, 5, 5, 1},
{1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f, 2.f, 2.f, 4.f,
@@ -333,7 +330,6 @@ void TestComplexDeconvNxNS12(const int batch,
paddings.push_back(padding);
paddings.push_back(padding);
}
bool from_caffe = output_shape.size() != 4;
// Construct graph
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputNCHW")
@@ -344,7 +340,6 @@ void TestComplexDeconvNxNS12(const int batch,
.AddIntArg("padding", type)
.AddIntsArg("padding_values", paddings)
.AddIntsArg("output_shape", output_shape)
.AddIntArg("from_caffe", from_caffe)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
@@ -375,7 +370,6 @@ void TestComplexDeconvNxNS12(const int batch,
.AddIntArg("padding", type)
.AddIntsArg("padding_values", paddings)
.AddIntsArg("output_shape", output_shape)
.AddIntArg("from_caffe", from_caffe)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
......
@@ -39,8 +39,20 @@ class ShapeOp : public Operator<D, T> {
Tensor::MappingGuard output_guard(output);
int32_t *output_data = output->mutable_data<int32_t>();
for (index_t i = 0; i < input->dim_size(); ++i) {
output_data[i] = input->dim(i);
const int data_format =
OperatorBase::GetOptionalArg<int>("data_format", 0);
if (input->dim_size() == 4 &&
D == DeviceType::CPU &&
data_format == DataFormat::NCHW) {
// transpose NCHW to NHWC for cpu runtime
output_data[0] = static_cast<int32_t>(input->dim(0));
output_data[1] = static_cast<int32_t>(input->dim(2));
output_data[2] = static_cast<int32_t>(input->dim(3));
output_data[3] = static_cast<int32_t>(input->dim(1));
} else {
for (unsigned int i = 0; i < input->dim_size(); ++i) {
output_data[i] = static_cast<int32_t>(input->dim(i));
}
}
SetFutureDefaultWaitFn(future);
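The ShapeOp change mirrors the layout convention used elsewhere in this commit: on the CPU runtime 4-D tensors are stored NCHW, while the graph-level shape is NHWC, so Shape reports the dims in 0, 2, 3, 1 order. A standalone illustration:

#include <cstdint>

int main() {
  const int64_t nchw[4] = {1, 3, 6, 6};  // runtime layout: {N, C, H, W}
  // Reported layout: {N, H, W, C}, matching the reordering in the hunk above.
  const int32_t out[4] = {static_cast<int32_t>(nchw[0]),
                          static_cast<int32_t>(nchw[2]),
                          static_cast<int32_t>(nchw[3]),
                          static_cast<int32_t>(nchw[1])};
  return out[3] == 3 ? 0 : 1;
}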
......
@@ -167,7 +167,6 @@ class MaceKeyword(object):
mace_transpose_b_str = 'transpose_b'
mace_op_data_type_str = 'T'
mace_offset_str = 'offset'
mace_from_caffe_str = 'from_caffe'
mace_opencl_max_image_size = "opencl_max_image_size"
mace_seperate_buffer_str = 'seperate_buffer'
mace_scalar_input_index_str = 'scalar_input_index'
......
@@ -414,10 +414,6 @@ class CaffeConverter(base_converter.ConverterInterface):
op.type = MaceOp.Deconv2D.name
from_caffe_arg = op.arg.add()
from_caffe_arg.name = MaceKeyword.mace_from_caffe_str
from_caffe_arg.i = 1
self.add_stride_pad_kernel_arg(param, op)
# dilation is specific for convolution in caffe
dilations = [1, 1]
......