Commit fafb7998 authored by 李超

Merge branch 'fix-crop-bug' into 'master'

BUG: fix crop layer bugs.

See merge request !1025
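This change replaces the Caffe-style (axis, offset) cropping logic with a single 4-element offset vector, where an entry of -1 means the corresponding dimension is left uncropped. A minimal sketch of that shape rule follows (illustrative only, not the MACE API; the function name is hypothetical):

    def crop_output(shape0, shape1, offsets):
        """Compute the cropped output shape from input0's shape, input1's
        shape, and a per-dimension offset vector (-1 means "keep")."""
        out = list(shape0)
        for i, off in enumerate(offsets):
            if off >= 0:
                assert shape0[i] - off >= shape1[i], "crop out of bound"
                out[i] = shape1[i]
        return out

    # Example: crop only H and W, leaving N and C untouched.
    # crop_output([1, 5, 5, 3], [1, 3, 3, 3], [-1, 2, 2, -1]) -> [1, 3, 3, 3]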
......@@ -175,7 +175,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
input_data_format = DataFormat::NCHW;
}
}
}
......
......@@ -15,21 +15,34 @@
#include <memory>
#include "mace/core/operator.h"
#include "mace/utils/math.h"
#include "mace/utils/memory.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/crop.h"
#endif // MACE_ENABLE_OPENCL
#include "mace/utils/memory.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class CropOp : public Operation {
class CropOp;
template <class T>
class CropOp<DeviceType::CPU, T> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
: Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 2)),
offset_(Operation::GetRepeatedArgs<int>("offset")) {}
offset_(Operation::GetRepeatedArgs<int>("offset")) {
MACE_CHECK(offset_.size() == 4,
"crop op only supports 4-dims inputs now.");
auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0);
if (has_df) {
// NHWC -> NCHW
offset_ = TransposeShape<int, int>(offset_, {0, 3, 1, 2});
}
}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -47,21 +60,13 @@ class CropOp : public Operation {
std::vector<index_t> output_shape(input0->shape());
for (index_t i = 0; i < in0_dims; ++i) {
int32_t crop_offset = 0;
index_t new_size = input0->dim(i);
if (i >= axis_) {
new_size = input1->dim(i);
if (offset_.size() == 1) {
crop_offset = offset_[0];
} else if (offset_.size() > 1) {
crop_offset = offset_[i - axis_];
}
MACE_CHECK(input0->dim(i) - crop_offset >= input1->dim(i))
<< "the crop for dimension" << i << "is out of bound with size"
<< input1->dim(i) << "and offset" << crop_offset;
if (offset_[i] >= 0) {
output_shape[i] = input1->dim(i);
offsets[i] = offset_[i];
MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i))
<< "the crop for dimension " << i << " is out of bound with size "
<< input1->dim(i) << " and offset " << offsets[i];
}
output_shape[i] = new_size;
offsets[i] = crop_offset;
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_data = output->mutable_data<T>();
......@@ -103,7 +108,6 @@ class CropOp : public Operation {
}
private:
const int axis_;
std::vector<int> offset_;
};
......@@ -113,10 +117,9 @@ class CropOp<DeviceType::GPU, T> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
: Operation(context) {
const int axis = Operation::GetOptionalArg<int>("axis", 2);
if (context->device()->gpu_runtime()->UseImageMemory()) {
kernel_ = make_unique<opencl::image::CropKernel<T>>(
axis, Operation::GetRepeatedArgs<int>("offset"));
Operation::GetRepeatedArgs<int>("offset"));
} else {
MACE_NOT_IMPLEMENTED;
}
......
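Because the converter emits crop offsets in NHWC order while the CPU kernel works in NCHW, the constructor above permutes them with {0, 3, 1, 2}. A rough illustration of that permutation (a hypothetical stand-in for MACE's TransposeShape, not the real template):

    def transpose_shape(values, perm):
        # new[i] = old[perm[i]]; the permutation {0, 3, 1, 2} maps NHWC -> NCHW
        return [values[p] for p in perm]

    # NHWC offsets [-1, 2, 2, -1] (no crop on N/C, offset 2 on H/W)
    # become NCHW offsets [-1, -1, 2, 2].
    assert transpose_shape([-1, 2, 2, -1], [0, 3, 1, 2]) == [-1, -1, 2, 2]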
......@@ -21,107 +21,80 @@ namespace test {
namespace {
template <DeviceType D, typename T>
void CropHelper(int iters, int crop_axis, int dim1, int offset) {
void CropHelper(int iters,
const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1,
int crop_axis,
int offset) {
mace::testing::StopTiming();
OpsTestNet net;
OpDefBuilder("Crop", "CropBM")
.Input("Input0")
.Input("Input1")
.AddIntArg("axis", crop_axis)
.AddIntsArg("offset", {offset})
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
const int kDim0 = 100;
net.AddRandomInput<DeviceType::CPU, T>("Input0", {1, kDim0, dim1, dim1, });
net.AddRandomInput<DeviceType::CPU, T>("Input1",
{1, kDim0 / 2, dim1 / 2, dim1 / 2});
std::vector<int> offsets(4, -1);
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
for (int i = crop_axis; i < 4; ++i) {
offsets[i] = offset;
}
const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * dim1;
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
}
}
} // namespace
#define MACE_BM_CROP_CPU_MACRO(AXIS, DIM, OFFSET) \
static void MACE_BM_CROP_CPU_##AXIS##_##DIM##_##OFFSET(int iters) { \
CropHelper<DeviceType::CPU, float>(iters, AXIS, DIM, OFFSET); \
} \
MACE_BENCHMARK(MACE_BM_CROP_CPU_##AXIS##_##DIM##_##OFFSET)
MACE_BM_CROP_CPU_MACRO(1, 256, 3);
MACE_BM_CROP_CPU_MACRO(2, 256, 3);
MACE_BM_CROP_CPU_MACRO(3, 512, 3);
MACE_BM_CROP_CPU_MACRO(2, 512, 6);
namespace {
template <typename T>
void OpenCLCropHelper(int iters,
const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1,
int crop_axis,
int offset) {
mace::testing::StopTiming();
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
if (D == DeviceType::CPU) {
auto input_shape0 = TransposeShape<index_t, index_t>(shape0, {0, 3, 1, 2});
auto input_shape1 = TransposeShape<index_t, index_t>(shape1, {0, 3, 1, 2});
net.AddRandomInput<D, float>("Input0", input_shape0);
net.AddRandomInput<D, float>("Input1", input_shape1);
} else if (D == DeviceType::GPU) {
// Add input data
net.AddRandomInput<D, T>("Input0", shape0);
net.AddRandomInput<D, T>("Input1", shape1);
} else {
MACE_NOT_IMPLEMENTED;
}
OpDefBuilder("Crop", "CropBM")
.Input("Input0")
.Input("Input1")
.AddIntArg("axis", crop_axis)
.AddIntsArg("offset", {offset})
.AddIntsArg("offset", offsets)
.AddIntArg("has_data_format", 1)
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(DeviceType::GPU);
net.Setup(D);
for (int i = 0; i < 1; ++i) {
net.Run();
}
const int64_t tot =
static_cast<int64_t>(iters) *
(net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
(net.GetTensor("Input0")->size());
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
net.RunOp(DeviceType::GPU);
net.Run();
}
}
} // namespace
#define MACE_BM_CROP_GPU_MACRO(N, H, W, C, AXIS, OFFSET, TYPE) \
static void MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET##\
_##TYPE(int iters) { \
std::vector<index_t> shape0 = {N, H, W, C}; \
std::vector<index_t> shape1 = {N / 2, H / 2, W / 2, C / 2}; \
OpenCLCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \
} \
MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\
##_##TYPE)
MACE_BM_CROP_GPU_MACRO(4, 32, 32, 32, 2, 4, float);
MACE_BM_CROP_GPU_MACRO(8, 32, 32, 64, 1, 0, float);
MACE_BM_CROP_GPU_MACRO(8, 32, 32, 128, 0, 0, float);
MACE_BM_CROP_GPU_MACRO(8, 32, 32, 256, 2, 4, float);
#define MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, DEVICE, TYPE) \
static void MACE_BM_CROP_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET## \
_##DEVICE##_##TYPE(int iters) { \
std::vector<index_t> shape0 = {N, H, W, C}; \
std::vector<index_t> shape1 = {N / 2, H / 2, W / 2, C / 2}; \
CropHelper<DEVICE, TYPE>(iters, shape0, shape1, AXIS, OFFSET); \
} \
MACE_BENCHMARK(MACE_BM_CROP_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\
##_##DEVICE##_##TYPE)
#define MACE_BM_CROP(N, H, W, C, AXIS, OFFSET) \
MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, CPU, float); \
MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, GPU, float); \
MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, GPU, half);
MACE_BM_CROP(4, 32, 32, 32, 2, 4);
MACE_BM_CROP(8, 32, 32, 64, 1, 0);
MACE_BM_CROP(8, 32, 32, 128, 0, 0);
MACE_BM_CROP(8, 32, 32, 256, 2, 4);
MACE_BM_CROP_GPU_MACRO(4, 32, 32, 32, 2, 4, half);
MACE_BM_CROP_GPU_MACRO(8, 32, 32, 64, 1, 0, half);
MACE_BM_CROP_GPU_MACRO(8, 32, 32, 128, 0, 0, half);
MACE_BM_CROP_GPU_MACRO(8, 32, 32, 256, 2, 4, half);
} // namespace test
} // namespace ops
......
......@@ -26,7 +26,6 @@ void RunCrop(const std::vector<index_t> &input_shape,
const std::vector<float> &input_data,
const std::vector<index_t> &input_shape2,
const std::vector<int> &offset,
const int axis,
const std::vector<index_t> &expected_shape,
const std::vector<float> &expected_data) {
OpsTestNet net;
......@@ -39,7 +38,7 @@ void RunCrop(const std::vector<index_t> &input_shape,
.Input("Input1")
.Output("Output")
.AddIntsArg("offset", offset)
.AddIntArg("axis", axis)
.AddIntArg("has_data_format", 1)
.Finalize(net.NewOperatorDef());
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input0",
......@@ -55,7 +54,7 @@ void RunCrop(const std::vector<index_t> &input_shape,
.Input("InputNCHW1")
.Output("OutputNCHW")
.AddIntsArg("offset", offset)
.AddIntArg("axis", axis)
.AddIntArg("has_data_format", 1)
.Finalize(net.NewOperatorDef());
}
......@@ -113,7 +112,7 @@ TEST_F(CropTest, SimpleCPU) {
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0,
4.0, 4.0, 4.0}, {1, 5, 5, 3}, {2, 2}, 2,
4.0, 4.0, 4.0}, {1, 5, 5, 3}, {-1, 2, 2, -1},
{1, 5, 5, 3},
{1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
2.0, 2.0, 2.0, 3.0, 3.0, 3.0,
......@@ -168,7 +167,7 @@ TEST_F(CropTest, SimpleGPU) {
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0,
4.0, 4.0, 4.0}, {1, 5, 5, 3}, {2, 2}, 2,
4.0, 4.0, 4.0}, {1, 5, 5, 3}, {-1, 2, 2, -1},
{1, 5, 5, 3},
{1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
2.0, 2.0, 2.0, 3.0, 3.0, 3.0,
......
......@@ -34,16 +34,14 @@ template <typename T>
class CropKernel : public OpenCLCropKernel {
public:
explicit CropKernel(
const int axis,
const std::vector<int> &offset)
: axis_(axis), offset_(offset) {}
: offset_(offset) {}
MaceStatus Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output) override;
private:
const int axis_;
std::vector<int> offset_;
cl::Kernel kernel_;
uint32_t kwg_size_;
......@@ -68,57 +66,14 @@ MaceStatus CropKernel<T>::Compute(
std::vector<int32_t> offsets(4, 0);
std::vector<index_t> output_shape(input0->shape());
switch (axis_) {
case 0:
if (offset_.size() == 1) {
offsets[0] = offset_[0];
offsets[1] = offset_[0];
offsets[2] = offset_[0];
offsets[3] = offset_[0];
} else if (offset_.size() == 4) {
offsets[0] = offset_[0];
offsets[1] = offset_[2];
offsets[2] = offset_[3];
offsets[3] = offset_[1];
}
for (int i = 0; i < 4; ++i) {
output_shape[i] = input1->dim(i);
}
break;
case 1:
if (offset_.size() == 1) {
offsets[1] = offset_[0];
offsets[2] = offset_[0];
offsets[3] = offset_[0];
} else if (offset_.size() == 3) {
offsets[1] = offset_[1];
offsets[2] = offset_[2];
offsets[3] = offset_[0];
}
for (int i = 1; i < 4; ++i) {
output_shape[i] = input1->dim(i);
}
break;
case 2:
if (offset_.size() == 1) {
offsets[1] = offset_[0];
offsets[2] = offset_[0];
} else if (offset_.size() == 2) {
offsets[1] = offset_[0];
offsets[2] = offset_[1];
}
output_shape[1] = input1->dim(1);
output_shape[2] = input1->dim(2);
break;
case 3:
if (offset_.size() == 1) {
offsets[2] = offset_[0];
}
output_shape[2] = input1->dim(2);
break;
default:
MACE_CHECK(axis_ >= 0 && axis_ < 4, "axis is out of boundary.");
break;
for (index_t i = 0; i < in0_dims; ++i) {
if (offset_[i] >= 0) {
output_shape[i] = input1->dim(i);
offsets[i] = offset_[i];
MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i))
<< "the crop for dimension " << i << " is out of bound with size "
<< input1->dim(i) << " and offset " << offsets[i];
}
}
MACE_CHECK(offsets[3] % 4 == 0,
"MACE opencl only supports cropping channel"
......
......@@ -552,18 +552,20 @@ class CaffeConverter(base_converter.ConverterInterface):
param = caffe_op.layer.crop_param
op.type = MaceOp.Crop.name
axis_arg = op.arg.add()
axis_arg.name = MaceKeyword.mace_axis_str
axis_arg.i = 2
if param.HasField(MaceKeyword.mace_axis_str):
axis_arg.i = param.axis
axis_arg.i = 4 + axis_arg.i if axis_arg.i < 0 else axis_arg.i
axis = param.axis
axis = 4 + axis if axis < 0 else axis
offset_value = -1 * np.ones(4, dtype=np.int32)
offset_len = len(param.offset)
if offset_len == 1:
while axis < 4:
offset_value[axis] = param.offset[0]
axis += 1
else:
offset_value[axis:] = param.offset
offset_arg = op.arg.add()
offset_arg.name = MaceKeyword.mace_offset_str
if len(param.offset) > 0:
offset_arg.ints.extend(list(param.offset))
else:
offset_arg.i = 0
offset_arg.ints.extend(offset_value)
def convert_concat(self, caffe_op):
op = self.convert_general_op(caffe_op)
......
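The converter now always emits a 4-element offset vector: dimensions before the (possibly negative) Caffe axis get -1, and dimensions from the axis onward take the Caffe offsets, broadcasting a single offset across the remaining dimensions when only one is given. A standalone sketch of that mapping (simplified, assumes 4-D inputs; names are illustrative):

    import numpy as np

    def caffe_crop_offsets(axis, offsets):
        axis = 4 + axis if axis < 0 else axis
        offset_value = -1 * np.ones(4, dtype=np.int32)
        if len(offsets) == 1:
            offset_value[axis:] = offsets[0]
        else:
            offset_value[axis:] = offsets
        return offset_value

    # Caffe's default axis=2 with offset=[2, 2] crops only H and W:
    # caffe_crop_offsets(2, [2, 2]) -> array([-1, -1, 2, 2], dtype=int32)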
......@@ -224,7 +224,12 @@ class ShapeInference(object):
def infer_shape_crop(self, op):
mace_check(len(op.input) == 2, "crop layer needs two inputs")
output_shape = self._output_shape_cache[op.input[1]]
output_shape = self._output_shape_cache[op.input[0]]
input1_shape = self._output_shape_cache[op.input[1]]
offsets = ConverterUtil.get_arg(op, MaceKeyword.mace_offset_str).ints
for i in range(len(offsets)):
if offsets[i] >= 0:
output_shape[i] = input1_shape[i]
self.add_output_shape(op, [output_shape])
def infer_shape_channel_shuffle(self, op):
......
......@@ -1012,7 +1012,8 @@ class Transformer(base_converter.ConverterInterface):
elif filter_format == DataFormat.OIHW:
weight.dims[:] = weight.dims[:] + [1, 1]
else:
mace_check("FC does not support filter format %s",
mace_check(False,
"FC does not support filter format %s" %
filter_format.name)
return False
......@@ -1084,6 +1085,16 @@ class Transformer(base_converter.ConverterInterface):
new_axises.sort()
arg.ints[:] = []
arg.ints.extend(new_axises)
elif op.type == MaceOp.Crop.name:
offset_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_offset_str)
mace_check(offset_arg and
ConverterUtil.data_format(op) == DataFormat.NCHW and
len(op.output_shape[0].dims) == 4,
"MACE only support crop with NCHW format")
print("Transpose crop args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(offset_arg.ints, [0, 2, 3, 1])
# transpose op output shape
data_format = ConverterUtil.data_format(op)
......@@ -1147,7 +1158,7 @@ class Transformer(base_converter.ConverterInterface):
elif filter_format == DataFormat.OIHW:
transpose_order = [0, 2, 3, 1]
else:
mace_check("Quantize model does not support conv "
mace_check(False, "Quantize model does not support conv "
"filter format: %s" % filter_format.name)
for op in net.op:
......
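Two complementary permutations are involved: the transformer above converts the NCHW-ordered offsets coming from Caffe into NHWC with [0, 2, 3, 1], and the CPU CropOp constructor converts them back with {0, 3, 1, 2} when the graph runs in NCHW. The two are inverses, which a short check makes clear (illustrative only):

    nchw = [-1, -1, 2, 2]                           # offsets in N, C, H, W order
    nhwc = [nchw[p] for p in [0, 2, 3, 1]]          # transformer: NCHW -> NHWC
    assert [nhwc[p] for p in [0, 3, 1, 2]] == nchw  # CropOp ctor: back to NCHW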