Fix quantize kernel when pad != 0

405630c7 · hjchen2 · ee79fcf4 · 405630c7 · 405630c7 · 405630c7
4 changed files
--- a/src/operators/kernel/arm/quantize_kernel.cpp
+++ b/src/operators/kernel/arm/quantize_kernel.cpp
@@ -379,8 +379,8 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
        const float *x3 = input3 + h * input_w;
        int loop = input_w >> 4;
        int remain = input_w & 0xF;
-        int pad_loop = paddings[1] >> 1;
-        int pad_remain = paddings[1] & 0x1;
+        int pad_loop = paddings[1] >> 1;  // (paddings[1] << 1) >> 2
+        int pad_remain = (paddings[1] << 1) & 0x3;
        int remain_steps = remain;
        asm volatile(
            "vdup.f32   q0, %[scale]        \n"
@@ -596,7 +596,7 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,

            "store_pad_2w_%=:               \n"
            "cmp        %[pad_remain], #2   \n"
-            "ble        store_pad_1w_%=     \n"
+            "blt        store_pad_1w_%=     \n"
            "vst1.16    {d0[0]}, [%[y0]]!   \n"
            "vst1.16    {d0[0]}, [%[y1]]!   \n"
            "vst1.16    {d0[0]}, [%[y2]]!   \n"
@@ -605,7 +605,7 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,

            "store_pad_1w_%=:               \n"
            "cmp        %[pad_remain], #1   \n"
-            "ble        end_%=              \n"
+            "blt        end_%=              \n"
            "vst1.8    {d0[0]}, [%[y0]]!    \n"
            "vst1.8    {d0[0]}, [%[y1]]!    \n"
            "vst1.8    {d0[0]}, [%[y2]]!    \n"
@@ -669,8 +669,8 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
        const float *x0 = input0 + h * input_w;
        int loop = input_w >> 4;
        int remain = input_w & 0xF;
-        int pad_loop = paddings[1] >> 1;
-        int pad_remain = paddings[1] & 0x1;
+        int pad_loop = paddings[1] >> 1;  // (paddings[1] << 1) >> 2
+        int pad_remain = (paddings[1] << 1) & 0x3;
        asm volatile(
            "vdup.f32   q0, %[scale]        \n"
            "cmp        %[loop], #0         \n"
@@ -754,14 +754,14 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,

            "pad_remain_%=:                 \n"
            "cmp        %[pad_remain], #2   \n"
-            "ble        store_pad_1w_%=     \n"
+            "blt        store_pad_1w_%=     \n"
            "vst1.16    {d0[0]}, [%[y0]]!   \n"
            "sub        %[pad_remain], #2   \n"

            "store_pad_1w_%=:               \n"
            "cmp        %[pad_remain], #1   \n"
-            "ble        end_%=              \n"
-            "vst1.8    {d0[0]}, [%[y0]]!    \n"
+            "blt        end_%=              \n"
+            "vst1.8     {d0[0]}, [%[y0]]!   \n"
            "end_%=:                        \n"
            : [x0] "+r"(x0), [y0] "+r"(y0), [loop] "+r"(loop),
              [remain] "+r"(remain), [pad_loop] "+r"(pad_loop),
@@ -795,10 +795,10 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
  // only support int8 currently
  float scale = 127 / max_abs;
  param.online_scale_->mutable_data<float>()[0] = max_abs;
-  //  const auto &paddings = param.paddings_;
-  std::vector<int> paddings = {0, 0};
-  //  const auto padding_val = param.padding_val_;
-  int8_t padding_val = 127;
+  const auto &paddings = param.paddings_;
+  // std::vector<int> paddings = {0, 0};
+  // const auto padding_val = param.padding_val_;
+  int8_t padding_val = 0;
  switch (param.round_type_) {
    case ROUND_NEAREST_TO_EVEN:
      quantize_round_to_even(input, scale, paddings, padding_val, output);

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -2536,6 +2536,11 @@ class QuantizeParam : public OpParam {
    if (HasAttr("round_type", attrs)) {
      round_type_ = GetAttr<RoundType>("round_type", attrs);
    }
+    // get paddings
+    paddings_ = std::vector<int>({0, 0});
+    if (HasAttr("paddings", attrs)) {
+      paddings_ = GetAttr<vector<int>>("paddings", attrs);
+    }
  }

 public:

--- a/src/operators/quantize_op.cpp
+++ b/src/operators/quantize_op.cpp
@@ -22,7 +22,10 @@ namespace operators {

 template <typename DeviceType, typename T>
 void QuantizeOp<DeviceType, T>::InferShape() const {
-  const auto &input_dims = this->param_.input_->dims();
+  auto input_dims = this->param_.input_->dims();
+  const std::vector<int> &paddings = this->param_.paddings_;
+  input_dims[2] += 2 * paddings[0];
+  input_dims[3] += 2 * paddings[1];
  this->param_.output_->Resize(input_dims);
  auto scale_dims = framework::make_ddim(std::vector<int>{1});
  this->param_.online_scale_->Resize(scale_dims);

--- a/test/operators/test_quantize_op.cpp
+++ b/test/operators/test_quantize_op.cpp
@@ -12,58 +12,128 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/quantize_op.h"

 namespace paddle_mobile {
+namespace round {  // rounding modes; used as the template tag of Round<T>
+enum RoundType {
+  RoundToEven = 0,       // exact .5 ties resolve to the even neighbour
+  RoundAwayZero = 1,     // std::round behaviour
+  RoundTowardsZero = 2,  // truncation through an int8_t conversion
+};
+}  // namespace round

-static float find_abs_max(const Tensor *input) {
-  float max_abs = 0.f;
-  const float *x = input->data<const float>();
-  size_t size = input->numel();
-  for (size_t i = 0; i < size; ++i) {
-    float value = std::abs(x[i]);
-    if (value > max_abs) {
-      max_abs = value;
+template <round::RoundType T>
+static int8_t Round(float x);  // primary template; one specialization per mode
+
+template <>
+int8_t Round<round::RoundAwayZero>(float x) {  // no `static`: illegal here
+  return std::round(x);  // halfway cases round away from zero
+}
+
+template <>
+int8_t Round<round::RoundTowardsZero>(float x) {  // linkage comes from primary
+  return int8_t(x);  // float -> integer conversion drops the fraction
+}
+
+template <>
+int8_t Round<round::RoundToEven>(float x) {  // no `static` on specializations
+  int8_t ret = 0;
+  float v = std::round(x);  // nearest integer, halfway cases away from zero
+  int32_t q = static_cast<int32_t>(v);
+  if (std::abs(std::abs(q - x) - 0.5) > 0) {  // std::abs: keep the fraction
+    ret = q;
+  } else {
+    if (std::abs(q) % 2 == 0) {
+      ret = q;
+    } else {
+      ret = q + ((q > 0) ? -1 : 1);  // odd on a tie: step toward zero to even
     }
   }
-  return max_abs;
+  return ret;
 }

-static void quantize_round_to_even(const Tensor *input, const float scale,
-                                   Tensor *output) {
+template <round::RoundType T>  // T selects the Round<T> rounding mode
+static void quantize(const Tensor *input, const float scale, const int pad,
+                     const int8_t pad_val, Tensor *output) {
+  int batch_size = input->dims()[0];
+  int channels = input->dims()[1];
+  int input_h = input->dims()[2];
+  int input_w = input->dims()[3];
+  int output_h = output->dims()[2];
+  int output_w = output->dims()[3];
+  size_t input_spatial = input_h * input_w;  // elements per input plane
+  size_t output_spatial = output_h * output_w;  // elements per output plane
   const float *x = input->data<const float>();
   int8_t *y = output->mutable_data<int8_t>();
-  size_t size = input->numel();
-  for (size_t i = 0; i < size; ++i) {
-    float value = x[i] * scale;
-    float v = round(value);
-    int32_t q = (int32_t)v;
-    if (abs(abs(q - value) - 0.5) > 0) {
-      y[i] = q;
-    } else {
-      if (abs(q) % 2 == 0) {
-        y[i] = q;
-      } else {
-        y[i] = q + ((q > 0) ? -1 : 1);
+
+  std::cout << "pad: " << pad << ", pad_val: " << int(pad_val) << std::endl;
+  for (int nc = 0; nc < batch_size * channels; ++nc) {  // one plane per step
+    const float *xh = x + nc * input_spatial;
+    int8_t *yh = y + nc * output_spatial;
+    // pad top: `pad` full output rows filled with pad_val
+    for (int h = 0; h < pad; ++h, yh += output_w) {
+      for (int w = 0; w < output_w; ++w) {
+        yh[w] = pad_val;
+      }
+    }
+    for (int h = 0; h < input_h; ++h, yh += output_w, xh += input_w) {
+      // pad left: first `pad` cells of the output row
+      for (int w = 0; w < pad; ++w) {
+        yh[w] = pad_val;
+      }
+      for (int w = 0; w < input_w; ++w) {
+        yh[w + pad] = Round<T>(xh[w] * scale);  // scale, then round to int8
+      }
+      // pad right: `pad` cells following the quantized values
+      for (int w = 0; w < pad; ++w) {
+        yh[pad + input_w + w] = pad_val;
+      }
+    }
+    // pad bottom: `pad` full output rows filled with pad_val
+    for (int h = 0; h < pad; ++h, yh += output_w) {
+      for (int w = 0; w < output_w; ++w) {
+        yh[w] = pad_val;
       }
     }
   }
 }

-static void quantize_round_to_nearest(const Tensor *input, const float scale,
-                                      Tensor *output) {
+static float find_abs_max(const Tensor *input) {  // largest |x| in the tensor
+  float max_abs = 0.f;  // also the result when the loop never runs
   const float *x = input->data<const float>();
-  int8_t *y = output->mutable_data<int8_t>();
   size_t size = input->numel();
   for (size_t i = 0; i < size; ++i) {
-    y[i] = round(x[i] * scale);
+    float value = std::abs(x[i]);
+    if (value > max_abs) {
+      max_abs = value;
+    }
   }
+  return max_abs;
 }

-int TestQuqntizeOp() {
-  framework::DDim dim = framework::make_ddim({1, 3, 224, 224});
+int TestQuqntizeOp(int argc, char *argv[]) {
+  if (argc < 5) {
+    std::cout
+        << "Usage: ./test-quantize-op batch_size channel height width [pad]"
+        << std::endl;
+    return 1;
+  }
+  int pad = 0;
+  int batch_size = atoi(argv[1]);
+  int channel = atoi(argv[2]);
+  int height = atoi(argv[3]);
+  int width = atoi(argv[4]);
+  if (argc == 6) {
+    pad = atoi(argv[5]);
+  }
+  std::cout << "batch_size: " << batch_size << ", channel: " << channel
+            << ", height: " << height << ", width: " << width << std::endl;
+  framework::DDim dim =
+      framework::make_ddim({batch_size, channel, height, width});

  VariableNameMap inputs;
  VariableNameMap outputs;
@@ -80,6 +150,7 @@ int TestQuqntizeOp() {
  auto output_scale_var = scope.get()->Var("output_scale");

  framework::AttributeMap attrs;
+  attrs["paddings"].Set<vector<int>>(std::vector<int>({pad, pad}));
  auto *op = new operators::QuantizeOp<CPU, float>("quantize", inputs, outputs,
                                                   attrs, scope);
  op->InferShape();
@@ -96,10 +167,11 @@ int TestQuqntizeOp() {
                        output_scale_cmp, output_scale_data[0]);

  framework::Tensor output_cmp;
-  output_cmp.Resize(dim);
+  output_cmp.Resize(output->dims());
  float scale = 127 / output_scale_cmp;
-  // quantize_round_to_even(input, scale, &output_cmp);
-  quantize_round_to_nearest(input, scale, &output_cmp);
+  // quantize<round::RoundToEven>(input, scale, pad, 0, &output_cmp);
+  // quantize<round::RoundAwayZero>(input, scale, pad, 0, &output_cmp);
+  quantize<round::RoundTowardsZero>(input, scale, pad, 0, &output_cmp);
  int8_t *output_cmp_data = output_cmp.data<int8_t>();
  for (int i = 0; i < output->numel(); ++i) {
    PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
@@ -113,4 +185,6 @@ int TestQuqntizeOp() {

 }  // namespace paddle_mobile

-int main() { return paddle_mobile::TestQuqntizeOp(); }
+int main(int argc, char *argv[]) {
+  return paddle_mobile::TestQuqntizeOp(argc, argv);  // exit code = test result
+}