From a9133cf4e82056b0dcd7225ca66bded5a05f1c03 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Thu, 14 Mar 2019 18:42:16 +0800
Subject: [PATCH] Resize feed and fetch variables before infer shape, and
 reimplement scale kernel according to fluid

---
 src/framework/executor.cpp                  |   7 +-
 src/operators/kernel/arm/scale_kernel.cpp   | 158 +++++-------------
 .../kernel/arm/sequence_pool_kernel.cpp     |  14 +-
 src/operators/op_param.h                    |  28 +---
 4 files changed, 60 insertions(+), 147 deletions(-)

diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index b5fab192aa..a2047e845a 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -56,8 +56,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
       use_optimize_ ? program_.optimizeProgram : program_.originProgram;
   PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                         "program_desc_ should not be nullptr");
-  const auto &blocks = program_desc_->Blocks();
+  // resize feed and fetch list
+  // should init feed and fetch variables before infer shape
+  InitFeedFetchList();
 
+  const auto &blocks = program_desc_->Blocks();
   std::shared_ptr<BlockDesc> block_desc = blocks[0];
   std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
   for (int j = 0; j < ops.size(); ++j) {
@@ -79,8 +82,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
   } else {
     InitMemory();
   }
-  // resize feed and fetch list
-  InitFeedFetchList();
 
 #ifdef PADDLE_MOBILE_FPGA
   program_.scope->EraseVars({"feed", "fetch"});
diff --git a/src/operators/kernel/arm/scale_kernel.cpp b/src/operators/kernel/arm/scale_kernel.cpp
index bded56275f..823f1e30cb 100644
--- a/src/operators/kernel/arm/scale_kernel.cpp
+++ b/src/operators/kernel/arm/scale_kernel.cpp
@@ -15,131 +15,55 @@ limitations under the License. */
 #ifdef SCALE_OP
 
 #include "operators/kernel/scale_kernel.h"
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
 
 namespace paddle_mobile {
 namespace operators {
 
-/*
- * @b 特化到具体平台的实现, param 从 op 层传入
- * */
 template <>
 void ScaleKernel<CPU, float>::Compute(const ScaleParam<CPU> &param) {
-  const auto *input_x = param.InputX();
-  auto *input_x_ptr = input_x->data<float>();
-  auto *out = param.Out();
-  auto *out_ptr = out->mutable_data<float>();
-
-  const vector<float> scales = param.Scales();
-  bool has_bias = param.HasBias();
-
-  const int dim_size = input_x->dims().size();
-  switch (dim_size) {
-    case 1: {
-      const int input_width = input_x->dims()[0];
-      if (has_bias) {
-        const vector<float> biases = param.Biases();
-        #pragma omp parallel for
-        for (int w = 0; w < input_width; w++) {
-          out_ptr[w] = input_x_ptr[w] * scales[w] + biases[w];
-        }
-      } else {
-        #pragma omp parallel for
-        for (int w = 0; w < input_width; w++) {
-          out_ptr[w] = input_x_ptr[w] * scales[w];
-        }
-      }
-    } break;
-    case 2: {
-      const int input_height = input_x->dims()[0];
-      const int input_width = input_x->dims()[1];
-
-      if (has_bias) {
-        const vector<float> biases = param.Biases();
-        #pragma omp parallel for
-        for (int h = 0; h < input_height; ++h) {
-          const float *iptr = input_x_ptr + h * input_width;
-          float *optr = out_ptr + h * input_width;
-          for (int w = 0; w < input_width; ++w) {
-            optr[w] = iptr[w] * scales[w] + biases[w];
-          }
-        }
-      } else {
-        #pragma omp parallel for
-        for (int h = 0; h < input_height; ++h) {
-          const float *iptr = input_x_ptr + h * input_width;
-          float *optr = out_ptr + h * input_width;
-          for (int w = 0; w < input_width; ++w) {
-            optr[w] = iptr[w] * scales[w];
-          }
-        }
-      }
-    } break;
-    case 3: {
-      const int chan_size = input_x->dims()[0];
-      const int input_height = input_x->dims()[1];
-      const int input_width = input_x->dims()[2];
-      int size = input_width * input_height;
-
-      if (has_bias) {
-        const vector<float> biases = param.Biases();
-
-        #pragma omp parallel for
-        for (int c = 0; c < chan_size; ++c) {
-          const float *iptr = input_x_ptr + c * size;
-          float *optr = out_ptr + c * size;
-          for (int i = 0; i < size; ++i) {
-            optr[i] = iptr[i] * scales[c] + biases[c];
-          }
-        }
-      } else {
-        #pragma omp parallel for
-        for (int c = 0; c < chan_size; ++c) {
-          const float *iptr = input_x_ptr + c * size;
-          float *optr = out_ptr + c * size;
-          for (int i = 0; i < size; ++i) {
-            optr[i] = iptr[i] * scales[c];
-          }
-        }
-      }
-    } break;
-
-    case 4: {
-      const int batch_size = input_x->dims()[0];
-      const int chan_size = input_x->dims()[0];
-      const int input_height = input_x->dims()[1];
-      const int input_width = input_x->dims()[2];
-      int size = input_width * input_height;
-
-      if (has_bias) {
-        const vector<float> biases = param.Biases();
-
-        #pragma omp parallel for
-        for (int b = 0; b < batch_size; ++b) {
-          for (int c = 0; c < chan_size; ++c) {
-            const float *iptr = input_x_ptr + b * c * size;
-            float *optr = out_ptr + b * c * size;
-            for (int i = 0; i < size; ++i) {
-              optr[i] = iptr[i] * scales[c] + biases[c];
-            }
-          }
-        }
-      } else {
-        #pragma omp parallel for
-        for (int b = 0; b < batch_size; ++b) {
-          for (int c = 0; c < chan_size; ++c) {
-            const float *iptr = input_x_ptr + b * c * size;
-            float *optr = out_ptr + b * c * size;
-            for (int i = 0; i < size; ++i) {
-              optr[i] = iptr[i] * scales[c];
-            }
-          }
-        }
-      }
-    } break;
-    default:
-      break;
-  }
+  const auto input = param.InputX();
+  auto output = param.Out();
+  const float scale = param.Scale();
+  const float bias = param.Bias();
+  const float *input_data = input->data<float>();
+  float *output_data = output->mutable_data<float>();
+
+  int i = 0;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  float32x4_t vscale = vdupq_n_f32(scale);
+  float32x4_t vbias = vdupq_n_f32(bias);
+  for (; i < output->numel() - 15; i += 16) {
+    float32x4_t _in0 = vld1q_f32(input_data);
+    float32x4_t _in1 = vld1q_f32(input_data + 4);
+    float32x4_t _in2 = vld1q_f32(input_data + 8);
+    float32x4_t _in3 = vld1q_f32(input_data + 12);
+    _in0 = vmlaq_f32(vbias, vscale, _in0);
+    _in1 = vmlaq_f32(vbias, vscale, _in1);
+    _in2 = vmlaq_f32(vbias, vscale, _in2);
+    _in3 = vmlaq_f32(vbias, vscale, _in3);
+    vst1q_f32(output_data, _in0);
+    vst1q_f32(output_data + 4, _in1);
+    vst1q_f32(output_data + 8, _in2);
+    vst1q_f32(output_data + 12, _in3);
+    input_data += 16;
+    output_data += 16;
+  }
+  for (; i < output->numel() - 3; i += 4) {
+    float32x4_t _in0 = vld1q_f32(input_data);
+    _in0 = vmlaq_f32(vbias, vscale, _in0);
+    vst1q_f32(output_data, _in0);
+    input_data += 4;
+    output_data += 4;
+  }
+#endif
+  for (; i < output->numel(); ++i, ++output_data, ++input_data) {
+    *output_data = scale * (*input_data) + bias;
+  }
 }
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/arm/sequence_pool_kernel.cpp b/src/operators/kernel/arm/sequence_pool_kernel.cpp
index 352158b973..8326c55515 100644
--- a/src/operators/kernel/arm/sequence_pool_kernel.cpp
+++ b/src/operators/kernel/arm/sequence_pool_kernel.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "common/types.h"
 #include "operators/kernel/sequence_kernels.h"
 #include "operators/math/pooling.h"
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 #endif  // __ARM_NEON__
 
@@ -44,7 +44,7 @@ void SequencePoolImpl(const framework::LoDTensor &input,
     if (width == 1) {
       float max = -std::numeric_limits<float>::max();
       int remain_h = height;
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
       int loop = remain_h >> 2;
       remain_h = remain_h & 0x3;
       float32x4_t __max4 = math::vPoolInitq_f32<MAX>();
@@ -67,11 +67,11 @@ void SequencePoolImpl(const framework::LoDTensor &input,
       in_ptr += width;
       int remain_h = height - 1;
       int remain_w_start = 0;
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
       remain_w_start = width & 0xfffc;
 #endif  // __ARM_NEON__
       for (int h = 0; h < remain_h; ++h) {
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
         for (int w = 0; w < width; w += 4) {
           float32x4_t __in = vld1q_f32(in_ptr + w);
           float32x4_t __out = vld1q_f32(out_ptr + w);
@@ -104,7 +104,7 @@ void SequencePoolImpl(const framework::LoDTensor &input,
     if (width == 1) {
       float sum = 0.f;
       int remain_h = height;
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
       int loop = remain_h >> 2;
       remain_h = remain_h & 0x3;
       float32x4_t __sum4 = vdupq_n_f32(0.f);
@@ -126,12 +126,12 @@ void SequencePoolImpl(const framework::LoDTensor &input,
       in_ptr += width;
       int remain_h = height - 1;
       int remain_w_start = 0;
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
       int loop_w = width >> 2;
       remain_w_start = width & 0xfffc;
 #endif  // __ARM_NEON__
       for (int h = 0; h < remain_h; ++h) {
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
         for (int w = 0; w < width - 3; w += 4) {
           float32x4_t __in = vld1q_f32(in_ptr + w);
           float32x4_t __out = vld1q_f32(out_ptr + w);
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 6b14ef4736..ead4de0514 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1533,36 +1533,24 @@ class ScaleParam : public OpParam {
              const AttributeMap &attrs, Scope *scope)
       : OpParam(inputs, outputs, attrs, scope) {
     input_x_ = InputXFrom<GType>(inputs, *scope);
-    input_bias_ = InputBiasFrom<GType>(inputs, *scope);
     out_ = OutFrom<GType>(outputs, *scope);
-    inplace_ = GetAttr<bool>("inplace", attrs);
-    has_bias_ = GetAttr<bool>("has_bias", attrs);
-    scales_ = GetAttr<vector<float>>("scales", attrs);
-    biases_ = GetAttr<vector<float>>("biases", attrs);
+    scale_ = GetAttr<float>("scale", attrs);
+    bias_ = GetAttr<float>("bias", attrs);
   }
 
   const GType *InputX() const { return input_x_; }
 
-  const GType *InputBias() const { return input_bias_; }
-
   GType *Out() const { return out_; }
 
-  const bool &Inplace() const { return inplace_; }
-
-  const bool &HasBias() const { return has_bias_; }
+  const float Scale() const { return scale_; }
 
-  const vector<float> &Scales() const { return scales_; }
-
-  const vector<float> &Biases() const { return biases_; }
+  const float Bias() const { return bias_; }
 
  private:
   GType *input_x_;
-  GType *input_bias_;
   GType *out_;
-  bool inplace_;
-  bool has_bias_;
-  vector<float> scales_;
-  vector<float> biases_;
+  float scale_;
+  float bias_;
 };
 #endif
 
@@ -2933,8 +2921,8 @@ class QuantizeParam : public OpParam {
   // if offine scale or not
   bool offline_ = false;
   // round method type
-  RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
-  // RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
+  // RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
+  RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
 };
 #endif
-- 
GitLab
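
A quick way to sanity-check the rewritten scale kernel is against a plain scalar loop. The sketch below is illustrative only and not part of the patch: scale_vec and the buffer setup are hypothetical names local to this harness, and it mirrors the kernel's three code paths (16-wide NEON main loop, 4-wide NEON tail, scalar remainder) on raw float buffers instead of paddle_mobile tensors. Note that vmlaq_f32(vbias, vscale, v) computes vbias + vscale * v, which is exactly out = scale * in + bias.

// scale_check.cc -- standalone sketch, not part of the patch.
#include <cmath>
#include <cstdio>
#include <vector>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif

// Same loop structure as the new kernel: 16 floats per main-loop
// iteration, a 4-wide tail, then a scalar remainder.
static void scale_vec(const float *in, float *out, int n,
                      float scale, float bias) {
  int i = 0;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  float32x4_t vscale = vdupq_n_f32(scale);
  float32x4_t vbias = vdupq_n_f32(bias);
  for (; i < n - 15; i += 16) {
    for (int j = 0; j < 16; j += 4) {
      float32x4_t v = vld1q_f32(in + i + j);
      v = vmlaq_f32(vbias, vscale, v);  // vbias + vscale * v
      vst1q_f32(out + i + j, v);
    }
  }
  for (; i < n - 3; i += 4) {
    float32x4_t v = vld1q_f32(in + i);
    vst1q_f32(out + i, vmlaq_f32(vbias, vscale, v));
  }
#endif
  for (; i < n; ++i) {
    out[i] = scale * in[i] + bias;  // scalar remainder
  }
}

int main() {
  const int n = 37;  // 37 = 2*16 + 4 + 1 exercises all three paths
  const float scale = 0.5f, bias = 1.25f;
  std::vector<float> in(n), out(n);
  for (int i = 0; i < n; ++i) in[i] = 0.1f * static_cast<float>(i);
  scale_vec(in.data(), out.data(), n, scale, bias);
  for (int i = 0; i < n; ++i) {
    float ref = scale * in[i] + bias;
    if (std::fabs(out[i] - ref) > 1e-6f) {
      std::printf("mismatch at %d: %f vs %f\n", i, out[i], ref);
      return 1;
    }
  }
  std::printf("scale kernel logic OK for n=%d\n", n);
  return 0;
}

On a non-ARM host the harness still compiles and runs, exercising only the scalar loop.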
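
On the widened preprocessor guards: 32-bit ARM toolchains traditionally predefine __ARM_NEON__, while AArch64 toolchains are only guaranteed to define the ACLE spelling __ARM_NEON, so checking a single macro can silently disable the intrinsics path on arm64 builds. A minimal standalone illustration of the guard used throughout this patch (nothing here is paddle_mobile specific):

// neon_guard.cc -- prints whether the NEON intrinsics path is compiled in.
#include <cstdio>

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>  // only safe to include when either macro is set
#define HAS_NEON 1
#else
#define HAS_NEON 0
#endif

int main() {
  std::printf("NEON intrinsics available: %d\n", HAS_NEON);
  return 0;
}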