Commit a9133cf4 authored by H hjchen2

Resize feed and fetch variables before infer shape, and reimplement the scale kernel according to fluid (a sketch of the new scale semantics follows below).

Parent ef17bf2f
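For context: the reimplemented scale kernel follows fluid's scale op semantics, i.e. a single scalar scale and bias applied elementwise (out[i] = scale * x[i] + bias), replacing the old per-channel scales/biases vectors. A minimal reference sketch of that computation, for illustration only (ScaleReference is a hypothetical helper, not part of this commit):

#include <cstddef>

// Reference semantics of the reworked scale op: out = scale * x + bias, elementwise.
static void ScaleReference(const float *x, float *out, std::size_t n, float scale, float bias) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = scale * x[i] + bias;
  }
}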
......@@ -56,8 +56,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
use_optimize_ ? program_.optimizeProgram : program_.originProgram;
PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
"program_desc_ should not be nullptr");
const auto &blocks = program_desc_->Blocks();
// resize feed and fetch list
// should init feed and fetch variables before infer shape
InitFeedFetchList();
const auto &blocks = program_desc_->Blocks();
std::shared_ptr<BlockDesc> block_desc = blocks[0];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) {
......@@ -79,8 +82,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
} else {
InitMemory();
}
// resize feed and fetch list
InitFeedFetchList();
#ifdef PADDLE_MOBILE_FPGA
program_.scope->EraseVars({"feed", "fetch"});
......
......@@ -15,131 +15,55 @@ limitations under the License. */
#ifdef SCALE_OP
#include "operators/kernel/scale_kernel.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
/*
 * @b Platform-specific (specialized) implementation; param is passed in from the op layer.
 */
template <>
void ScaleKernel<CPU, float>::Compute(const ScaleParam<CPU> &param) {
const auto *input_x = param.InputX();
auto *input_x_ptr = input_x->data<float>();
auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>();
const vector<float> scales = param.Scales();
bool has_bias = param.HasBias();
const int dim_size = input_x->dims().size();
switch (dim_size) {
case 1: {
const int input_width = input_x->dims()[0];
if (has_bias) {
const vector<float> biases = param.Biases();
#pragma omp parallel for
for (int w = 0; w < input_width; w++) {
out_ptr[w] = input_x_ptr[w] * scales[w] + biases[w];
}
} else {
#pragma omp parallel for
for (int w = 0; w < input_width; w++) {
out_ptr[w] = input_x_ptr[w] * scales[w];
}
}
} break;
case 2: {
const int input_height = input_x->dims()[0];
const int input_width = input_x->dims()[1];
if (has_bias) {
const vector<float> biases = param.Biases();
#pragma omp parallel for
for (int h = 0; h < input_height; ++h) {
const float *iptr = input_x_ptr + h * input_width;
float *optr = out_ptr + h * input_width;
for (int w = 0; w < input_width; ++w) {
optr[w] = iptr[w] * scales[w] + biases[w];
}
}
} else {
#pragma omp parallel for
for (int h = 0; h < input_height; ++h) {
const float *iptr = input_x_ptr + h * input_width;
float *optr = out_ptr + h * input_width;
for (int w = 0; w < input_width; ++w) {
optr[w] = iptr[w] * scales[w];
}
}
}
} break;
case 3: {
const int chan_size = input_x->dims()[0];
const int input_height = input_x->dims()[1];
const int input_width = input_x->dims()[2];
int size = input_width * input_height;
if (has_bias) {
const vector<float> biases = param.Biases();
#pragma omp parallel for
for (int c = 0; c < chan_size; ++c) {
const float *iptr = input_x_ptr + c * size;
float *optr = out_ptr + c * size;
for (int i = 0; i < size; ++i) {
optr[i] = iptr[i] * scales[c] + biases[c];
}
}
} else {
#pragma omp parallel for
for (int c = 0; c < chan_size; ++c) {
const float *iptr = input_x_ptr + c * size;
float *optr = out_ptr + c * size;
for (int i = 0; i < size; ++i) {
optr[i] = iptr[i] * scales[c];
}
}
}
} break;
case 4: {
const int batch_size = input_x->dims()[0];
const int chan_size = input_x->dims()[0];
const int input_height = input_x->dims()[1];
const int input_width = input_x->dims()[2];
int size = input_width * input_height;
if (has_bias) {
const vector<float> biases = param.Biases();
#pragma omp parallel for
for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < chan_size; ++c) {
const float *iptr = input_x_ptr + b * c * size;
float *optr = out_ptr + b * c * size;
for (int i = 0; i < size; ++i) {
optr[i] = iptr[i] * scales[c] + biases[c];
}
}
}
} else {
#pragma omp parallel for
for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < chan_size; ++c) {
const float *iptr = input_x_ptr + b * c * size;
float *optr = out_ptr + b * c * size;
for (int i = 0; i < size; ++i) {
optr[i] = iptr[i] * scales[c];
}
}
}
}
} break;
default:
break;
const auto input = param.InputX();
auto output = param.Out();
const float scale = param.Scale();
const float bias = param.Bias();
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int i = 0;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4_t vscale = vdupq_n_f32(scale);
float32x4_t vbias = vdupq_n_f32(bias);
for (; i < output->numel() - 15; i += 16) {
float32x4_t _in0 = vld1q_f32(input_data);
float32x4_t _in1 = vld1q_f32(input_data + 4);
float32x4_t _in2 = vld1q_f32(input_data + 8);
float32x4_t _in3 = vld1q_f32(input_data + 12);
_in0 = vmlaq_f32(vbias, vscale, _in0);
_in1 = vmlaq_f32(vbias, vscale, _in1);
_in2 = vmlaq_f32(vbias, vscale, _in2);
_in3 = vmlaq_f32(vbias, vscale, _in3);
vst1q_f32(output_data, _in0);
vst1q_f32(output_data + 4, _in1);
vst1q_f32(output_data + 8, _in2);
vst1q_f32(output_data + 12, _in3);
input_data += 16;
output_data += 16;
}
for (; i < output->numel() - 3; i += 4) {
float32x4_t _in0 = vld1q_f32(input_data);
_in0 = vmlaq_f32(vbias, vscale, _in0);
vst1q_f32(output_data, _in0);
input_data += 4;
output_data += 4;
}
#endif
for (; i < output->numel(); ++i, ++output_data, ++input_data) {
*output_data = scale * (*input_data) + bias;
}
}
} // namespace operators
} // namespace paddle_mobile
......
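A note on the NEON path in scale_kernel.cpp above: vmlaq_f32(a, b, c) computes a + b * c per lane, so _in = vmlaq_f32(vbias, vscale, _in) evaluates scale * x + bias for four floats at once. The kernel unrolls this by four vectors (16 floats per iteration), then drops to a 4-wide loop, and finishes any remaining elements with the scalar loop. A standalone sketch of the same pattern, assuming an ARM NEON target (ScaleNeonSketch is a hypothetical helper, not part of this commit):

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>

// Hypothetical helper: y[i] = scale * x[i] + bias, vectorized 4 floats at a time.
static void ScaleNeonSketch(const float *x, float *y, int n, float scale, float bias) {
  const float32x4_t vscale = vdupq_n_f32(scale);
  const float32x4_t vbias = vdupq_n_f32(bias);
  int i = 0;
  for (; i < n - 3; i += 4) {
    float32x4_t v = vld1q_f32(x + i);
    v = vmlaq_f32(vbias, vscale, v);  // vbias + vscale * v
    vst1q_f32(y + i, v);
  }
  for (; i < n; ++i) {
    y[i] = scale * x[i] + bias;  // scalar tail for the last 0-3 elements
  }
}
#endif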
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "common/types.h"
#include "operators/kernel/sequence_kernels.h"
#include "operators/math/pooling.h"
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif // __ARM_NEON__
......@@ -44,7 +44,7 @@ void SequencePoolImpl(const framework::LoDTensor &input,
if (width == 1) {
float max = -std::numeric_limits<float>::max();
int remain_h = height;
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain_h >> 2;
remain_h = remain_h & 0x3;
float32x4_t __max4 = math::vPoolInitq_f32<MAX>();
......@@ -67,11 +67,11 @@ void SequencePoolImpl(const framework::LoDTensor &input,
in_ptr += width;
int remain_h = height - 1;
int remain_w_start = 0;
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
remain_w_start = width & 0xfffc;
#endif // __ARM_NEON__
for (int h = 0; h < remain_h; ++h) {
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
for (int w = 0; w < width; w += 4) {
float32x4_t __in = vld1q_f32(in_ptr + w);
float32x4_t __out = vld1q_f32(out_ptr + w);
......@@ -104,7 +104,7 @@ void SequencePoolImpl<SUM, float>(const framework::LoDTensor &input,
if (width == 1) {
float sum = 0.f;
int remain_h = height;
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain_h >> 2;
remain_h = remain_h & 0x3;
float32x4_t __sum4 = vdupq_n_f32(0.f);
......@@ -126,12 +126,12 @@ void SequencePoolImpl<SUM, float>(const framework::LoDTensor &input,
in_ptr += width;
int remain_h = height - 1;
int remain_w_start = 0;
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop_w = width >> 2;
remain_w_start = width & 0xfffc;
#endif // __ARM_NEON__
for (int h = 0; h < remain_h; ++h) {
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
for (int w = 0; w < width - 3; w += 4) {
float32x4_t __in = vld1q_f32(in_ptr + w);
float32x4_t __out = vld1q_f32(out_ptr + w);
......
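On the guard change repeated throughout the sequence pool kernels: some toolchains (notably on 64-bit ARM targets) define __ARM_NEON but not __ARM_NEON__, so checking both macros keeps the intrinsic paths enabled on both 32-bit and 64-bit builds. The pattern in isolation, for illustration only (HAS_NEON is a hypothetical flag, not used by the code above):

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>  // intrinsics header, valid for both 32-bit and 64-bit ARM
#define HAS_NEON 1
#else
#define HAS_NEON 0
#endif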
......@@ -1533,36 +1533,24 @@ class ScaleParam : public OpParam {
const AttributeMap &attrs, Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope);
input_bias_ = InputBiasFrom<GType>(inputs, *scope);
out_ = OutFrom<GType>(outputs, *scope);
inplace_ = GetAttr<bool>("inplace", attrs);
has_bias_ = GetAttr<bool>("has_bias", attrs);
scales_ = GetAttr<vector<float>>("scales", attrs);
biases_ = GetAttr<vector<float>>("biases", attrs);
scale_ = GetAttr<float>("scale", attrs);
bias_ = GetAttr<float>("bias", attrs);
}
const GType *InputX() const { return input_x_; }
const GType *InputBias() const { return input_bias_; }
GType *Out() const { return out_; }
const bool &Inplace() const { return inplace_; }
const bool &HasBias() const { return has_bias_; }
const float Scale() const { return scale_; }
const vector<float> &Scales() const { return scales_; }
const vector<float> &Biases() const { return biases_; }
const float Bias() const { return bias_; }
private:
GType *input_x_;
GType *input_bias_;
GType *out_;
bool inplace_;
bool has_bias_;
vector<float> scales_;
vector<float> biases_;
float scale_;
float bias_;
};
#endif
......@@ -2933,8 +2921,8 @@ class QuantizeParam : public OpParam {
// if offine scale or not
bool offline_ = false;
// round method type
RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
// RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
// RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
};
#endif
......