From a9133cf4e82056b0dcd7225ca66bded5a05f1c03 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Thu, 14 Mar 2019 18:42:16 +0800
Subject: [PATCH] Resize feed and fetch variables before infer shape, and
 reimplement scale kernel according to fluid

---
 src/framework/executor.cpp                  |   7 +-
 src/operators/kernel/arm/scale_kernel.cpp   | 158 +++++-------------
 .../kernel/arm/sequence_pool_kernel.cpp     |  14 +-
 src/operators/op_param.h                    |  28 +---
 4 files changed, 60 insertions(+), 147 deletions(-)

diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index b5fab192aa..a2047e845a 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -56,8 +56,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
       use_optimize_ ? program_.optimizeProgram : program_.originProgram;
   PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                         "program_desc_ should not be nullptr");
-  const auto &blocks = program_desc_->Blocks();
+  // resize feed and fetch list
+  // should init feed and fetch variables before infer shape
+  InitFeedFetchList();
 
+  const auto &blocks = program_desc_->Blocks();
   std::shared_ptr<BlockDesc> block_desc = blocks[0];
   std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
   for (int j = 0; j < ops.size(); ++j) {
@@ -79,8 +82,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
   } else {
     InitMemory();
   }
-  // resize feed and fetch list
-  InitFeedFetchList();
 
 #ifdef PADDLE_MOBILE_FPGA
   program_.scope->EraseVars({"feed", "fetch"});
diff --git a/src/operators/kernel/arm/scale_kernel.cpp b/src/operators/kernel/arm/scale_kernel.cpp
index bded56275f..823f1e30cb 100644
--- a/src/operators/kernel/arm/scale_kernel.cpp
+++ b/src/operators/kernel/arm/scale_kernel.cpp
@@ -15,131 +15,55 @@ limitations under the License. */
 #ifdef SCALE_OP
 
 #include "operators/kernel/scale_kernel.h"
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
 
 namespace paddle_mobile {
 namespace operators {
 
-/*
- * @b 特化到具体平台的实现, param 从 op 层传入
- * */
 template <>
 void ScaleKernel<CPU, float>::Compute(const ScaleParam<CPU> &param) {
-  const auto *input_x = param.InputX();
-  auto *input_x_ptr = input_x->data<float>();
-  auto *out = param.Out();
-  auto *out_ptr = out->mutable_data<float>();
-
-  const vector<float> scales = param.Scales();
-  bool has_bias = param.HasBias();
-
-  const int dim_size = input_x->dims().size();
-  switch (dim_size) {
-    case 1: {
-      const int input_width = input_x->dims()[0];
-      if (has_bias) {
-        const vector<float> biases = param.Biases();
-        #pragma omp parallel for
-        for (int w = 0; w < input_width; w++) {
-          out_ptr[w] = input_x_ptr[w] * scales[w] + biases[w];
-        }
-      } else {
-        #pragma omp parallel for
-        for (int w = 0; w < input_width; w++) {
-          out_ptr[w] = input_x_ptr[w] * scales[w];
-        }
-      }
-    } break;
-    case 2: {
-      const int input_height = input_x->dims()[0];
-      const int input_width = input_x->dims()[1];
-
-      if (has_bias) {
-        const vector<float> biases = param.Biases();
-        #pragma omp parallel for
-        for (int h = 0; h < input_height; ++h) {
-          const float *iptr = input_x_ptr + h * input_width;
-          float *optr = out_ptr + h * input_width;
-          for (int w = 0; w < input_width; ++w) {
-            optr[w] = iptr[w] * scales[w] + biases[w];
-          }
-        }
-      } else {
-        #pragma omp parallel for
-        for (int h = 0; h < input_height; ++h) {
-          const float *iptr = input_x_ptr + h * input_width;
-          float *optr = out_ptr + h * input_width;
-          for (int w = 0; w < input_width; ++w) {
-            optr[w] = iptr[w] * scales[w];
-          }
-        }
-      }
-    } break;
-    case 3: {
-      const int chan_size = input_x->dims()[0];
-      const int input_height = input_x->dims()[1];
-      const int input_width = input_x->dims()[2];
-      int size = input_width * input_height;
-
-      if (has_bias) {
-        const vector<float> biases = param.Biases();
-
-        #pragma omp parallel for
-        for (int c = 0; c < chan_size; ++c) {
-          const float *iptr = input_x_ptr + c * size;
-          float *optr = out_ptr + c * size;
-          for (int i = 0; i < size; ++i) {
-            optr[i] = iptr[i] * scales[c] + biases[c];
-          }
-        }
-      } else {
-        #pragma omp parallel for
-        for (int c = 0; c < chan_size; ++c) {
-          const float *iptr = input_x_ptr + c * size;
-          float *optr = out_ptr + c * size;
-          for (int i = 0; i < size; ++i) {
-            optr[i] = iptr[i] * scales[c];
-          }
-        }
-      }
-    } break;
-
-    case 4: {
-      const int batch_size = input_x->dims()[0];
-      const int chan_size = input_x->dims()[0];
-      const int input_height = input_x->dims()[1];
-      const int input_width = input_x->dims()[2];
-      int size = input_width * input_height;
-
-      if (has_bias) {
-        const vector<float> biases = param.Biases();
-
-        #pragma omp parallel for
-        for (int b = 0; b < batch_size; ++b) {
-          for (int c = 0; c < chan_size; ++c) {
-            const float *iptr = input_x_ptr + b * c * size;
-            float *optr = out_ptr + b * c * size;
-            for (int i = 0; i < size; ++i) {
-              optr[i] = iptr[i] * scales[c] + biases[c];
-            }
-          }
-        }
-      } else {
-        #pragma omp parallel for
-        for (int b = 0; b < batch_size; ++b) {
-          for (int c = 0; c < chan_size; ++c) {
-            const float *iptr = input_x_ptr + b * c * size;
-            float *optr = out_ptr + b * c * size;
-            for (int i = 0; i < size; ++i) {
-              optr[i] = iptr[i] * scales[c];
-            }
-          }
-        }
-      }
-    } break;
-    default:
-      break;
-  }
+  const auto input = param.InputX();
+  auto output = param.Out();
+  const float scale = param.Scale();
+  const float bias = param.Bias();
+  const float *input_data = input->data<float>();
+  float *output_data = output->mutable_data<float>();
+
+  int i = 0;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  float32x4_t vscale = vdupq_n_f32(scale);
+  float32x4_t vbias = vdupq_n_f32(bias);
+  for (; i < output->numel() - 15; i += 16) {
+    float32x4_t _in0 = vld1q_f32(input_data);
+    float32x4_t _in1 = vld1q_f32(input_data + 4);
+    float32x4_t _in2 = vld1q_f32(input_data + 8);
+    float32x4_t _in3 = vld1q_f32(input_data + 12);
+    _in0 = vmlaq_f32(vbias, vscale, _in0);
+    _in1 = vmlaq_f32(vbias, vscale, _in1);
+    _in2 = vmlaq_f32(vbias, vscale, _in2);
+    _in3 = vmlaq_f32(vbias, vscale, _in3);
+    vst1q_f32(output_data, _in0);
+    vst1q_f32(output_data + 4, _in1);
+    vst1q_f32(output_data + 8, _in2);
+    vst1q_f32(output_data + 12, _in3);
+    input_data += 16;
+    output_data += 16;
+  }
+  for (; i < output->numel() - 3; i += 4) {
+    float32x4_t _in0 = vld1q_f32(input_data);
+    _in0 = vmlaq_f32(vbias, vscale, _in0);
+    vst1q_f32(output_data, _in0);
+    input_data += 4;
+    output_data += 4;
+  }
+#endif
+  for (; i < output->numel(); ++i, ++output_data, ++input_data) {
+    *output_data = scale * (*input_data) + bias;
+  }
 }
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/arm/sequence_pool_kernel.cpp b/src/operators/kernel/arm/sequence_pool_kernel.cpp
index 352158b973..8326c55515 100644
--- a/src/operators/kernel/arm/sequence_pool_kernel.cpp
+++ b/src/operators/kernel/arm/sequence_pool_kernel.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "common/types.h"
 #include "operators/kernel/sequence_kernels.h"
 #include "operators/math/pooling.h"
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 #endif  // __ARM_NEON__
 
@@ -44,7 +44,7 @@ void SequencePoolImpl(const framework::LoDTensor &input,
     if (width == 1) {
       float max = -std::numeric_limits<float>::max();
       int remain_h = height;
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
       int loop = remain_h >> 2;
       remain_h = remain_h & 0x3;
       float32x4_t __max4 = math::vPoolInitq_f32<MAX>();
@@ -67,11 +67,11 @@ void SequencePoolImpl(const framework::LoDTensor &input,
       in_ptr += width;
       int remain_h = height - 1;
       int remain_w_start = 0;
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
       remain_w_start = width & 0xfffc;
 #endif  // __ARM_NEON__
       for (int h = 0; h < remain_h; ++h) {
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
         for (int w = 0; w < width; w += 4) {
           float32x4_t __in = vld1q_f32(in_ptr + w);
           float32x4_t __out = vld1q_f32(out_ptr + w);
@@ -104,7 +104,7 @@ void SequencePoolImpl(const framework::LoDTensor &input,
     if (width == 1) {
       float sum = 0.f;
       int remain_h = height;
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
       int loop = remain_h >> 2;
       remain_h = remain_h & 0x3;
       float32x4_t __sum4 = vdupq_n_f32(0.f);
@@ -126,12 +126,12 @@ void SequencePoolImpl(const framework::LoDTensor &input,
       in_ptr += width;
       int remain_h = height - 1;
       int remain_w_start = 0;
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
       int loop_w = width >> 2;
       remain_w_start = width & 0xfffc;
 #endif  // __ARM_NEON__
       for (int h = 0; h < remain_h; ++h) {
-#ifdef __ARM_NEON__
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
         for (int w = 0; w < width - 3; w += 4) {
           float32x4_t __in = vld1q_f32(in_ptr + w);
           float32x4_t __out = vld1q_f32(out_ptr + w);
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 6b14ef4736..ead4de0514 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1533,36 +1533,24 @@ class ScaleParam : public OpParam {
              const AttributeMap &attrs, Scope *scope)
       : OpParam(inputs, outputs, attrs, scope) {
     input_x_ = InputXFrom<GType>(inputs, *scope);
-    input_bias_ = InputBiasFrom<GType>(inputs, *scope);
     out_ = OutFrom<GType>(outputs, *scope);
-    inplace_ = GetAttr<bool>("inplace", attrs);
-    has_bias_ = GetAttr<bool>("has_bias", attrs);
-    scales_ = GetAttr<vector<float>>("scales", attrs);
-    biases_ = GetAttr<vector<float>>("biases", attrs);
+    scale_ = GetAttr<float>("scale", attrs);
+    bias_ = GetAttr<float>("bias", attrs);
   }
 
   const GType *InputX() const { return input_x_; }
 
-  const GType *InputBias() const { return input_bias_; }
-
   GType *Out() const { return out_; }
 
-  const bool &Inplace() const { return inplace_; }
-
-  const bool &HasBias() const { return has_bias_; }
+  const float Scale() const { return scale_; }
 
-  const vector<float> &Scales() const { return scales_; }
-
-  const vector<float> &Biases() const { return biases_; }
+  const float Bias() const { return bias_; }
 
  private:
   GType *input_x_;
-  GType *input_bias_;
   GType *out_;
-  bool inplace_;
-  bool has_bias_;
-  vector<float> scales_;
-  vector<float> biases_;
+  float scale_;
+  float bias_;
 };
 #endif
 
@@ -2933,8 +2921,8 @@ class QuantizeParam : public OpParam {
   // if offine scale or not
   bool offline_ = false;
   // round method type
-  RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
-  // RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
+  // RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
+  RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
 };
 #endif
-- 
GitLab
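
A quick way to sanity-check the rewritten scale kernel is against a plain scalar loop. The sketch below is illustrative only and not part of the patch: scale_vec and the buffer setup are hypothetical names local to this harness, and it mirrors the kernel's three code paths (16-wide NEON main loop, 4-wide NEON tail, scalar remainder) on raw float buffers instead of paddle_mobile tensors. Note that vmlaq_f32(vbias, vscale, v) computes vbias + vscale * v, which is exactly out = scale * in + bias.

// scale_check.cc -- standalone sketch, not part of the patch.
#include <cmath>
#include <cstdio>
#include <vector>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif

// Same loop structure as the new kernel: 16 floats per main-loop
// iteration, a 4-wide tail, then a scalar remainder.
static void scale_vec(const float *in, float *out, int n,
                      float scale, float bias) {
  int i = 0;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  float32x4_t vscale = vdupq_n_f32(scale);
  float32x4_t vbias = vdupq_n_f32(bias);
  for (; i < n - 15; i += 16) {
    for (int j = 0; j < 16; j += 4) {
      float32x4_t v = vld1q_f32(in + i + j);
      v = vmlaq_f32(vbias, vscale, v);  // vbias + vscale * v
      vst1q_f32(out + i + j, v);
    }
  }
  for (; i < n - 3; i += 4) {
    float32x4_t v = vld1q_f32(in + i);
    vst1q_f32(out + i, vmlaq_f32(vbias, vscale, v));
  }
#endif
  for (; i < n; ++i) {
    out[i] = scale * in[i] + bias;  // scalar remainder
  }
}

int main() {
  const int n = 37;  // 37 = 2*16 + 4 + 1 exercises all three paths
  const float scale = 0.5f, bias = 1.25f;
  std::vector<float> in(n), out(n);
  for (int i = 0; i < n; ++i) in[i] = 0.1f * static_cast<float>(i);
  scale_vec(in.data(), out.data(), n, scale, bias);
  for (int i = 0; i < n; ++i) {
    float ref = scale * in[i] + bias;
    if (std::fabs(out[i] - ref) > 1e-6f) {
      std::printf("mismatch at %d: %f vs %f\n", i, out[i], ref);
      return 1;
    }
  }
  std::printf("scale kernel logic OK for n=%d\n", n);
  return 0;
}

On a non-ARM host the harness still compiles and runs, exercising only the scalar loop.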
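
On the widened preprocessor guards: 32-bit ARM toolchains traditionally predefine __ARM_NEON__, while AArch64 toolchains are only guaranteed to define the ACLE spelling __ARM_NEON, so checking a single macro can silently disable the intrinsics path on arm64 builds. A minimal standalone illustration of the guard used throughout this patch (nothing here is paddle_mobile specific):

// neon_guard.cc -- prints whether the NEON intrinsics path is compiled in.
#include <cstdio>

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>  // only safe to include when either macro is set
#define HAS_NEON 1
#else
#define HAS_NEON 0
#endif

int main() {
  std::printf("NEON intrinsics available: %d\n", HAS_NEON);
  return 0;
}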