提交 a9133cf4 编写于 作者: H hjchen2

Resize feed and fetch variables before shape inference, and reimplement the scale kernel according to Fluid

上级 ef17bf2f
......@@ -56,8 +56,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
use_optimize_ ? program_.optimizeProgram : program_.originProgram;
PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
"program_desc_ should not be nullptr");
const auto &blocks = program_desc_->Blocks();
// resize feed and fetch list
// should init feed and fetch variables before infer shape
InitFeedFetchList();
const auto &blocks = program_desc_->Blocks();
std::shared_ptr<BlockDesc> block_desc = blocks[0];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) {
......@@ -79,8 +82,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
} else {
InitMemory();
}
// resize feed and fetch list
InitFeedFetchList();
#ifdef PADDLE_MOBILE_FPGA
program_.scope->EraseVars({"feed", "fetch"});
......
......@@ -15,131 +15,55 @@ limitations under the License. */
#ifdef SCALE_OP
#include "operators/kernel/scale_kernel.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
/*
* @b 特化到具体平台的实现, param 从 op 层传入
* */
template <>
void ScaleKernel<CPU, float>::Compute(const ScaleParam<CPU> &param) {
const auto *input_x = param.InputX();
auto *input_x_ptr = input_x->data<float>();
auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>();
const vector<float> scales = param.Scales();
bool has_bias = param.HasBias();
const int dim_size = input_x->dims().size();
switch (dim_size) {
case 1: {
const int input_width = input_x->dims()[0];
if (has_bias) {
const vector<float> biases = param.Biases();
#pragma omp parallel for
for (int w = 0; w < input_width; w++) {
out_ptr[w] = input_x_ptr[w] * scales[w] + biases[w];
}
} else {
#pragma omp parallel for
for (int w = 0; w < input_width; w++) {
out_ptr[w] = input_x_ptr[w] * scales[w];
}
}
} break;
case 2: {
const int input_height = input_x->dims()[0];
const int input_width = input_x->dims()[1];
if (has_bias) {
const vector<float> biases = param.Biases();
#pragma omp parallel for
for (int h = 0; h < input_height; ++h) {
const float *iptr = input_x_ptr + h * input_width;
float *optr = out_ptr + h * input_width;
for (int w = 0; w < input_width; ++w) {
optr[w] = iptr[w] * scales[w] + biases[w];
}
}
} else {
#pragma omp parallel for
for (int h = 0; h < input_height; ++h) {
const float *iptr = input_x_ptr + h * input_width;
float *optr = out_ptr + h * input_width;
for (int w = 0; w < input_width; ++w) {
optr[w] = iptr[w] * scales[w];
}
}
}
} break;
case 3: {
const int chan_size = input_x->dims()[0];
const int input_height = input_x->dims()[1];
const int input_width = input_x->dims()[2];
int size = input_width * input_height;
if (has_bias) {
const vector<float> biases = param.Biases();
#pragma omp parallel for
for (int c = 0; c < chan_size; ++c) {
const float *iptr = input_x_ptr + c * size;
float *optr = out_ptr + c * size;
for (int i = 0; i < size; ++i) {
optr[i] = iptr[i] * scales[c] + biases[c];
}
const auto input = param.InputX();
auto output = param.Out();
const float scale = param.Scale();
const float bias = param.Bias();
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int i = 0;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4_t vscale = vdupq_n_f32(scale);
float32x4_t vbias = vdupq_n_f32(bias);
for (; i < output->numel() - 15; i += 16) {
float32x4_t _in0 = vld1q_f32(input_data);
float32x4_t _in1 = vld1q_f32(input_data + 4);
float32x4_t _in2 = vld1q_f32(input_data + 8);
float32x4_t _in3 = vld1q_f32(input_data + 12);
_in0 = vmlaq_f32(vbias, vscale, _in0);
_in1 = vmlaq_f32(vbias, vscale, _in1);
_in2 = vmlaq_f32(vbias, vscale, _in2);
_in3 = vmlaq_f32(vbias, vscale, _in3);
vst1q_f32(output_data, _in0);
vst1q_f32(output_data + 4, _in1);
vst1q_f32(output_data + 8, _in2);
vst1q_f32(output_data + 12, _in3);
input_data += 16;
output_data += 16;
}
for (; i < output->numel() - 3; i += 4) {
float32x4_t _in0 = vld1q_f32(input_data);
_in0 = vmlaq_f32(vbias, vscale, _in0);
vst1q_f32(output_data, _in0);
input_data += 4;
output_data += 4;
}
} else {
#pragma omp parallel for
for (int c = 0; c < chan_size; ++c) {
const float *iptr = input_x_ptr + c * size;
float *optr = out_ptr + c * size;
for (int i = 0; i < size; ++i) {
optr[i] = iptr[i] * scales[c];
}
}
}
} break;
case 4: {
const int batch_size = input_x->dims()[0];
const int chan_size = input_x->dims()[0];
const int input_height = input_x->dims()[1];
const int input_width = input_x->dims()[2];
int size = input_width * input_height;
if (has_bias) {
const vector<float> biases = param.Biases();
#pragma omp parallel for
for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < chan_size; ++c) {
const float *iptr = input_x_ptr + b * c * size;
float *optr = out_ptr + b * c * size;
for (int i = 0; i < size; ++i) {
optr[i] = iptr[i] * scales[c] + biases[c];
}
}
}
} else {
#pragma omp parallel for
for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < chan_size; ++c) {
const float *iptr = input_x_ptr + b * c * size;
float *optr = out_ptr + b * c * size;
for (int i = 0; i < size; ++i) {
optr[i] = iptr[i] * scales[c];
}
}
}
}
} break;
default:
break;
#endif
for (; i < output->numel(); ++i, ++output_data, ++input_data) {
*output_data = scale * (*input_data) + bias;
}
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "common/types.h"
#include "operators/kernel/sequence_kernels.h"
#include "operators/math/pooling.h"
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif // __ARM_NEON__
......@@ -44,7 +44,7 @@ void SequencePoolImpl(const framework::LoDTensor &input,
if (width == 1) {
float max = -std::numeric_limits<float>::max();
int remain_h = height;
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain_h >> 2;
remain_h = remain_h & 0x3;
float32x4_t __max4 = math::vPoolInitq_f32<MAX>();
......@@ -67,11 +67,11 @@ void SequencePoolImpl(const framework::LoDTensor &input,
in_ptr += width;
int remain_h = height - 1;
int remain_w_start = 0;
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
remain_w_start = width & 0xfffc;
#endif // __ARM_NEON__
for (int h = 0; h < remain_h; ++h) {
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
for (int w = 0; w < width; w += 4) {
float32x4_t __in = vld1q_f32(in_ptr + w);
float32x4_t __out = vld1q_f32(out_ptr + w);
......@@ -104,7 +104,7 @@ void SequencePoolImpl<SUM, float>(const framework::LoDTensor &input,
if (width == 1) {
float sum = 0.f;
int remain_h = height;
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain_h >> 2;
remain_h = remain_h & 0x3;
float32x4_t __sum4 = vdupq_n_f32(0.f);
......@@ -126,12 +126,12 @@ void SequencePoolImpl<SUM, float>(const framework::LoDTensor &input,
in_ptr += width;
int remain_h = height - 1;
int remain_w_start = 0;
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop_w = width >> 2;
remain_w_start = width & 0xfffc;
#endif // __ARM_NEON__
for (int h = 0; h < remain_h; ++h) {
#ifdef __ARM_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
for (int w = 0; w < width - 3; w += 4) {
float32x4_t __in = vld1q_f32(in_ptr + w);
float32x4_t __out = vld1q_f32(out_ptr + w);
......
......@@ -1533,36 +1533,24 @@ class ScaleParam : public OpParam {
const AttributeMap &attrs, Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope);
input_bias_ = InputBiasFrom<GType>(inputs, *scope);
out_ = OutFrom<GType>(outputs, *scope);
inplace_ = GetAttr<bool>("inplace", attrs);
has_bias_ = GetAttr<bool>("has_bias", attrs);
scales_ = GetAttr<vector<float>>("scales", attrs);
biases_ = GetAttr<vector<float>>("biases", attrs);
scale_ = GetAttr<float>("scale", attrs);
bias_ = GetAttr<float>("bias", attrs);
}
const GType *InputX() const { return input_x_; }
const GType *InputBias() const { return input_bias_; }
GType *Out() const { return out_; }
const bool &Inplace() const { return inplace_; }
const bool &HasBias() const { return has_bias_; }
const float Scale() const { return scale_; }
const vector<float> &Scales() const { return scales_; }
const vector<float> &Biases() const { return biases_; }
const float Bias() const { return bias_; }
private:
GType *input_x_;
GType *input_bias_;
GType *out_;
bool inplace_;
bool has_bias_;
vector<float> scales_;
vector<float> biases_;
float scale_;
float bias_;
};
#endif
......@@ -2933,8 +2921,8 @@ class QuantizeParam : public OpParam {
// if offline scale or not
bool offline_ = false;
// round method type
RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
// RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
// RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
};
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册