提交 f6a7712f 编写于 作者: H hjchen2

Refactor dequant fusion kernels to support more fusion patterns

上级 f163d146
......@@ -72,6 +72,7 @@ const char *G_OP_TYPE_SUM = "sum";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
const char *G_OP_TYPE_FUSION_DEQUANT_BN = "fusion_dequant_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN = "fusion_dequant_add_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU = "fusion_dequant_bn_relu";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu";
......@@ -144,7 +145,8 @@ std::unordered_map<
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Y"}}},
{G_OP_TYPE_FUSION_DEQUANT_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT,
......
......@@ -87,10 +87,19 @@ enum PMStatus {
};
enum RoundType {
ROUND_UNK = 0,
ROUND_NEAREST_AWAY_ZERO = 1,
ROUND_NEAREST_TOWARDS_ZERO = 2,
ROUND_NEAREST_TO_EVEN = 3
ROUND_NEAREST_AWAY_ZERO = 0,
ROUND_NEAREST_TOWARDS_ZERO = 1,
ROUND_NEAREST_TO_EVEN = 2,
};
// Activation kinds that can be fused into quantization-aware kernels.
// Used as a non-type template parameter so the activation is selected at
// compile time (e.g. DequantBNCompute<Relu>).
enum ActivationType {
  Linear = 0,  // identity: no activation applied
  Relu = 1,    // max(x, 0)
  Relu6 = 2,   // min(max(x, 0), 6)
  PRelu = 3,
  LeakyRelu = 4,
  Tanh = 5,
  Sigmoid = 6,
};
extern const char *G_OP_TYPE_CONV;
......@@ -139,6 +148,7 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU;
......
......@@ -234,6 +234,10 @@ LOAD_OP1(quantize, CPU);
#ifdef DEQUANT_OP
LOAD_OP1(dequantize, CPU);
#endif
#ifdef FUSION_DEQUANT_BN_OP
LOAD_OP1(fusion_dequant_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_bn);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_OP
LOAD_OP1(fusion_dequant_add_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn);
......
......@@ -220,7 +220,16 @@ void Node::Folder(
}
} else {
for (auto &op_output : this->op_desc_->outputs_) {
op_desc->outputs_.emplace(op_output.first, op_output.second);
auto output_key = op_output.first;
if (change->find(this->type_) != change->end()) {
const auto change_pairs = (*change)[this->type_];
for (const auto &target : change_pairs) {
if (target.first == output_key) {
output_key = target.second;
}
}
}
op_desc->outputs_.emplace(output_key, op_output.second);
}
for (auto &output : this->outputs_) {
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_add_bn_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -43,7 +43,8 @@ class FusionDequantAddBNMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -44,7 +44,8 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......@@ -54,7 +55,7 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
template <typename DeviceType, typename T>
class FusionDequantAddBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>> {
public:
FusionDequantAddBNReluOp(const std::string &type,
......@@ -63,7 +64,7 @@ class FusionDequantAddBNReluOp
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -44,7 +44,8 @@ class FusionDequantAddBNReluQuantMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......@@ -90,7 +91,8 @@ class FusionDequantAddBNQuantMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
......@@ -12,28 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_BN_RELU_OP
#include "operators/fusion_dequant_bn_relu_op.h"
#include "operators/fusion_dequant_bn_op.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_BN_OP
// Dequantize + batchnorm is an elementwise transform, so the output
// tensor always has exactly the input's shape.
template <typename Dtype, typename T>
void FusionDequantBNOp<Dtype, T>::InferShape() const {
  this->param_.output_->Resize(this->param_.input_->dims());
}
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
// Dequantize + batchnorm + relu keeps the element count and layout, so
// the output shape mirrors the input shape.
template <typename Dtype, typename T>
void FusionDequantBNReluOp<Dtype, T>::InferShape() const {
  this->param_.output_->Resize(this->param_.input_->dims());
}
#endif // FUSION_DEQUANT_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef FUSION_DEQUANT_BN_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn, ops::FusionDequantBNMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn, ops::FusionDequantBNOp);
#endif // PADDLE_MOBILE_CPU
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu,
ops::FusionDequantBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp);
#endif
#endif
#endif // PADDLE_MOBILE_CPU
#endif // FUSION_DEQUANT_BN_RELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_BN_RELU_OP)
// Matches the subgraph `dequantize -> batch_norm` and folds it into a
// single fusion_dequant_bn operator.
class FusionDequantBNMatcher : public framework::FusionOpMatcher {
 public:
  FusionDequantBNMatcher() {
    // Pattern root is the dequantize node, with batch_norm as its successor.
    node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
  }

  virtual void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    // Rename the batch_norm inputs (Scale/Mean/Bias/Variance -> BN*) and its
    // "Y" output to "Out" so they match the fused op's variable names.
    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_BATCHNORM,
                   {{"Scale", "BNScale"},
                    {"Mean", "BNMean"},
                    {"Bias", "BNBias"},
                    {"Variance", "BNVariance"},
                    {"Y", "Out"}}}},
                 removed_nodes);
  }

  std::string Type() override { return G_OP_TYPE_FUSION_DEQUANT_BN; }
};
#endif // FUSION_DEQUANT_BN_OP || FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_BN_OP
// Operator wrapping the fused dequantize + batch norm computation; the
// actual work is dispatched to FusionDequantBNKernel.
template <typename DeviceType, typename T>
class FusionDequantBNOp : public framework::OperatorWithKernel<
                              DeviceType, FusionDequantBNParam<DeviceType>,
                              operators::FusionDequantBNKernel<DeviceType, T>> {
 public:
  FusionDequantBNOp(const std::string &type, const VariableNameMap &inputs,
                    const VariableNameMap &outputs,
                    const framework::AttributeMap &attrs,
                    std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDequantBNParam<DeviceType>,
            operators::FusionDequantBNKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  // inference output shape (same as input; see InferShape definition)
  void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
class FusionDequantBNReluMatcher : public FusionDequantBNMatcher {
public:
FusionDequantBNReluMatcher() : FusionDequantBNMatcher() {
node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
virtual std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_BN_RELU; }
};
// Operator for fused dequantize + batch norm + relu. Note it shares
// FusionDequantBNParam with the non-relu variant; only the kernel differs.
template <typename DeviceType, typename T>
class FusionDequantBNReluOp
    : public framework::OperatorWithKernel<
          DeviceType, FusionDequantBNParam<DeviceType>,
          operators::FusionDequantBNReluKernel<DeviceType, T>> {
 public:
  FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
                        const VariableNameMap &outputs,
                        const framework::AttributeMap &attrs,
                        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDequantBNParam<DeviceType>,
            operators::FusionDequantBNReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  // inference output shape (same as input; see InferShape definition)
  void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
......@@ -42,7 +42,8 @@ class FusionDequantBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "operators/kernel/dequant_add_bn_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
// Folds the elementwise-add bias and the batchnorm statistics into the
// per-channel bn_scale_/bn_bias_ tensors so Compute() only needs one
// multiply-add per element:
//   scale'[c] = gamma[c] / sqrt(var[c] + epsilon)
//   bias'[c]  = scale'[c] * (add_bias[c] - mean[c]) + beta[c]
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
    FusionDequantAddBNParam<CPU> *param) {
  // elementwise add params
  const Tensor *bias = param->bias_;
  // batch norm params
  const Tensor *bn_mean = param->bn_mean_;
  const Tensor *bn_variance = param->bn_variance_;
  Tensor *bn_scale = param->bn_scale_;
  Tensor *bn_bias = param->bn_bias_;
  const float epsilon = param->epsilon_;

  const float *bias_ptr = bias->data<float>();
  const float *mean_ptr = bn_mean->data<float>();
  const float *var_ptr = bn_variance->data<float>();
  // bn_scale/bn_bias are rewritten in place with the folded values.
  float *bn_scale_ptr = bn_scale->mutable_data<float>();
  float *bn_bias_ptr = bn_bias->mutable_data<float>();
  for (int c = 0; c < bn_scale->numel(); ++c) {
    float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
    bn_scale_ptr[c] = inv_scale;
    bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
  }
  return true;
}
// Computes output = bn_scale * (dequant_scale * input) + bn_bias, i.e.
// dequantizes the int32 input back to float and applies the batchnorm
// (with the elementwise-add bias already folded in by Init()).
// Assumes a 4-D input where dims are {batch, channel, height, width}.
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
    const FusionDequantAddBNParam<CPU> &param) {
  const int32_t *input = param.input_->data<int32_t>();
  // Per-channel scale/bias prepared by Init().
  const float *bn_scale = param.bn_scale_->data<float>();
  const float *bn_bias = param.bn_bias_->data<float>();
  // dequantize params
  const float activation_scale = param.activation_scale_->data<float>()[0];
  const float weight_scale = param.weight_scale_;
  const float dequant_scale = activation_scale / weight_scale;

  float *output = param.output_->mutable_data<float>();
  int batch_size = param.input_->dims()[0];
  int channels = param.input_->dims()[1];
  size_t spatial_size = param.input_->dims()[2] * param.input_->dims()[3];

// Each (batch, channel) plane is independent, so both outer loops can be
// collapsed and parallelized together.
#pragma omp parallel for collapse(2)
  for (int batch = 0; batch < batch_size; ++batch) {
    for (int c = 0; c < channels; ++c) {
      // not fuse bn and dequant scale to minimize precision difference
      // float scale = bn_scale[c] * dequant_scale;
      float scale = bn_scale[c];
      float bias = bn_bias[c];
      size_t offset = (batch * channels + c) * spatial_size;
      const int32_t *x = input + offset;
      float *y = output + offset;
      size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      // Vectorized path: 16 elements (4 x float32x4) per iteration.
      int loop = spatial_size >> 4;
      remain = spatial_size & 0xF;
      float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
      float32x4_t __scale = vdupq_n_f32(scale);
      float32x4_t __bias = vdupq_n_f32(bias);
      for (int k = 0; k < loop; ++k, x += 16, y += 16) {
        int32x4_t r0 = vld1q_s32(x);
        int32x4_t r1 = vld1q_s32(x + 4);
        int32x4_t r2 = vld1q_s32(x + 8);
        int32x4_t r3 = vld1q_s32(x + 12);
        // int32 -> float32
        float32x4_t f0 = vcvtq_f32_s32(r0);
        float32x4_t f1 = vcvtq_f32_s32(r1);
        float32x4_t f2 = vcvtq_f32_s32(r2);
        float32x4_t f3 = vcvtq_f32_s32(r3);
        // dequantize
        f0 = vmulq_f32(__dequant_scale, f0);
        f1 = vmulq_f32(__dequant_scale, f1);
        f2 = vmulq_f32(__dequant_scale, f2);
        f3 = vmulq_f32(__dequant_scale, f3);
        // batchnorm: scale * x + bias (fused multiply-accumulate)
        f0 = vmlaq_f32(__bias, __scale, f0);
        f1 = vmlaq_f32(__bias, __scale, f1);
        f2 = vmlaq_f32(__bias, __scale, f2);
        f3 = vmlaq_f32(__bias, __scale, f3);
        vst1q_f32(y, f0);
        vst1q_f32(y + 4, f1);
        vst1q_f32(y + 8, f2);
        vst1q_f32(y + 12, f3);
      }
#endif  // __ARM_NEON__
      // Scalar tail (or the full range when NEON is unavailable).
      for (int k = 0; k < remain; ++k) {
        y[k] = scale * (dequant_scale * x[k]) + bias;
      }
    }
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_DEQUANT_ADD_BN_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include <cmath>
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/math/activation.h"
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
......@@ -22,8 +23,10 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
void PublicFusionDequantBNInitParam(FusionDequantBNParam<CPU> *param,
const framework::Tensor *bias) {
......@@ -39,16 +42,20 @@ void PublicFusionDequantBNInitParam(FusionDequantBNParam<CPU> *param,
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
float inv_scale = 1.f / (std::sqrt(var_ptr[c] + epsilon));
float val = bias ? bias->data<float>()[c] : 0;
bn_bias_ptr[c] = inv_scale * (val - mean_ptr[c]) + bn_bias_ptr[c];
bn_bias_ptr[c] =
inv_scale * bn_scale_ptr[c] * (val - mean_ptr[c]) + bn_bias_ptr[c];
bn_scale_ptr[c] = inv_scale * bn_scale_ptr[c];
}
}
#endif
#if defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
template <ActivationType Act>
void DequantBNCompute(const FusionDequantBNParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
......@@ -79,7 +86,6 @@ void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
float32x4_t __zero = vdupq_n_f32(0.f);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
......@@ -97,10 +103,10 @@ void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = vmaxq_f32(__zero, f0);
f1 = vmaxq_f32(__zero, f1);
f2 = vmaxq_f32(__zero, f2);
f3 = vmaxq_f32(__zero, f3);
f0 = math::vActiveq_f32(f0);
f1 = math::vActiveq_f32(f1);
f2 = math::vActiveq_f32(f2);
f3 = math::vActiveq_f32(f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
......@@ -108,32 +114,62 @@ void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = std::max(scale * (dequant_scale * x[k]) + bias, 0.f);
y[k] = math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
}
}
}
}
#endif
#ifdef FUSION_DEQUANT_BN_OP
// Folds the batchnorm statistics into bn_scale_/bn_bias_ once, ahead of
// Compute(). No elementwise-add bias is fused here, hence nullptr.
template <>
bool FusionDequantBNKernel<CPU, float>::Init(FusionDequantBNParam<CPU> *param) {
  PublicFusionDequantBNInitParam(param, nullptr);
  return true;
}
// No activation is fused in this variant, so run the shared compute path
// with the Linear (identity) activation.
template <>
void FusionDequantBNKernel<CPU, float>::Compute(
    const FusionDequantBNParam<CPU> &param) {
  DequantBNCompute<Linear>(&param);
}
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <>
bool FusionDequantBNReluKernel<CPU, float>::Init(
FusionDequantBNReluParam<CPU> *param) {
FusionDequantBNParam<CPU> *param) {
PublicFusionDequantBNInitParam(param, nullptr);
return true;
}
template <>
void FusionDequantBNReluKernel<CPU, float>::Compute(
const FusionDequantBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
const FusionDequantBNParam<CPU> &param) {
DequantBNCompute<Relu>(&param);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_OP
// Folds both the elementwise-add bias and the batchnorm statistics into
// the per-channel scale/bias consumed by Compute().
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
    FusionDequantAddBNParam<CPU> *param) {
  PublicFusionDequantBNInitParam(param, param->bias_);
  return true;
}
// The add is already folded into bn_bias_ by Init(), and no activation is
// fused, so the Linear compute path suffices.
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
    const FusionDequantAddBNParam<CPU> &param) {
  DequantBNCompute<Linear>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <>
bool FusionDequantAddBNReluKernel<CPU, float>::Init(
FusionDequantAddBNReluParam<CPU> *param) {
FusionDequantAddBNParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
......@@ -141,13 +177,14 @@ bool FusionDequantAddBNReluKernel<CPU, float>::Init(
template <>
void FusionDequantAddBNReluKernel<CPU, float>::Compute(
const FusionDequantAddBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
const FusionDequantAddBNParam<CPU> &param) {
DequantBNCompute<Relu>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
template <RoundType R>
#if defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
template <Activation Act, RoundType R>
void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
......@@ -167,9 +204,8 @@ void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
// if (param->is_static_) {
if (true) {
max_abs = param->offline_scale_->data<float>()[0];
max_abs = param->static_scale_;
float quant_scale = 127.f / max_abs;
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
......@@ -205,14 +241,18 @@ void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = math::vActiveq_f32(f0);
f1 = math::vActiveq_f32(f1);
f2 = math::vActiveq_f32(f2);
f3 = math::vActiveq_f32(f3);
f0 = vmulq_f32(__quant_scale, f0);
f1 = vmulq_f32(__quant_scale, f1);
f2 = vmulq_f32(__quant_scale, f2);
f3 = vmulq_f32(__quant_scale, f3);
int32x4_t q0 = math::vround_f32<R>(f0);
int32x4_t q1 = math::vround_f32<R>(f1);
int32x4_t q2 = math::vround_f32<R>(f2);
int32x4_t q3 = math::vround_f32<R>(f3);
int32x4_t q0 = math::vRoundq_f32<R>(f0);
int32x4_t q1 = math::vRoundq_f32<R>(f1);
int32x4_t q2 = math::vRoundq_f32<R>(f2);
int32x4_t q3 = math::vRoundq_f32<R>(f3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
......@@ -226,13 +266,14 @@ void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
float x_temp = scale * (dequant_scale * x[k]) + bias;
float x_temp =
math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
y[k] = math::Round<R>(x_temp * quant_scale);
}
}
}
} else {
// TODO(hjchen2)
// TODO(hjchen2)
max_abs = std::max(max_abs, 1e-6f);
}
param->online_scale_->mutable_data<float>()[0] = max_abs;
......@@ -251,13 +292,13 @@ void FusionDequantAddBNQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNQuantCompute<ROUND_NEAREST_TO_EVEN>(&param);
DequantBNQuantCompute<Linear, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNQuantCompute<ROUND_NEAREST_TOWARDS_ZERO>(&param);
DequantBNQuantCompute<Linear, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNQuantCompute<ROUND_NEAREST_AWAY_ZERO>(&param);
DequantBNQuantCompute<Linear, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
......@@ -267,105 +308,9 @@ void FusionDequantAddBNQuantKernel<CPU, float>::Compute(
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
// Fused dequantize + batchnorm + relu + requantize (int32 -> int8).
// Each element is transformed as:
//   y = Round<R>(quant_scale * max(0, bn_scale*(dequant_scale*x) + bn_bias))
// where quant_scale = 127 / max_abs comes from the offline scale tensor.
// Assumes a 4-D input where dims are {batch, channel, height, width}.
template <RoundType R>
void DequantBNReluQuantCompute(
    const FusionDequantAddBNReluQuantParam<CPU> *param) {
  const int32_t *input = param->input_->data<int32_t>();
  // Per-channel scale/bias folded by PublicFusionDequantBNInitParam.
  const float *bn_scale = param->bn_scale_->data<float>();
  const float *bn_bias = param->bn_bias_->data<float>();
  // dequantize params
  const float activation_scale = param->activation_scale_->data<float>()[0];
  const float weight_scale = param->weight_scale_;
  const float dequant_scale = activation_scale / weight_scale;
  // quantize params
  // NOTE(review): output_scale is never read below; online_scale_ is
  // accessed directly at the end of the function.
  Tensor *output_scale = param->online_scale_;
  float max_abs = 0.f;

  int8_t *output = param->output_->mutable_data<int8_t>();
  int batch_size = param->input_->dims()[0];
  int channels = param->input_->dims()[1];
  size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];

  // Only the static (offline) scale path is implemented; the dynamic
  // branch below is a stub.
  // if (param->is_static_) {
  if (true) {
    max_abs = param->offline_scale_->data<float>()[0];
    float quant_scale = 127.f / max_abs;
#pragma omp parallel for collapse(2)
    for (int batch = 0; batch < batch_size; ++batch) {
      for (int c = 0; c < channels; ++c) {
        // float scale = bn_scale[c] * dequant_scale;
        float scale = bn_scale[c];
        float bias = bn_bias[c];
        size_t offset = (batch * channels + c) * spatial_size;
        const int32_t *x = input + offset;
        int8_t *y = output + offset;
        size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
        // Vectorized path: 16 elements (4 x float32x4) per iteration.
        int loop = spatial_size >> 4;
        remain = spatial_size & 0xF;
        float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
        float32x4_t __scale = vdupq_n_f32(scale);
        float32x4_t __bias = vdupq_n_f32(bias);
        float32x4_t __zero = vdupq_n_f32(0.f);
        float32x4_t __quant_scale = vdupq_n_f32(quant_scale);
        for (int k = 0; k < loop; ++k, x += 16, y += 16) {
          int32x4_t r0 = vld1q_s32(x);
          int32x4_t r1 = vld1q_s32(x + 4);
          int32x4_t r2 = vld1q_s32(x + 8);
          int32x4_t r3 = vld1q_s32(x + 12);
          // int32 -> float32
          float32x4_t f0 = vcvtq_f32_s32(r0);
          float32x4_t f1 = vcvtq_f32_s32(r1);
          float32x4_t f2 = vcvtq_f32_s32(r2);
          float32x4_t f3 = vcvtq_f32_s32(r3);
          // dequantize
          f0 = vmulq_f32(__dequant_scale, f0);
          f1 = vmulq_f32(__dequant_scale, f1);
          f2 = vmulq_f32(__dequant_scale, f2);
          f3 = vmulq_f32(__dequant_scale, f3);
          // batchnorm: scale * x + bias
          f0 = vmlaq_f32(__bias, __scale, f0);
          f1 = vmlaq_f32(__bias, __scale, f1);
          f2 = vmlaq_f32(__bias, __scale, f2);
          f3 = vmlaq_f32(__bias, __scale, f3);
          // relu
          f0 = vmaxq_f32(__zero, f0);
          f1 = vmaxq_f32(__zero, f1);
          f2 = vmaxq_f32(__zero, f2);
          f3 = vmaxq_f32(__zero, f3);
          // requantize
          f0 = vmulq_f32(__quant_scale, f0);
          f1 = vmulq_f32(__quant_scale, f1);
          f2 = vmulq_f32(__quant_scale, f2);
          f3 = vmulq_f32(__quant_scale, f3);
          // round per policy R, then narrow int32 -> int16 -> int8
          int32x4_t q0 = math::vround_f32<R>(f0);
          int32x4_t q1 = math::vround_f32<R>(f1);
          int32x4_t q2 = math::vround_f32<R>(f2);
          int32x4_t q3 = math::vround_f32<R>(f3);
          int16x4_t d0 = vmovn_s32(q0);
          int16x4_t d1 = vmovn_s32(q1);
          int16x4_t d2 = vmovn_s32(q2);
          int16x4_t d3 = vmovn_s32(q3);
          int16x8_t q5 = vcombine_s16(d0, d1);
          int16x8_t q6 = vcombine_s16(d2, d3);
          int8x8_t d5 = vmovn_s16(q5);
          int8x8_t d6 = vmovn_s16(q6);
          vst1_s8(y, d5);
          vst1_s8(y + 8, d6);
        }
#endif  // __ARM_NEON__
        // Scalar tail (or the full range when NEON is unavailable).
        for (int k = 0; k < remain; ++k) {
          float x_temp = std::max(scale * (dequant_scale * x[k]) + bias, 0.f);
          y[k] = math::Round<R>(x_temp * quant_scale);
        }
      }
    }
  } else {
    // TODO(hjchen2)
    max_abs = std::max(max_abs, 1e-6f);
  }
  param->online_scale_->mutable_data<float>()[0] = max_abs;
}
template <>
bool FusionDequantAddBNReluQuantKernel<CPU, float>::Init(
FusionDequantAddBNReluQuantParam<CPU> *param) {
FusionDequantAddBNQuantParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
......@@ -373,16 +318,16 @@ bool FusionDequantAddBNReluQuantKernel<CPU, float>::Init(
template <>
void FusionDequantAddBNReluQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNReluQuantParam<CPU> &param) {
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNReluQuantCompute<ROUND_NEAREST_TO_EVEN>(&param);
DequantBNQuantCompute<Relu, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNReluQuantCompute<ROUND_NEAREST_TOWARDS_ZERO>(&param);
DequantBNQuantCompute<Relu, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNReluQuantCompute<ROUND_NEAREST_AWAY_ZERO>(&param);
DequantBNQuantCompute<Relu, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
......
......@@ -53,10 +53,10 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
r1 = vmulq_f32(r1, __scale);
r2 = vmulq_f32(r2, __scale);
r3 = vmulq_f32(r3, __scale);
int32x4_t q0 = math::vround_f32<R>(r0);
int32x4_t q1 = math::vround_f32<R>(r1);
int32x4_t q2 = math::vround_f32<R>(r2);
int32x4_t q3 = math::vround_f32<R>(r3);
int32x4_t q0 = math::vRoundq_f32<R>(r0);
int32x4_t q1 = math::vRoundq_f32<R>(r1);
int32x4_t q2 = math::vRoundq_f32<R>(r2);
int32x4_t q3 = math::vRoundq_f32<R>(r3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#ifdef RELU_OP
#include "operators/kernel/relu_kernel.h"
#include "common/types.h"
#include "operators/math/activation.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
......@@ -22,45 +24,13 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
enum ReluMode {
Relu = 0,
Relu6 = 1,
PRelu = 2,
LeakyRelu = 3,
};
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <ReluMode R = Relu>
inline float32x4_t vRelu_f32(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
return vmaxq_f32(__zero, x);
}
template <>
inline float32x4_t vRelu_f32<Relu6>(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
float32x4_t __six = vdupq_n_f32(6.f);
return vminq_f32(__six, vmaxq_f32(__zero, x));
}
#endif
template <ReluMode R = Relu>
inline float ReluFunc(const float &x) {
return std::max(x, 0.f);
}
template <>
inline float ReluFunc<Relu6>(const float &x) {
return std::min(std::max(x, 0.f), 6.f);
}
template <typename Dtype, ReluMode R>
template <typename Dtype, ActivationType Act>
struct ReluCompute {
void operator()(const Tensor *input, Tensor *output) {}
};
template <ReluMode R>
struct ReluCompute<float, R> {
template <ActivationType Act>
struct ReluCompute<float, Act> {
void operator()(const Tensor *input, Tensor *output) {
const float *x = input->data<float>();
float *y = output->mutable_data<float>();
......@@ -77,10 +47,10 @@ struct ReluCompute<float, R> {
float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = vRelu_f32<R>(r0);
r1 = vRelu_f32<R>(r1);
r2 = vRelu_f32<R>(r2);
r3 = vRelu_f32<R>(r3);
r0 = math::vActiveq_f32<Act>(r0);
r1 = math::vActiveq_f32<Act>(r1);
r2 = math::vActiveq_f32<Act>(r2);
r3 = math::vActiveq_f32<Act>(r3);
vst1q_f32(local_y, r0);
vst1q_f32(local_y + 4, r1);
vst1q_f32(local_y + 8, r2);
......@@ -90,7 +60,7 @@ struct ReluCompute<float, R> {
y += (loop << 4);
#endif
for (size_t i = 0; i < remain; ++i) {
y[i] = ReluFunc<R>(x[i]);
y[i] = math::Active<Act>(x[i]);
}
}
};
......
......@@ -20,48 +20,37 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
// Declares kernel class `KernelClass`, templated on device and data type,
// exposing the standard OpKernelBase Init/Compute interface over parameter
// type `KernelParam`. Definitions of Init/Compute live in the .cpp files.
#define DECLARE_KERNEL(KernelClass, KernelParam)                            \
  template <typename DeviceType, typename T>                                \
  class KernelClass                                                         \
      : public framework::OpKernelBase<DeviceType, KernelParam<DeviceType>> { \
   public:                                                                  \
    bool Init(KernelParam<DeviceType> *param);                              \
    void Compute(const KernelParam<DeviceType> &param);                     \
  };
#ifdef FUSION_DEQUANT_BN_OP
DECLARE_KERNEL(FusionDequantBNKernel, FusionDequantBNParam);
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <typename DeviceType, typename T>
class FusionDequantBNReluKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantBNReluParam<DeviceType>> {
public:
void Compute(const FusionDequantBNReluParam<DeviceType> &param);
bool Init(FusionDequantBNReluParam<DeviceType> *param);
};
DECLARE_KERNEL(FusionDequantBNReluKernel, FusionDequantBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <typename DeviceType, typename T>
class FusionDequantAddBNReluKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNReluParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNReluParam<DeviceType> &param);
bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
};
#ifdef FUSION_DEQUANT_ADD_BN_OP
DECLARE_KERNEL(FusionDequantAddBNKernel, FusionDequantAddBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
template <typename DeviceType, typename T>
class FusionDequantAddBNReluQuantKernel
: public framework::OpKernelBase<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNReluQuantParam<DeviceType> &param);
bool Init(FusionDequantAddBNReluQuantParam<DeviceType> *param);
};
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
// Fused dequantize + add + batch-norm + relu kernel. Reuses
// FusionDequantAddBNParam: the param type carries no activation info, so
// the ReLU is presumably selected inside the kernel implementation —
// TODO(review): confirm against the kernel's Compute().
DECLARE_KERNEL(FusionDequantAddBNReluKernel, FusionDequantAddBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
template <typename DeviceType, typename T>
class FusionDequantAddBNQuantKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNQuantParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNQuantParam<DeviceType> &param);
bool Init(FusionDequantAddBNQuantParam<DeviceType> *param);
};
DECLARE_KERNEL(FusionDequantAddBNQuantKernel, FusionDequantAddBNQuantParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
// Fused dequantize + add + batch-norm + relu + re-quantize kernel; shares
// FusionDequantAddBNQuantParam with the non-relu quant variant above.
DECLARE_KERNEL(FusionDequantAddBNReluQuantKernel, FusionDequantAddBNQuantParam);
#endif
} // namespace operators
......
......@@ -14,24 +14,52 @@ limitations under the License. */
#pragma once
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "framework/operator.h"
#include "operators/op_param.h"
#include <algorithm>
#include <cmath>
#include "common/types.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
// Kernel for the fused dequantize + elementwise-add + batch-norm operator.
// Init() performs one-time setup from the op parameters; Compute() runs the
// fused computation described by FusionDequantAddBNParam.
template <typename DeviceType, typename T>
class FusionDequantAddBNKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNParam<DeviceType> &param);
bool Init(FusionDequantAddBNParam<DeviceType> *param);
};
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// Vectorized activation over 4 packed floats. The primary template
// (Act == Linear) is the identity; specializations handle other
// activation types.
template <ActivationType Act = Linear>
inline float32x4_t vActiveq_f32(const float32x4_t &x) {
return x;
}
} // namespace operators
} // namespace paddle_mobile
// ReLU: lane-wise max(x, 0).
// NOTE(review): identifiers starting with a double underscore (__zero) are
// reserved to the implementation in C++; consider renaming.
template <>
inline float32x4_t vActiveq_f32<Relu>(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
return vmaxq_f32(x, __zero);
}
// ReLU6: lane-wise clamp of x into [0, 6].
// NOTE(review): __zero/__six use reserved identifier names; consider renaming.
template <>
inline float32x4_t vActiveq_f32<Relu6>(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
float32x4_t __six = vdupq_n_f32(6.f);
return vminq_f32(vmaxq_f32(x, __zero), __six);
}
#endif
// Scalar activation applied element-wise; mirrors the NEON vActiveq_f32
// variants above. The primary template (Act == Linear) is the identity.
template <ActivationType Act = Linear>
inline float Active(const float &x) {
  return x;
}

// ReLU: max(x, 0).
template <>
inline float Active<Relu>(const float &x) {
  return (x < 0.f) ? 0.f : x;
}

// ReLU6: clamp x into [0, 6].
template <>
inline float Active<Relu6>(const float &x) {
  return (x < 0.f) ? 0.f : ((x > 6.f) ? 6.f : x);
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -50,31 +50,31 @@ inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO>
inline int32x4_t vround_f32(float32x4_t r) {
return vcvtq_s32_f32(r);
inline int32x4_t vRoundq_f32(const float32x4_t &x) {
return vcvtq_s32_f32(x);
}
template <>
inline int32x4_t vround_f32<ROUND_NEAREST_AWAY_ZERO>(float32x4_t r) {
inline int32x4_t vRoundq_f32<ROUND_NEAREST_AWAY_ZERO>(const float32x4_t &x) {
float32x4_t plus = vdupq_n_f32(0.5);
float32x4_t minus = vdupq_n_f32(-0.5);
float32x4_t zero = vdupq_n_f32(0);
uint32x4_t more_than_zero = vcgtq_f32(r, zero);
uint32x4_t more_than_zero = vcgtq_f32(x, zero);
float32x4_t temp = vbslq_f32(more_than_zero, plus, minus);
temp = vaddq_f32(r, temp);
temp = vaddq_f32(x, temp);
int32x4_t ret = vcvtq_s32_f32(temp);
return ret;
}
template <>
inline int32x4_t vround_f32<ROUND_NEAREST_TO_EVEN>(float32x4_t r) {
inline int32x4_t vRoundq_f32<ROUND_NEAREST_TO_EVEN>(const float32x4_t &x) {
float32x4_t point5 = vdupq_n_f32(0.5);
int32x4_t one = vdupq_n_s32(1);
int32x4_t zero = vdupq_n_s32(0);
int32x4_t rnd = vround_f32<ROUND_NEAREST_AWAY_ZERO>(r);
int32x4_t rnd = math::vRoundq_f32<ROUND_NEAREST_AWAY_ZERO>(x);
float32x4_t frnd = vcvtq_f32_s32(rnd);
frnd = vsubq_f32(frnd, r);
frnd = vsubq_f32(frnd, x);
frnd = vabsq_f32(frnd);
uint32x4_t equal_point5 = vceqq_f32(frnd, point5);
int32x4_t abs_rnd = vabsq_s32(rnd);
......
......@@ -2530,7 +2530,7 @@ class QuantizeParam : public OpParam {
// scale = max(abs(x))
online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
// offline
if (OpParam::HasAttr("InScale", attrs)) {
if (inputs.count("InScale")) {
offline_ = true;
offline_scale_ = OpParam::GetVarValue<GType>("InScale", inputs, scope);
}
......@@ -2566,9 +2566,7 @@ class DequantizeParam : public OpParam {
DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_ = InputXFrom<GType>(inputs, scope);
if (outputs.count("Out")) {
output_ = OutFrom<GType>(outputs, scope);
}
output_ = OutFrom<GType>(outputs, scope);
activation_scale_ = OpParam::GetVarValue<GType>("Scale", inputs, scope);
// dequantization is performed as x = x / static_scale / online_scale
if (OpParam::HasAttr("weight_scale", attrs)) {
......@@ -2588,10 +2586,10 @@ class DequantizeParam : public OpParam {
};
#endif
#if defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_BN_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
template <typename Dtype>
class FusionDequantBNParam : public DequantizeParam<Dtype> {
......@@ -2609,10 +2607,6 @@ class FusionDequantBNParam : public DequantizeParam<Dtype> {
bn_scale_ = OpParam::GetVarValue<GType>("BNScale", inputs, scope);
bn_bias_ = OpParam::GetVarValue<GType>("BNBias", inputs, scope);
epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
// output
if (outputs.count("Y")) {
this->output_ = OpParam::OutputYFrom<GType>(outputs, scope);
}
}
public:
......@@ -2642,10 +2636,6 @@ class FusionDequantAddBNParam : public FusionDequantBNParam<Dtype> {
// element wise add params
axis_ = OpParam::GetAttr<int>("axis", attrs);
bias_ = OpParam::InputYFrom<GType>(inputs, scope);
// output
if (outputs.count("Y")) {
this->output_ = OpParam::OutputYFrom<GType>(outputs, scope);
}
}
public:
......@@ -2655,44 +2645,6 @@ class FusionDequantAddBNParam : public FusionDequantBNParam<Dtype> {
};
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
// Parameters for the fused dequantize + batch-norm + relu operator.
// Inherits everything (input, scales, BN scale/bias, epsilon) from
// FusionDequantBNParam and only re-binds the output tensor.
template <typename Dtype>
class FusionDequantBNReluParam : public FusionDequantBNParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDequantBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: FusionDequantBNParam<Dtype>(inputs, outputs, attrs, scope) {
// Rebind the output to the "Out" slot when this op registers one;
// otherwise keep whatever output the base class resolved.
if (outputs.count("Out")) {
this->output_ = OpParam::OutFrom<GType>(outputs, scope);
}
}
};
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
// Parameters for the fused dequantize + elementwise-add + batch-norm + relu
// operator. Inherits the add/BN fields (axis, bias, BN scale/bias, epsilon)
// from FusionDequantAddBNParam and only re-binds the output tensor.
template <typename Dtype>
class FusionDequantAddBNReluParam : public FusionDequantAddBNParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDequantAddBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: FusionDequantAddBNParam<Dtype>(inputs, outputs, attrs, scope) {
// Rebind the output to the "Out" slot when this op registers one;
// otherwise keep whatever output the base class resolved.
if (outputs.count("Out")) {
this->output_ = OpParam::OutFrom<GType>(outputs, scope);
}
}
};
#endif
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
template <typename Dtype>
class FusionDequantAddBNQuantParam : public FusionDequantAddBNParam<Dtype> {
......@@ -2707,47 +2659,9 @@ class FusionDequantAddBNQuantParam : public FusionDequantAddBNParam<Dtype> {
// scale output
online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
// offline
if (OpParam::HasAttr("static_scale", attrs)) {
is_static_ = true;
static_scale_ = OpParam::GetAttr<float>("static_scale", attrs);
}
// x = round(scale * x)
if (OpParam::HasAttr("round_type", attrs)) {
round_type_ = OpParam::GetAttr<RoundType>("round_type", attrs);
}
}
public:
RType *online_scale_;
// if static scale or not
bool is_static_ = false;
// quantize scale
float static_scale_ = 1.0f;
// round method type
// RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
};
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
template <typename Dtype>
class FusionDequantAddBNReluQuantParam
: public FusionDequantAddBNReluParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDequantAddBNReluQuantParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs,
const Scope &scope)
: FusionDequantAddBNReluParam<Dtype>(inputs, outputs, attrs, scope) {
// scale output
online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
// offline
if (OpParam::HasAttr("static_scale", attrs)) {
is_static_ = true;
static_scale_ = OpParam::GetAttr<float>("static_scale", attrs);
if (inputs.count("InScale")) {
offline_ = true;
offline_scale_ = OpParam::GetVarValue<GType>("InScale", inputs, scope);
}
// x = round(scale * x)
if (OpParam::HasAttr("round_type", attrs)) {
......@@ -2757,10 +2671,10 @@ class FusionDequantAddBNReluQuantParam
public:
RType *online_scale_;
// if static scale or not
bool is_static_ = false;
// quantize scale
float static_scale_ = 1.0f;
// quantize offline scale
RType *offline_scale_;
// if offine scale or not
bool offline_ = false;
// round method type
// RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
......
......@@ -129,7 +129,8 @@ void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
}
template <typename Itype, typename Otype, int Kernel, int Pad, int Stride>
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels,
int groups) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
......@@ -147,7 +148,7 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
framework::DDim filter_shape =
framework::make_ddim({output_c, input_c, kernel_h, kernel_w});
framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w});
VariableNameMap inputs;
VariableNameMap outputs;
......@@ -164,13 +165,22 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
auto filter = filter_var->template GetMutable<framework::LoDTensor>();
SetupTensor<Itype>(filter, filter_shape, -20, 20);
for (int i = 0; i < input->numel(); ++i) {
DLOG << "input[" << i
<< "] = " << static_cast<int>(input->data<int8_t>()[i]);
}
for (int i = 0; i < filter->numel(); ++i) {
DLOG << "filter[" << i
<< "] = " << static_cast<int>(filter->data<int8_t>()[i]);
}
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["dilations"].Set<vector<int>>(
std::vector<int>({dilation_h, dilation_w}));
attrs["groups"].Set<int>(1);
attrs["groups"].Set<int>(groups);
auto *op = new operators::ConvOp<CPU, float>("conv2d", inputs, outputs, attrs,
scope);
......@@ -204,15 +214,15 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
Otype *output_cmp_data = output_cmp.data<Otype>();
for (int i = 0; i < output->numel(); ++i) {
float gap = output_data[i] - output_cmp_data[i];
PADDLE_MOBILE_ENFORCE(std::abs(gap / (output_data[i] + 1e-5)) < 1e-3,
"output[%d] = %d, output_cmp[%d] = %d", i,
output_data[i], i, output_cmp_data[i]);
// if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
// LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
// << ", output_cmp_data[" << i << "] = " <<
// output_cmp_data[i];
// return 1;
// }
// PADDLE_MOBILE_ENFORCE(std::abs(gap / (output_data[i] + 1e-5)) < 1e-3,
// "output[%d] = %d, output_cmp[%d] = %d", i,
// output_data[i], i, output_cmp_data[i]);
if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
<< ", output_cmp_data[" << i
<< "] = " << output_cmp_data[i];
exit(1);
}
}
delete op;
return 0;
......@@ -224,7 +234,8 @@ int main(int argc, char *argv[]) {
if (argc < 5) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-int8-conv-op in_channels in_height in_width out_channels\n"
<< " ./test-int8-conv-op in_channels in_height in_width out_channels "
"[groups]\n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
......@@ -236,72 +247,134 @@ int main(int argc, char *argv[]) {
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
int out_channels = atoi(argv[4]);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 5, stride = 3
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 4
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(in_channels, in_height,
in_width, out_channels);
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels, in_height,
in_width, out_channels);
int groups = 1;
if (argc == 6) {
groups = atoi(argv[5]);
}
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<float, float, 5, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<float, float, 5, 2, 1>(in_channels, in_height,
in_width, out_channels);
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 2, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 5, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=5, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 5, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 2>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 2>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 2, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=2, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 2, 2>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 5, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=5, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 5, 2>(
in_channels, in_height, in_width, out_channels, groups);
// // kernel = 7, pad = 0, stride = 2
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 1, stride = 2
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 3, stride = 2
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 1, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 3, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 5, stride = 3
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 3, stride = 4
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 3, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 3, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
// paddle_mobile::TestConvOp<float, float, 3, 0, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
// // kernel = 3, pad = 1, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 3, pad = 1, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
// paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
// // kernel = 5, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 5, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1";
// paddle_mobile::TestConvOp<float, float, 5, 0, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
// // kernel = 5, pad = 2, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 5, pad = 2, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1";
// paddle_mobile::TestConvOp<float, float, 5, 2, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
}
......@@ -249,6 +249,7 @@ if(NOT FOUND_MATCH)
set(SUM_OP ON)
set(QUANT_OP ON)
set(DEQUANT_OP ON)
set(FUSION_DEQUANT_BN_OP ON)
set(FUSION_DEQUANT_ADD_BN_OP ON)
set(FUSION_DEQUANT_BN_RELU_OP ON)
set(FUSION_DEQUANT_ADD_BN_RELU_OP ON)
......@@ -455,6 +456,9 @@ endif()
if (DEQUANT_OP)
add_definitions(-DDEQUANT_OP)
endif()
if (FUSION_DEQUANT_BN_OP)
add_definitions(-DFUSION_DEQUANT_BN_OP)
endif()
if (FUSION_DEQUANT_ADD_BN_OP)
add_definitions(-DFUSION_DEQUANT_ADD_BN_OP)
endif()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册