Unverified commit e2636a12, authored by Ray Liu, committed by GitHub

Merge pull request #1344 from hjchen2/dev-latest

Refactor pooling and dequant fusion implementation, fix some code style
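For orientation: each dequant fusion operator in this diff folds a chain of ops into one fused op. The chains, as declared by the FusionOpMatcher subclasses below (listing reconstructed from the matchers in this diff):

// fusion_dequant_bn:                dequantize > batch_norm
// fusion_dequant_bn_relu:           dequantize > batch_norm > relu
// fusion_dequant_add_bn:            dequantize > elementwise_add > batch_norm
// fusion_dequant_add_bn_relu:       dequantize > elementwise_add > batch_norm > relu
// fusion_dequant_add_bn_quant:      dequantize > elementwise_add > batch_norm > quantize
// fusion_dequant_add_bn_relu_quant: dequantize > elementwise_add > batch_norm > relu > quantize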
......@@ -16,9 +16,9 @@ limitations under the License. */
#ifdef ENABLE_EXCEPTION
#include <stdio.h>
#include <stdlib.h>
#include <exception>
#include <string>
#endif
namespace paddle_mobile {
......
......@@ -24,7 +24,6 @@ const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8 = "fusion_conv_add_relu_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
......@@ -32,7 +31,6 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
const char *G_OP_TYPE_FC_INT8 = "fusion_fc_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
......@@ -41,6 +39,7 @@ const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform";
const char *G_OP_TYPE_POOL2D = "pool2d";
const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
const char *G_OP_TYPE_RELU = "relu";
const char *G_OP_TYPE_RELU6 = "relu6";
const char *G_OP_TYPE_RESHAPE = "reshape";
const char *G_OP_TYPE_RESHAPE2 = "reshape2";
const char *G_OP_TYPE_SIGMOID = "sigmoid";
......@@ -73,9 +72,14 @@ const char *G_OP_TYPE_SUM = "sum";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
const char *G_OP_TYPE_FUSION_DEQUANT_BN = "fusion_dequant_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN = "fusion_dequant_add_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU = "fusion_dequant_bn_relu";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT =
"fusion_dequant_add_bn_quant";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT =
"fusion_dequant_add_bn_relu_quant";
const char *G_OP_TYPE_TANH = "tanh";
const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
......@@ -91,6 +95,7 @@ std::unordered_map<
{G_OP_TYPE_PRELU, {{"X", "Alpha"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_RELU6, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
......@@ -112,13 +117,11 @@ std::unordered_map<
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_FC_INT8, {{"X", "Y", "Z", "Scale"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
......@@ -142,9 +145,14 @@ std::unordered_map<
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Y"}}},
{G_OP_TYPE_FUSION_DEQUANT_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT,
{{"X", "Scale"}, {"Out", "OutScale"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT,
{{"X", "Scale"}, {"Out", "OutScale"}}},
{G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_ADD, {{"Input"}, {"Out"}}},
......
......@@ -87,10 +87,24 @@ enum PMStatus {
};
enum RoundType {
ROUND_UNK = 0,
ROUND_NEAREST_AWAY_ZERO = 1,
ROUND_NEAREST_TOWARDS_ZERO = 2,
ROUND_NEAREST_TO_EVEN = 3
ROUND_NEAREST_AWAY_ZERO = 0,
ROUND_NEAREST_TOWARDS_ZERO = 1,
ROUND_NEAREST_TO_EVEN = 2,
};
enum ActivationType {
IDENTITY = 0,
RELU = 1,
RELU6 = 2,
PRELU = 3,
LEAKY_RELU = 4,
TANH = 5,
SIGMOID = 6,
};
enum PoolingType {
MAX = 0,
AVG = 1,
};
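// Illustrative sketch, not part of this diff: ActivationType is consumed as a
// compile-time template parameter by the fused kernels below, via helpers such
// as math::Active<Act> and math::vActiveq_f32<Act> from
// "operators/math/activation.h". The scalar dispatch looks roughly like this
// (names here are assumptions):
template <ActivationType Act>
inline float ActiveSketch(float x) {
  return x;  // IDENTITY: pass the value through unchanged
}
template <>
inline float ActiveSketch<RELU>(float x) {
  return x > 0.f ? x : 0.f;  // clamp negatives to zero
}
template <>
inline float ActiveSketch<RELU6>(float x) {
  return x < 0.f ? 0.f : (x > 6.f ? 6.f : x);  // clamp to [0, 6]
}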
extern const char *G_OP_TYPE_CONV;
......@@ -99,11 +113,9 @@ extern const char *G_OP_TYPE_BOX_CODER;
extern const char *G_OP_TYPE_CONCAT;
extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FC_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;
......@@ -116,6 +128,7 @@ extern const char *G_OP_TYPE_MULTICLASS_NMS;
extern const char *G_OP_TYPE_POOL2D;
extern const char *G_OP_TYPE_PRIOR_BOX;
extern const char *G_OP_TYPE_RELU;
extern const char *G_OP_TYPE_RELU6;
extern const char *G_OP_TYPE_RESHAPE;
extern const char *G_OP_TYPE_SIGMOID;
extern const char *G_OP_TYPE_SOFTMAX;
......@@ -140,9 +153,12 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT;
extern const char *G_OP_TYPE_TANH;
extern const char *G_OP_TYPE_FUSION_DECONV_RELU;
......
......@@ -302,7 +302,15 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost;
if (ops[i]->Type() == "conv2d") {
auto inputs = ops[i]->Inputs();
auto *filter = framework::GetVarValue<framework::LoDTensor>(
"Filter", inputs, *(program_.scope));
int kernel_size = filter->dims()[2];
_tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
} else {
_tp[ops[i]->Type()] += timeCost;
}
}
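      // Note (not part of the diff): with the branch above, conv2d profile
      // entries are aggregated per filter size taken from filter->dims()[2],
      // so the summary prints separate buckets such as "conv2d_1", "conv2d_3",
      // "conv2d_5" (illustrative keys) instead of one combined "conv2d" entry.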
printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>;
......@@ -372,6 +380,14 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
if (ops[i]->Type() == "conv2d") {
auto inputs = ops[i]->Inputs();
auto input_keys = ops[i]->GetInputKeys();
auto *filter = framework::GetVarValue<framework::LoDTensor>(
input_keys[1], inputs, *(program_.scope));
int kernel_size = filter->dims()[2];
printf("kernel size: %d\n", kernel_size);
}
_tp[ops[i]->Type()] += timeCost;
}
printf("====================[ profile ]======================\n");
......
......@@ -191,6 +191,7 @@ LOAD_OP2(mul, CPU, MALI_GPU);
#endif
#ifdef RELU_OP
LOAD_OP2(relu, CPU, MALI_GPU);
LOAD_OP1(relu6, CPU);
#endif
#ifdef IM2SEQUENCE_OP
LOAD_OP1(im2sequence, CPU);
......@@ -233,6 +234,10 @@ LOAD_OP1(quantize, CPU);
#ifdef DEQUANT_OP
LOAD_OP1(dequantize, CPU);
#endif
#ifdef FUSION_DEQUANT_BN_OP
LOAD_OP1(fusion_dequant_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_bn);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_OP
LOAD_OP1(fusion_dequant_add_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn);
......@@ -245,3 +250,11 @@ LOAD_FUSION_MATCHER(fusion_dequant_bn_relu);
LOAD_OP1(fusion_dequant_add_bn_relu, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
LOAD_OP1(fusion_dequant_add_bn_quant, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_quant);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
LOAD_OP1(fusion_dequant_add_bn_relu_quant, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu_quant);
#endif
......@@ -98,24 +98,6 @@ class OpRegistry {
}
};
#define REGISTER_OPERATOR_INT8(op_type, op_class, device_name, device_type) \
template class op_class<device_type, int8_t>; \
template <typename Dtype, typename T> \
class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \
public: \
DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class); \
}; \
static paddle_mobile::framework::OperatorRegistrar< \
device_type, _OpClass_##op_type##_##device_name<device_type, int8_t>> \
__op_registrar_##op_type##_##device_name(#op_type); \
int TouchOpRegistrar_##op_type##_##device_name() { \
__op_registrar_##op_type##_##device_name.Touch(); \
return 0; \
}
#define REGISTER_OPERATOR_CPU_INT8(op_type, op_class) \
REGISTER_OPERATOR_INT8(op_type, op_class, cpu, paddle_mobile::CPU);
#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \
template class op_class<device_type, float>; \
template <typename Dtype, typename T> \
......
......@@ -220,7 +220,16 @@ void Node::Folder(
}
} else {
for (auto &op_output : this->op_desc_->outputs_) {
op_desc->outputs_.emplace(op_output.first, op_output.second);
auto output_key = op_output.first;
if (change->find(this->type_) != change->end()) {
const auto change_pairs = (*change)[this->type_];
for (const auto &target : change_pairs) {
if (target.first == output_key) {
output_key = target.second;
}
}
}
op_desc->outputs_.emplace(output_key, op_output.second);
}
for (auto &output : this->outputs_) {
......
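The Node::Folder change above remaps output keys of the folded subgraph the same way input keys were already remapped. A standalone sketch of that lookup, under the assumption that `change` maps an op type to a list of (old key, new key) pairs, as the surrounding code suggests (types simplified):

#include <map>
#include <string>
#include <utility>
#include <vector>

std::string RemapOutputKey(
    const std::string &op_type, const std::string &output_key,
    const std::map<std::string,
                   std::vector<std::pair<std::string, std::string>>> &change) {
  auto it = change.find(op_type);
  if (it == change.end()) return output_key;  // no renames for this op type
  for (const auto &target : it->second) {
    if (target.first == output_key) return target.second;  // e.g. "Y" -> "Out"
  }
  return output_key;
}

With the {"Y", "Out"} entries added to the dequant fusion matchers below, this is what turns batch_norm's "Y" output into the fused op's "Out" output.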
......@@ -143,12 +143,10 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
int t1 = 1;
int t2 = 1;
for (int i = 0; i < m * k; ++i) {
unsigned int seed = 100;
a[i] = t1 + rand_r(&seed) % t2;
a[i] = t1 + rand() % t2; // NOLINT
}
for (int i = 0; i < k * n; ++i) {
unsigned int seed = 200;
b[i] = t1 + rand_r(&seed) % t2;
b[i] = t1 + rand() % t2; // NOLINT
}
paddle_mobile::operators::math::Gemm gemm;
auto time1 = paddle_mobile::time();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include "operators/fusion_conv_add_relu_int8_op.h"
#include <vector>
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddReluInt8Op<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_conv_add_relu_int8,
ops::FusionConvAddReluInt8Op);
#endif
#endif // FUSION_CONVADDRELU_INT8_OP
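For reference, math::ConvOutputSize used above is expected to implement the usual convolution output-size arithmetic; a sketch under that assumption (not copied from the library):

inline int ConvOutputSizeSketch(int input_size, int filter_size, int dilation,
                                int padding, int stride) {
  // effective filter extent once dilation is applied
  int dilated_filter = dilation * (filter_size - 1) + 1;
  return (input_size + 2 * padding - dilated_filter) / stride + 1;
}

For example, a 224x224 input with a 3x3 filter, padding 1, stride 2, dilation 1 gives (224 + 2 - 3) / 2 + 1 = 112.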
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionConvAddReluInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>> {
public:
FusionConvAddReluInt8Op(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDRELU_INT8_OP
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_add_bn_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -43,7 +43,8 @@ class FusionDequantAddBNMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -44,7 +44,8 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......@@ -54,7 +55,7 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
template <typename DeviceType, typename T>
class FusionDequantAddBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>> {
public:
FusionDequantAddBNReluOp(const std::string &type,
......@@ -63,7 +64,7 @@ class FusionDequantAddBNReluOp
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
......
......@@ -12,50 +12,51 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Removed in this change (the old fusion_fc_int8 operator):
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionFcInt8Op<Dtype, T>::InferShape() const {
  auto x_dims = this->param_.InputX()->dims();
  auto y_dims = this->param_.InputY()->dims();
  int x_num_col_dims = this->param_.XNumColDims();
  int y_num_col_dims = this->param_.YNumColDims();
  assert(x_dims.size() > x_num_col_dims);
  assert(y_dims.size() > y_num_col_dims);
  /// (1,2,3,4) , x_num_col_dims = 2 -> (2,12)
  auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
  auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
  assert(x_mat_dims[1] == y_mat_dims[0]);
  std::vector<int64_t> output_dims;
  output_dims.reserve(
      static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
  for (int i = 0; i < x_num_col_dims; ++i) {
    output_dims.push_back(x_dims[i]);
  }
  for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
    output_dims.push_back(y_dims[i]);
  }
  framework::DDim ddim = framework::make_ddim(output_dims);
  this->param_.Out()->Resize(ddim);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op);
#endif
#endif // FUSION_FC_INT8_OP
// Added in this change (the dequant+add+bn[+relu]+quant fused operators):
#include "operators/fusion_dequant_add_bn_relu_quant_op.h"
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionDequantAddBNReluQuantOp<Dtype, T>::InferShape() const {
  const auto& input_dims = this->param_.input_->dims();
  this->param_.output_->Resize(input_dims);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_relu_quant,
                        ops::FusionDequantAddBNReluQuantMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_relu_quant,
                      ops::FusionDequantAddBNReluQuantOp);
#endif
#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionDequantAddBNQuantOp<Dtype, T>::InferShape() const {
  const auto& input_dims = this->param_.input_->dims();
  this->param_.output_->Resize(input_dims);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_quant,
                        ops::FusionDequantAddBNQuantMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_quant,
                      ops::FusionDequantAddBNQuantOp);
#endif
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
class FusionDequantAddBNReluQuantMatcher : public framework::FusionOpMatcher {
public:
FusionDequantAddBNReluQuantMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU) >
std::make_shared<framework::Node>(G_OP_TYPE_QUANTIZE);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT; }
};
template <typename DeviceType, typename T>
class FusionDequantAddBNReluQuantOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>,
operators::FusionDequantAddBNReluQuantKernel<DeviceType, T>> {
public:
FusionDequantAddBNReluQuantOp(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>,
operators::FusionDequantAddBNReluQuantKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
class FusionDequantAddBNQuantMatcher : public framework::FusionOpMatcher {
public:
FusionDequantAddBNQuantMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_QUANTIZE);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT; }
};
template <typename DeviceType, typename T>
class FusionDequantAddBNQuantOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNQuantParam<DeviceType>,
operators::FusionDequantAddBNQuantKernel<DeviceType, T>> {
public:
FusionDequantAddBNQuantOp(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNQuantParam<DeviceType>,
operators::FusionDequantAddBNQuantKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
} // namespace operators
} // namespace paddle_mobile
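In these matchers the fused pattern is declared in the constructor by chaining framework::Node objects with the overloaded `>` operator, and FolderNodes() collapses a matching subgraph into the single fused op, renaming the absorbed ops' keys per the map passed to Node::Folder. The {"Y", "Out"} entries rely on the output-key remapping added to Node::Folder earlier in this diff; the effect, with an illustrative variable name:

// before folding: batch_norm outputs {"Y": ["bn_out_var"]}
// after  folding: fused op   outputs {"Out": ["bn_out_var"]}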
......@@ -12,28 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_BN_RELU_OP
#include "operators/fusion_dequant_bn_relu_op.h"
#include "operators/fusion_dequant_bn_op.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_BN_OP
template <typename Dtype, typename T>
void FusionDequantBNOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <typename Dtype, typename T>
void FusionDequantBNReluOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef FUSION_DEQUANT_BN_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn, ops::FusionDequantBNMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn, ops::FusionDequantBNOp);
#endif // PADDLE_MOBILE_CPU
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu,
ops::FusionDequantBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp);
#endif
#endif
#endif // PADDLE_MOBILE_CPU
#endif // FUSION_DEQUANT_BN_RELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_BN_RELU_OP)
class FusionDequantBNMatcher : public framework::FusionOpMatcher {
public:
FusionDequantBNMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
}
virtual void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() override { return G_OP_TYPE_FUSION_DEQUANT_BN; }
};
#endif // FUSION_DEQUANT_BN_OP || FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_BN_OP
template <typename DeviceType, typename T>
class FusionDequantBNOp : public framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNKernel<DeviceType, T>> {
public:
FusionDequantBNOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
class FusionDequantBNReluMatcher : public FusionDequantBNMatcher {
public:
FusionDequantBNReluMatcher() : FusionDequantBNMatcher() {
node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
virtual std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionDequantBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>> {
public:
FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
......@@ -42,7 +42,8 @@ class FusionDequantBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionFcInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>> {
public:
FusionFcInt8Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_FC_INT8_OP
......@@ -32,20 +32,6 @@ void ConvAddReluKernel<CPU, float>::Compute(
}
template class ConvAddReluKernel<CPU, float>;
#ifdef FUSION_CONVADDRELU_INT8_OP
template <>
bool ConvAddReluKernel<CPU, int8_t>::Init(FusionConvAddReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddReluKernel<CPU, int8_t>::Compute(
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<int8_t, int32_t>(param);
}
template class ConvAddReluKernel<CPU, int8_t>;
#endif
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "operators/kernel/dequant_add_bn_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
// elementwise add params
const Tensor *bias = param->bias_;
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *bias_ptr = bias->data<float>();
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
}
return true;
}
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
const int32_t *input = param.input_->data<int32_t>();
const float *bn_scale = param.bn_scale_->data<float>();
const float *bn_bias = param.bn_bias_->data<float>();
// dequantize params
const float activation_scale = param.activation_scale_->data<float>()[0];
const float weight_scale = param.weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param.output_->mutable_data<float>();
int batch_size = param.input_->dims()[0];
int channels = param.input_->dims()[1];
size_t spatial_size = param.input_->dims()[2] * param.input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
float scale = bn_scale[c] * dequant_scale;
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = scale * x[k] + bias;
}
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_DEQUANT_ADD_BN_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param->output_->mutable_data<float>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
float scale = bn_scale[c] * dequant_scale;
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
float32x4_t __zero = vdupq_n_f32(0.f);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = vmaxq_f32(__zero, f0);
f1 = vmaxq_f32(__zero, f1);
f2 = vmaxq_f32(__zero, f2);
f3 = vmaxq_f32(__zero, f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = std::max(scale * x[k] + bias, 0.f);
}
}
}
}
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <>
bool FusionDequantBNReluKernel<CPU, float>::Init(
FusionDequantBNReluParam<CPU> *param) {
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = bn_bias_ptr[c] - inv_scale * mean_ptr[c];
}
return true;
}
template <>
void FusionDequantBNReluKernel<CPU, float>::Compute(
const FusionDequantBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <>
bool FusionDequantAddBNReluKernel<CPU, float>::Init(
FusionDequantAddBNReluParam<CPU> *param) {
// elementwise add params
const Tensor *bias = param->bias_;
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *bias_ptr = bias->data<float>();
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
}
return true;
}
template <>
void FusionDequantAddBNReluKernel<CPU, float>::Compute(
const FusionDequantAddBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/math/activation.h"
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
void PublicFusionDequantBNInitParam(FusionDequantBNParam<CPU> *param,
const framework::Tensor *bias) {
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = 1.f / (std::sqrt(var_ptr[c] + epsilon));
float val = bias ? bias->data<float>()[c] : 0;
bn_bias_ptr[c] =
inv_scale * bn_scale_ptr[c] * (val - mean_ptr[c]) + bn_bias_ptr[c];
bn_scale_ptr[c] = inv_scale * bn_scale_ptr[c];
}
}
#endif
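// Scalar sketch, not part of this diff: per channel, once
// PublicFusionDequantBNInitParam has folded the parameters, the kernels below
// compute  y = act(new_scale * (dequant_scale * x) + new_bias)  with
//   new_scale = gamma / sqrt(var + eps)
//   new_bias  = new_scale * (elementwise_bias - mean) + beta
// where dequant_scale = activation_scale / weight_scale. Function and variable
// names here are illustrative assumptions.
inline float FoldedDequantAddBN(int32_t x, float dequant_scale, float new_scale,
                                float new_bias) {
  return new_scale * (dequant_scale * static_cast<float>(x)) + new_bias;
}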
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
template <ActivationType Act>
void DequantBNCompute(const FusionDequantBNParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param->output_->mutable_data<float>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
// do not fuse the bn scale with the dequant scale, to minimize the precision difference
// float scale = bn_scale[c] * dequant_scale;
float scale = bn_scale[c];
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmulq_f32(__dequant_scale, f0);
f1 = vmulq_f32(__dequant_scale, f1);
f2 = vmulq_f32(__dequant_scale, f2);
f3 = vmulq_f32(__dequant_scale, f3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = math::vActiveq_f32<Act>(f0);
f1 = math::vActiveq_f32<Act>(f1);
f2 = math::vActiveq_f32<Act>(f2);
f3 = math::vActiveq_f32<Act>(f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
}
}
}
}
#endif
#ifdef FUSION_DEQUANT_BN_OP
template <>
bool FusionDequantBNKernel<CPU, float>::Init(FusionDequantBNParam<CPU> *param) {
PublicFusionDequantBNInitParam(param, nullptr);
return true;
}
template <>
void FusionDequantBNKernel<CPU, float>::Compute(
const FusionDequantBNParam<CPU> &param) {
DequantBNCompute<IDENTITY>(&param);
}
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <>
bool FusionDequantBNReluKernel<CPU, float>::Init(
FusionDequantBNParam<CPU> *param) {
PublicFusionDequantBNInitParam(param, nullptr);
return true;
}
template <>
void FusionDequantBNReluKernel<CPU, float>::Compute(
const FusionDequantBNParam<CPU> &param) {
DequantBNCompute<RELU>(&param);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_OP
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
DequantBNCompute<IDENTITY>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <>
bool FusionDequantAddBNReluKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNReluKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
DequantBNCompute<RELU>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_OP
#if defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
template <ActivationType Act, RoundType R>
void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
// quantize params
Tensor *output_scale = param->online_scale_;
float max_abs = 0.f;
int8_t *output = param->output_->mutable_data<int8_t>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
// if (param->is_static_) {
if (true) {
max_abs = param->static_scale_;
float quant_scale = 127.f / max_abs;
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
// do not fuse the bn scale with the dequant scale, to minimize the precision difference
// float scale = bn_scale[c] * dequant_scale;
float scale = bn_scale[c];
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
int8_t *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
float32x4_t __quant_scale = vdupq_n_f32(quant_scale);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmulq_f32(__dequant_scale, f0);
f1 = vmulq_f32(__dequant_scale, f1);
f2 = vmulq_f32(__dequant_scale, f2);
f3 = vmulq_f32(__dequant_scale, f3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = math::vActiveq_f32<Act>(f0);
f1 = math::vActiveq_f32<Act>(f1);
f2 = math::vActiveq_f32<Act>(f2);
f3 = math::vActiveq_f32<Act>(f3);
f0 = vmulq_f32(__quant_scale, f0);
f1 = vmulq_f32(__quant_scale, f1);
f2 = vmulq_f32(__quant_scale, f2);
f3 = vmulq_f32(__quant_scale, f3);
int32x4_t q0 = math::vRoundq_f32<R>(f0);
int32x4_t q1 = math::vRoundq_f32<R>(f1);
int32x4_t q2 = math::vRoundq_f32<R>(f2);
int32x4_t q3 = math::vRoundq_f32<R>(f3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
int16x4_t d3 = vmovn_s32(q3);
int16x8_t q5 = vcombine_s16(d0, d1);
int16x8_t q6 = vcombine_s16(d2, d3);
int8x8_t d5 = vmovn_s16(q5);
int8x8_t d6 = vmovn_s16(q6);
vst1_s8(y, d5);
vst1_s8(y + 8, d6);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
float x_temp =
math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
y[k] = math::Round<R>(x_temp * quant_scale);
}
}
}
} else {
// TODO(hjchen2)
max_abs = std::max(max_abs, 1e-6f);
}
param->online_scale_->mutable_data<float>()[0] = max_abs;
}
template <>
bool FusionDequantAddBNQuantKernel<CPU, float>::Init(
FusionDequantAddBNQuantParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
}
}
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
template <>
bool FusionDequantAddBNReluQuantKernel<CPU, float>::Init(
FusionDequantAddBNQuantParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNReluQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNQuantCompute<RELU, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNQuantCompute<RELU, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNQuantCompute<RELU, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
}
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
} // namespace operators
} // namespace paddle_mobile
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "operators/kernel/quantize_kernel.h"
#include <cmath>
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
......@@ -32,81 +33,68 @@ inline float32_t vmaxvq_f32(float32x4_t r) {
}
#endif
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO>
inline int32x4_t vround_f32(float32x4_t r) {
return vcvtq_s32_f32(r);
}
template <>
inline int32x4_t vround_f32<ROUND_NEAREST_AWAY_ZERO>(float32x4_t r) {
float32x4_t plus = vdupq_n_f32(0.5);
float32x4_t minus = vdupq_n_f32(-0.5);
float32x4_t zero = vdupq_n_f32(0);
uint32x4_t more_than_zero = vcgtq_f32(r, zero);
float32x4_t temp = vbslq_f32(more_than_zero, plus, minus);
temp = vaddq_f32(r, temp);
int32x4_t ret = vcvtq_s32_f32(temp);
return ret;
}
template <>
inline int32x4_t vround_f32<ROUND_NEAREST_TO_EVEN>(float32x4_t r) {
float32x4_t point5 = vdupq_n_f32(0.5);
int32x4_t one = vdupq_n_s32(1);
int32x4_t zero = vdupq_n_s32(0);
int32x4_t rnd = vround_f32<ROUND_NEAREST_AWAY_ZERO>(r);
float32x4_t frnd = vcvtq_f32_s32(rnd);
frnd = vsubq_f32(frnd, r);
frnd = vabsq_f32(frnd);
uint32x4_t equal_point5 = vceqq_f32(frnd, point5);
int32x4_t abs_rnd = vabsq_s32(rnd);
abs_rnd = vandq_s32(abs_rnd, one);
uint32x4_t not_mod2 = vreinterpretq_u32_s32(abs_rnd);
uint32x4_t mask = vandq_u32(equal_point5, not_mod2);
uint32x4_t more_than_zero = vcgtq_s32(rnd, zero);
more_than_zero = vandq_u32(more_than_zero, vreinterpretq_u32_s32(one));
mask = veorq_u32(more_than_zero, mask);
more_than_zero = veorq_u32(more_than_zero, vreinterpretq_u32_s32(one));
mask = vaddq_u32(more_than_zero, mask);
int32x4_t smask = vreinterpretq_s32_u32(mask);
smask = vsubq_s32(smask, one);
rnd = vaddq_s32(rnd, smask);
return rnd;
}
template <RoundType R>
inline void QuantizeOffline(const Tensor *input, const float scale,
const float max_abs, Tensor *output) {
const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = remain >> 4;
remain = remain & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __postive_max = vdupq_n_f32(max_abs);
float32x4_t __negtive_max = vdupq_n_f32(-max_abs);
#pragma omp parallel for
for (size_t i = 0; i < loop; ++i) {
const float *local_x = x + (i << 4);
int8_t *local_y = y + (i << 4);
float32x4_t r0 = vld1q_f32(local_x);
float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = vmaxq_f32(vminq_f32(r0, __postive_max), __negtive_max);
r1 = vmaxq_f32(vminq_f32(r1, __postive_max), __negtive_max);
r2 = vmaxq_f32(vminq_f32(r2, __postive_max), __negtive_max);
r3 = vmaxq_f32(vminq_f32(r3, __postive_max), __negtive_max);
r0 = vmulq_f32(r0, __scale);
r1 = vmulq_f32(r1, __scale);
r2 = vmulq_f32(r2, __scale);
r3 = vmulq_f32(r3, __scale);
int32x4_t q0 = math::vRoundq_f32<R>(r0);
int32x4_t q1 = math::vRoundq_f32<R>(r1);
int32x4_t q2 = math::vRoundq_f32<R>(r2);
int32x4_t q3 = math::vRoundq_f32<R>(r3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
int16x4_t d3 = vmovn_s32(q3);
int16x8_t q5 = vcombine_s16(d0, d1);
int16x8_t q6 = vcombine_s16(d2, d3);
int8x8_t d5 = vmovn_s16(q5);
int8x8_t d6 = vmovn_s16(q6);
vst1_s8(local_y, d5);
vst1_s8(local_y + 8, d6);
}
x += (loop << 4);
y += (loop << 4);
#endif
  for (size_t i = 0; i < remain; ++i) {
    float x_temp = std::max(std::min(x[i], max_abs), -max_abs);
    y[i] = math::Round<R>(x_temp * scale);
  }
}
// Removed in this change (replaced by math::Round<R> from the quantize math helpers):
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO>
inline int8_t Round(const float &x) {
  return static_cast<int8_t>(x);
}
template <>
inline int8_t Round<ROUND_NEAREST_AWAY_ZERO>(const float &x) {
  return std::round(x);
}
template <>
inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
  float v = std::round(x);
  int32_t q = static_cast<int32_t>(v);
  if (std::abs(std::abs(q - v) - 0.5) <= 0) {
    if (std::abs(q) % 2 != 0) {
      q = q + ((q > 0) ? -1 : 1);
    }
  }
  return static_cast<int8_t>(q);
}
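// Note (illustrative, not part of the diff): the removed helper implements
// round-half-to-even ("banker's rounding"), which math::Round<ROUND_NEAREST_TO_EVEN>
// is expected to preserve; halfway values go to the nearest even integer:
//   2.5f -> 2,  3.5f -> 4,  -2.5f -> -2,  2.4f -> 2,  2.6f -> 3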
template <RoundType R>
static void Quantize(const Tensor *input, const float scale, Tensor *output) {
inline void QuantizeOnline(const Tensor *input, const float scale,
Tensor *output) {
const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = remain >> 4;
remain = remain & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
#pragma omp parallel for
for (size_t i = 0; i < loop; ++i) {
const float *local_x = x + (i << 4);
......@@ -115,14 +103,14 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = vmulq_n_f32(r0, scale);
r1 = vmulq_n_f32(r1, scale);
r2 = vmulq_n_f32(r2, scale);
r3 = vmulq_n_f32(r3, scale);
int32x4_t q0 = vround_f32<R>(r0);
int32x4_t q1 = vround_f32<R>(r1);
int32x4_t q2 = vround_f32<R>(r2);
int32x4_t q3 = vround_f32<R>(r3);
r0 = vmulq_f32(r0, __scale);
r1 = vmulq_f32(r1, __scale);
r2 = vmulq_f32(r2, __scale);
r3 = vmulq_f32(r3, __scale);
int32x4_t q0 = math::vRoundq_f32<R>(r0);
int32x4_t q1 = math::vRoundq_f32<R>(r1);
int32x4_t q2 = math::vRoundq_f32<R>(r2);
int32x4_t q3 = math::vRoundq_f32<R>(r3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
......@@ -138,7 +126,18 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
y += (loop << 4);
#endif
for (size_t i = 0; i < remain; ++i) {
y[i] = Round<R>(x[i] * scale);
y[i] = math::Round<R>(x[i] * scale);
}
}
template <RoundType R>
static void Quantize(const Tensor *input, const float max_abs,
const bool offline, Tensor *output) {
float scale = 127.f / max_abs;
if (offline) {
QuantizeOffline<R>(input, scale, max_abs, output);
} else {
QuantizeOnline<R>(input, scale, output);
}
}
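// Worked example (assumed values, not from the diff): with max_abs = 6.35f the
// quantization scale is 127.f / 6.35f = 20.f, so an input of 0.5f maps to
// Round<R>(0.5f * 20.f) = 10. Offline mode first clamps inputs to
// [-max_abs, max_abs]; online mode derives max_abs from the tensor itself
// via find_abs_max.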
......@@ -173,6 +172,13 @@ float find_abs_max(const Tensor *input) {
return max_abs;
}
} // namespace operators
} // namespace paddle_mobile
#endif // __ARM_NEON__
namespace paddle_mobile {
namespace operators {
template <>
bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
return true;
......@@ -184,24 +190,23 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
Tensor *output = param.output_;
Tensor *output_scale = param.online_scale_;
float max_abs = 0.f;
if (param.is_static_) {
max_abs = param.static_scale_;
if (param.offline_) {
max_abs = param.offline_scale_->data<float>()[0];
} else {
max_abs = find_abs_max(input);
}
max_abs = std::max(max_abs, 1e-6f);
// only support int8 currently
float scale = 127 / max_abs;
param.online_scale_->mutable_data<float>()[0] = max_abs;
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
Quantize<ROUND_NEAREST_TO_EVEN>(input, scale, output);
Quantize<ROUND_NEAREST_TO_EVEN>(input, max_abs, param.offline_, output);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, scale, output);
Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, max_abs, param.offline_,
output);
break;
case ROUND_NEAREST_AWAY_ZERO:
Quantize<ROUND_NEAREST_AWAY_ZERO>(input, scale, output);
Quantize<ROUND_NEAREST_AWAY_ZERO>(input, max_abs, param.offline_, output);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
......@@ -212,4 +217,4 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // QUANT_OP
......@@ -15,11 +15,56 @@ limitations under the License. */
#ifdef RELU_OP
#include "operators/kernel/relu_kernel.h"
#include "operators/kernel/central-arm-func/relu_arm_func.h"
#include "common/types.h"
#include "operators/math/activation.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
template <typename Dtype, ActivationType Act>
struct ReluCompute {
void operator()(const Tensor *input, Tensor *output) {}
};
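// Float specialization below: the NEON path processes 16 floats per iteration and
// applies the activation lane-wise via math::vActiveq_f32<Act>; the scalar tail
// falls back to math::Active<Act>.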
template <ActivationType Act>
struct ReluCompute<float, Act> {
void operator()(const Tensor *input, Tensor *output) {
const float *x = input->data<float>();
float *y = output->mutable_data<float>();
size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = remain >> 4;
remain = remain & 0xF;
#pragma omp parallel for
for (size_t i = 0; i < loop; ++i) {
const float *local_x = x + (i << 4);
float *local_y = y + (i << 4);
float32x4_t r0 = vld1q_f32(local_x);
float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = math::vActiveq_f32<Act>(r0);
r1 = math::vActiveq_f32<Act>(r1);
r2 = math::vActiveq_f32<Act>(r2);
r3 = math::vActiveq_f32<Act>(r3);
vst1q_f32(local_y, r0);
vst1q_f32(local_y + 4, r1);
vst1q_f32(local_y + 8, r2);
vst1q_f32(local_y + 12, r3);
}
x += (loop << 4);
y += (loop << 4);
#endif
for (size_t i = 0; i < remain; ++i) {
y[i] = math::Active<Act>(x[i]);
}
}
};
template <>
bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) {
return true;
......@@ -27,7 +72,21 @@ bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) {
template <>
void ReluKernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
ReluCompute<float>(param);
const Tensor *input = param.InputX();
Tensor *output = param.Out();
ReluCompute<float, RELU>()(input, output);
}
template <>
bool Relu6Kernel<CPU, float>::Init(ReluParam<CPU> *param) {
return true;
}
template <>
void Relu6Kernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
const Tensor *input = param.InputX();
Tensor *output = param.Out();
ReluCompute<float, RELU6>()(input, output);
}
} // namespace operators
......
......@@ -11,14 +11,111 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace paddle_mobile {
namespace operators {
bool IsShuffleChannel(const std::vector<int> &axis) {
bool is_shuffle_channel = true;
if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) {
for (int i = 3; i < axis.size(); ++i) {
if (axis[i] != i) {
is_shuffle_channel = false;
break;
}
}
} else {
return false;
}
return is_shuffle_channel;
}
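// An axis permutation of the form (0, 2, 1, 3, ...) only swaps the two leading
// non-batch dimensions, which is exactly the channel-shuffle pattern, so it can be
// implemented with contiguous row copies instead of a general transpose.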
template <typename Dtype>
void ShuffleChannelCompute(const Transpose2Param<CPU> &param) {
const std::vector<int> &axis = param.Axis();
const Tensor *input = param.InputX();
const Dtype *input_ptr = input->data<Dtype>();
Tensor *output = param.Out();
Dtype *output_ptr = output->mutable_data<Dtype>();
// input and output's shape dimension must >= 2 && <= 6.
const framework::DDim &in_dim = input->dims();
const framework::DDim &out_dim = output->dims();
size_t offset = 1;
for (int i = 3; i < axis.size(); ++i) {
offset *= in_dim[i];
}
#pragma omp parallel for collapse(3)
for (int batch = 0; batch < out_dim[0]; ++batch) {
for (int c1 = 0; c1 < out_dim[1]; ++c1) {
for (int c2 = 0; c2 < out_dim[2]; ++c2) {
size_t out_offset =
((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset;
size_t in_offset = ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset;
memcpy(output_ptr + out_offset, input_ptr + in_offset,
offset * sizeof(Dtype));
}
}
}
}
template <typename Dtype>
void Transpose2Compute(const Transpose2Param<CPU> &param) {
const std::vector<int> &axis = param.Axis();
const Tensor *input = param.InputX();
const Dtype *input_ptr = input->data<Dtype>();
Tensor *output = param.Out();
Dtype *output_ptr = output->mutable_data<Dtype>();
// input and output's shape dimension must >= 2 && <= 6.
const framework::DDim &in_dim = input->dims();
const framework::DDim &out_dim = output->dims();
// precompute inverted output dim and strides
size_t rout_dim[6], strides[6];
int permute = axis.size(); // permute must >=2 && <= 6.
for (int i = 0; i < permute; ++i) {
int k = permute - 1 - i;
strides[k] = 1;
for (int j = axis[i] + 1; j < permute; ++j) {
strides[k] *= in_dim[j];
}
rout_dim[k] = out_dim[i];
}
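// strides[] (stored in reverse output-axis order) are the input strides of the
// permuted axes; the loop below walks the output linearly and keeps an
// odometer-style counter over the inner axes to track the matching input offset.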
// unroll the first 2 dimensions
int remain_dim = 1;
for (int i = 2; i < out_dim.size(); ++i) {
remain_dim *= out_dim[i];
}
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < out_dim[0]; ++batch) {
for (int j = 0; j < out_dim[1]; ++j) {
size_t offset = batch * strides[permute - 1] + j * strides[permute - 2];
Dtype *out_ptr = output_ptr + (batch * out_dim[1] + j) * remain_dim;
int indices[4] = {0, 0, 0, 0};
for (int k = 0; k < remain_dim; ++k) {
out_ptr[k] = input_ptr[offset];
indices[0] += 1;
offset += strides[0];
for (int p = 0; p < permute - 3; ++p) {
if (indices[p] == rout_dim[p]) {
indices[p + 1] += 1;
indices[p] = 0;
offset += strides[p + 1];
offset -= rout_dim[p] * strides[p];
} else {
break;
}
}
}
}
}
}
template <>
bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
return true;
......@@ -26,10 +123,24 @@ bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
template <>
void Transpose2Kernel<CPU, float>::Compute(const Transpose2Param<CPU> &param) {
Transpose2Compute<float>(param);
const std::vector<int> &axis = param.Axis();
bool shuffle_channel = IsShuffleChannel(axis);
if (shuffle_channel) {
if (param.InputX()->type() == typeid(int8_t)) {
ShuffleChannelCompute<int8_t>(param);
} else {
ShuffleChannelCompute<float>(param);
}
} else {
if (param.InputX()->type() == typeid(int8_t)) {
Transpose2Compute<int8_t>(param);
} else {
Transpose2Compute<float>(param);
}
}
}
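// Both the shuffle-channel fast path and the generic transpose are instantiated for
// int8 and float, selected from the runtime tensor type.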
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // TRANSPOSE2_OP
......@@ -25,6 +25,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
......@@ -106,9 +107,9 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), false, biase_data);
math::matmul<float, float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), false, biase_data);
}
}
}
......
......@@ -25,24 +25,18 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <typename P, typename S>
template <typename Itype, typename Otype>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
int32_t axis = param.Axis();
S *bias_data = bias.data<S>();
Otype *bias_data = bias.data<Otype>();
Tensor *output = param.Output();
output->mutable_data<P>();
output->mutable_data<Otype>();
float alpha = 1.0f;
float beta = 1.0f;
#ifdef FUSION_CONVADDRELU_INT8_OP
alpha = param.InputScale()->data<float>()[0];
beta = 0.0f;
#endif
int32_t groups = param.Groups();
std::vector<int32_t> strides = param.Strides();
std::vector<int32_t> paddings = param.Paddings();
......@@ -70,7 +64,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<P>(col_shape);
col.mutable_data<Itype>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
......@@ -89,8 +83,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, P> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col;
math::Vol2ColFunctor<CPU, Itype> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;
for (int32_t i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
......@@ -118,8 +112,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, false, col_matrix, false, alpha, &out_slice,
beta, true, bias_data);
math::matmul<Itype, Otype>(filter_slice, false, col_matrix, false, alpha,
&out_slice, beta, true, bias_data);
}
}
}
......
......@@ -106,9 +106,10 @@ inline void GemmConv(const ConvParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(0),
false, static_cast<Otype *>(nullptr));
math::matmul<Itype, Otype>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0), false,
static_cast<Otype *>(nullptr));
}
}
}
......
......@@ -93,8 +93,8 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) {
Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, true, in_slice, false, static_cast<P>(1.0),
&col_matrix, static_cast<P>(0.0));
math::matmul<P, P>(filter_slice, true, in_slice, false,
static_cast<P>(1.0), &col_matrix, static_cast<P>(0.0));
if (data_dim == 2U) {
col2im(col, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
......
......@@ -23,20 +23,15 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <typename P, typename S>
template <typename Itype, typename Otype>
void FusionFcCompute(const FusionFcParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *input_z = param.InputZ();
S *input_z_data = input_z->data<S>();
Otype *input_z_data = input_z->data<Otype>();
int axis = param.Axis();
Tensor *out = param.Out();
// int m = out->dims()[0];
// int n = out->dims()[1];
auto *out_data = out->mutable_data<P>();
float alpha = 1.0f;
float beta = 1.0f;
auto *out_data = out->mutable_data<Itype>();
const Tensor x_matrix =
input_x->dims().size() > 2
......@@ -57,28 +52,14 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
if (std::is_same<P, int8_t>::value) {
#ifdef FUSION_FC_INT8_OP
alpha = param.InputScale()->data<float>()[0];
beta = 0.0f;
math::matmul(x_matrix, false, y_matrix, false, alpha, out, beta, false,
input_z_data, true);
#endif
} else {
// bias_data has the same size as the second dimension of out
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data,
sizeof(float) * classes);
}
math::matmul<float>(x_matrix, false, y_matrix, false, alpha, out, beta,
false);
// bias_data has the same size as the second dimension of out
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data, sizeof(Otype) * classes);
}
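// With every output row pre-filled with the bias, matmul below is called with
// beta = 1 so the GEMM accumulates onto the broadcast bias.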
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
math::matmul<Itype, Otype>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out, static_cast<float>(1),
false);
}
} // namespace operators
......
......@@ -73,14 +73,14 @@ void MulCompute(const MulParam<CPU> &param) {
}
if (param.InputX()->type() == typeid(int8_t)) {
out->mutable_data<int32_t>();
math::matmul<float, int32_t>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out,
static_cast<float>(0));
math::matmul<int8_t, int32_t>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out,
static_cast<float>(0));
} else {
out->mutable_data<float>();
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0));
math::matmul<float, float>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out,
static_cast<float>(0));
}
if (out_dim.size() != 2) {
out->Resize(out_dim);
......
......@@ -17,103 +17,53 @@ limitations under the License. */
#include <string>
#include <vector>
#include "common/types.h"
#include "operators/math/pooling.h"
namespace paddle_mobile {
namespace operators {
using framework::Tensor;
template <typename T, typename S>
void PoolBasic(std::string pooling_type, std::vector<int> ksize,
std::vector<int> strides, std::vector<int> paddings,
const Tensor *in_x, Tensor *out) {
if (pooling_type == "max") {
math::PoolFunctor<CPU, math::MaxPool<T>, T> pool2d_forward;
math::MaxPool<T> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
} else if (pooling_type == "avg") {
math::PoolFunctor<CPU, math::AvgPool<T, S>, T> pool2d_forward;
math::AvgPool<T, S> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
}
}
template <typename P>
void PoolCompute(const PoolParam<CPU> &param) {
const Tensor *in_x = param.Input();
Tensor *out = param.Output();
std::string pooling_type = param.PoolingType();
const framework::Tensor *input = param.Input();
framework::Tensor *output = param.Output();
const std::string &pooling_type = param.PoolingType();
std::vector<int> ksize = param.Ksize();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
if (ksize.size() != 2) {
LOG(paddle_mobile::LogLevel::kLOG_ERROR)
<< "Pool op only supports 2D and 3D input.";
}
if (param.isGlobalPooling()) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
ksize[i] = static_cast<int>(input->dims()[i + 2]);
}
}
if (in_x->type() == typeid(int8_t)) {
if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) {
if (strides[0] == strides[1] && strides[0] == 1) {
math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]);
} else if (strides[0] == strides[1] && strides[0] == 2) {
math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]);
if (ksize[0] == 3 && ksize[0] == ksize[1]) {
if (pooling_type == "max" && strides[0] == strides[1]) {
if (strides[0] == 1) {
math::Pooling3x3<MAX, 1>()(*input, paddings, output);
} else if (strides[0] == 2) {
math::Pooling3x3<MAX, 2>()(*input, paddings, output);
} else {
math::Pool3x3Max_int8(strides, paddings, in_x, out);
math::Pooling<MAX>()(*input, ksize, strides, paddings, output);
}
} else if (pooling_type == "avg" && strides[0] == strides[1]) {
if (strides[0] == 1) {
math::Pooling3x3<AVG, 1>()(*input, paddings, output);
} else if (strides[0] == 2) {
math::Pooling3x3<AVG, 2>()(*input, paddings, output);
} else {
math::Pooling<AVG>()(*input, ksize, strides, paddings, output);
}
} else {
PoolBasic<int8_t, int32_t>(pooling_type, ksize, strides, paddings, in_x,
out);
// Others
}
} else {
if (ksize[0] == 3 && ksize[0] == ksize[1]) {
if (pooling_type == "max") {
if (strides[0] == strides[1] && strides[0] == 1 &&
paddings[0] == paddings[1] && paddings[1] == 1) {
math::Pool3x3Maxs1p1(in_x, out);
} else {
math::Pool3x3Max(strides, paddings, in_x, out);
}
} else if (pooling_type == "avg") {
if (strides[0] == strides[1] && strides[0] == 1 &&
paddings[0] == paddings[1] && paddings[1] == 1) {
math::Pool3x3Avgs1p1(in_x, out);
} else {
math::Pool3x3Avg(strides, paddings, in_x, out);
}
}
} else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
strides[0] == strides[1] && paddings[0] == paddings[1] &&
paddings[1] == 0) {
#if __ARM_NEON
#if __aarch64__
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
out);
#else
/// todo: fix bug in Pool2x2
if (pooling_type == "max") {
math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
}
#endif
#else
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
out);
#endif // __ARM_NEON
if (pooling_type == "max") {
math::Pooling<MAX>()(*input, ksize, strides, paddings, output);
} else if (pooling_type == "avg") {
math::Pooling<AVG>()(*input, ksize, strides, paddings, output);
} else {
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
out);
// Others
}
}
}
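// Dispatch summary: 3x3 kernels with equal strides of 1 or 2 go to the specialized
// Pooling3x3<MAX/AVG, stride> kernels; all other shapes fall back to the generic
// Pooling<MAX/AVG> functor. The former int8-only branch is gone, so element-type
// handling presumably lives inside the pooling functors now.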
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include <operators/math/transform.h>
#include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
template <typename T>
struct ReluFunctor {
inline T operator()(T in) const { return in > 0 ? in : 0; }
};
/*
* @b Platform-specific implementation; param is passed in from the op layer
* */
template <typename P>
void ReluCompute(const ReluParam<CPU> &param) {
const auto *input_x = param.InputX();
auto *input_x_ptr = input_x->data<float>();
auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#if __aarch64__
if (numel > 0) {
int loop = numel >> 0x4;
int remain = numel & 0xF;
float32x4_t zero = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t r0 = vld1q_f32(input_x_ptr);
float32x4_t r1 = vld1q_f32(input_x_ptr + 4);
float32x4_t r2 = vld1q_f32(input_x_ptr + 8);
float32x4_t r3 = vld1q_f32(input_x_ptr + 12);
r0 = vmaxq_f32(r0, zero);
r1 = vmaxq_f32(r1, zero);
r2 = vmaxq_f32(r2, zero);
r3 = vmaxq_f32(r3, zero);
vst1q_f32(out_ptr, r0);
vst1q_f32(out_ptr + 4, r1);
vst1q_f32(out_ptr + 8, r2);
vst1q_f32(out_ptr + 12, r3);
input_x_ptr += 16;
out_ptr += 16;
}
for (int i = 0; i < remain; ++i) {
out_ptr[i] = (input_x_ptr[i] > 0) * input_x_ptr[i];
}
#else
if (numel > 64) {
asm volatile(
"pld [%[input_x_ptr], #0] \n\t"
"vmov.f32 q8, #0.0 \n\t"
"subs %[num], %[num], #32 \n\t"
"blt end_num_%= \n\t"
"loop_num_%=: \n\t"
"pld [%[input_x_ptr], #1024] \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"subs %[num], %[num], #32 \n\t"
"bge loop_num_%= \n\t"
"end_num_%=: \n\t"
"cmp %[num], #0 \n\t"
"bge end_%= \n\t"
"mov r6, #4 \n\t"
"mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t"
:
:
[out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
"r6");
#endif
} else {
#endif
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
}
#endif
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#pragma once
#include <vector>
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void Transpose2Compute(const Transpose2Param<CPU>& param) {
const auto* input_x = param.InputX();
const auto input_x_dims = input_x->dims();
auto* out = param.Out();
const auto axis = param.Axis();
const auto* input_x_data = input_x->data<float>();
auto* out_data = out->mutable_data<float>();
size_t ndim = axis.size();
std::vector<int> xdim(ndim);
std::vector<int> xstride(ndim);
std::vector<int> xout(ndim);
for (int i = 0; i < ndim; i++) {
int j = ndim - 1 - i;
xdim[j] = input_x_dims[axis[i]];
xstride[j] = 1;
for (int k = axis[i] + 1; k < ndim; k++) {
xstride[j] *= input_x_dims[k];
}
xout[j] = xstride[j] * xdim[j];
}
auto numel = input_x->numel();
size_t pind = 0;
std::vector<int> ind(ndim);
for (int i = 0; i < numel; i++) {
out_data[i] = input_x_data[pind];
ind[0]++;
pind += xstride[0];
for (int j = 0; j < ndim - 1; j++) {
if (ind[j] == xdim[j]) {
ind[j + 1]++;
ind[j] = 0;
pind += xstride[j + 1];
pind -= xout[j];
} else {
break;
}
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -21,23 +21,6 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
// vector<int> pos;
// template <typename T>
// void TransposeFunc(const int numel, const T* input, const vector<int> axis,
// const vector<int> old_strides, const vector<int>
// new_strides, T* output) {
// for (int i = 0; i < numel; ++i) {
// int old_idx = 0;
// int idx = i;
// for (int j = 0; j < axis.size(); ++j) {
// int order = axis[j];
// old_idx += (idx / new_strides[j]) * old_strides[order];
// idx %= new_strides[j];
// }
// output[i] = input[old_idx];
// }
// }
template <typename P>
void TransposeCompute(const TransposeParam<CPU>& param) {
const auto* input_x = param.InputX();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionDequantAddBNKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNParam<DeviceType> &param);
bool Init(FusionDequantAddBNParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -20,26 +20,37 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
#define DECLARE_KERNEL(KernelClass, KernelParam) \
template <typename DeviceType, typename T> \
class KernelClass \
: public framework::OpKernelBase<DeviceType, KernelParam<DeviceType>> { \
public: \
bool Init(KernelParam<DeviceType> *param); \
void Compute(const KernelParam<DeviceType> &param); \
};
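// DECLARE_KERNEL collapses the per-kernel boilerplate: each fused dequant kernel
// declared below differs only in its parameter type.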
#ifdef FUSION_DEQUANT_BN_OP
DECLARE_KERNEL(FusionDequantBNKernel, FusionDequantBNParam);
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <typename DeviceType, typename T>
class FusionDequantBNReluKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantBNReluParam<DeviceType>> {
public:
void Compute(const FusionDequantBNReluParam<DeviceType> &param);
bool Init(FusionDequantBNReluParam<DeviceType> *param);
};
DECLARE_KERNEL(FusionDequantBNReluKernel, FusionDequantBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_OP
DECLARE_KERNEL(FusionDequantAddBNKernel, FusionDequantAddBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <typename DeviceType, typename T>
class FusionDequantAddBNReluKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNReluParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNReluParam<DeviceType> &param);
bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
};
DECLARE_KERNEL(FusionDequantAddBNReluKernel, FusionDequantAddBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
DECLARE_KERNEL(FusionDequantAddBNQuantKernel, FusionDequantAddBNQuantParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
DECLARE_KERNEL(FusionDequantAddBNReluQuantKernel, FusionDequantAddBNQuantParam);
#endif
} // namespace operators
......
......@@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class FeedKernel
: public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -30,6 +29,15 @@ class ReluKernel
void Compute(const ReluParam<DeviceType>& param);
bool Init(ReluParam<DeviceType>* param);
};
template <typename DeviceType, typename T>
class Relu6Kernel
: public framework::OpKernelBase<DeviceType, ReluParam<DeviceType>> {
public:
void Compute(const ReluParam<DeviceType>& param);
bool Init(ReluParam<DeviceType>* param);
};
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -13,9 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <algorithm>
#include <cmath>
#include <string>
#include "common/enforce.h"
#include "common/types.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include "operators/math/math_func_neon.h"
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
......@@ -24,68 +32,92 @@ namespace math {
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
enum ActivationType {
kSigmoid,
kReLU,
kTanh,
kIdentity,
};
inline ActivationType GetActivationType(const std::string &type) {
if (type == "sigmoid") {
return ActivationType::kSigmoid;
return ActivationType::SIGMOID;
} else if (type == "relu") {
return ActivationType::kReLU;
return ActivationType::RELU;
} else if (type == "tanh") {
return ActivationType::kTanh;
return ActivationType::TANH;
} else if (type == "identity" || type == "") {
return ActivationType::kIdentity;
return ActivationType::IDENTITY;
}
PADDLE_MOBILE_THROW_EXCEPTION("Not support activation type.");
}
namespace forward {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <ActivationType Act = IDENTITY>
inline float32x4_t vActiveq_f32(const float32x4_t &x) {
return x;
}
template <typename T>
T Identity(const T a) {
return a;
template <>
inline float32x4_t vActiveq_f32<RELU>(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
return vmaxq_f32(x, __zero);
}
template <typename T>
T Relu(const T a) {
return a > static_cast<T>(0.0) ? a : static_cast<T>(0.0);
template <>
inline float32x4_t vActiveq_f32<RELU6>(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
float32x4_t __six = vdupq_n_f32(6.f);
return vminq_f32(vmaxq_f32(x, __zero), __six);
}
template <typename T>
T Sigmoid(const T a) {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
T tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<T>(1.0) / (static_cast<T>(1.0) + exp(-tmp));
template <>
inline float32x4_t vActiveq_f32<SIGMOID>(const float32x4_t &x) {
float32x4_t __one = vdupq_n_f32(1.f);
float32x4_t __x = vnegq_f32(x);
__x = exp_ps(__x);
__x = vaddq_f32(__x, __one);
float32x4_t __out = vrecpeq_f32(__x);
return vmulq_f32(vrecpsq_f32(__x, __out), __out);
}
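// vrecpeq_f32 only yields a coarse (~8-bit) reciprocal estimate, so the vrecpsq_f32
// multiply above performs one Newton-Raphson refinement of 1 / (1 + e^-x); the tanh
// specialization below reuses the same trick.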
template <typename T>
T Tanh(const T a) {
T tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
template <>
inline float32x4_t vActiveq_f32<TANH>(const float32x4_t &x) {
float32x4_t __one = vdupq_n_f32(1.f);
float32x4_t __x = vnegq_f32(x);
__x = vmulq_n_f32(__x, 2.f);
__x = exp_ps(__x);
__x = vaddq_f32(__x, __one);
float32x4_t __out = vrecpeq_f32(__x);
__out = vmulq_f32(vrecpsq_f32(__x, __out), __out);
__out = vmulq_n_f32(__out, 2.f);
return vsubq_f32(__out, __one);
}
#endif
} // namespace forward
template <ActivationType Act = IDENTITY>
inline float Active(const float &x) {
return x;
}
template <typename T>
struct Active {
typedef T (*Act)(T);
};
template <>
inline float Active<RELU>(const float &x) {
return std::max(x, 0.f);
}
static Active<float>::Act kActFloat[] = {
&forward::Sigmoid<float>, &forward::Relu<float>, &forward::Tanh<float>,
&forward::Identity<float>};
template <>
inline float Active<RELU6>(const float &x) {
return std::min(std::max(x, 0.f), 6.f);
}
namespace forward {
inline float activation(float a, int index) { return kActFloat[index](a); }
template <>
inline float Active<SIGMOID>(const float &x) {
// float tmp = x > SIGMOID_THRESHOLD_MAX ? SIGMOID_THRESHOLD_MAX : x;
// tmp = x > SIGMOID_THRESHOLD_MIN ? x : SIGMOID_THRESHOLD_MIN;
// return 1.f / (1.f + exp(-tmp));
return 1.f / (1.f + exp(-x));
}
} // namespace forward
template <>
inline float Active<TANH>(const float &x) {
// float tmp = -2.f * x;
// tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
// return (2.f / (1.f + exp(tmp))) - 1.f;
return 2.f / (1.f + exp(-2.f * x)) - 1.f;
}
} // namespace math
} // namespace operators
......
......@@ -105,16 +105,15 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
/*
// vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float
*C, int ldc, bool relu, float *new_scale, float *new_bias);
*/
/*
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float
*C, int ldc, bool relu, float *new_scale, float *new_bias);
*/
// compute a smaller block of the C matrix
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
......@@ -149,7 +148,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1);
/*
// write back the vector-matrix multiplication result
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
......@@ -159,13 +157,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
*/
/*
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float
*new_scale, float *new_bias);
*/
// 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
......@@ -392,7 +391,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__
// TODO()
// TODO(paddle mobile)
#else
PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8);
#endif
......@@ -414,7 +413,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__
// TODO()
// TODO(paddle mobile)
#else
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8);
#endif
......@@ -438,7 +437,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
// TODO(paddle mobile)
#else
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif
......@@ -468,7 +467,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
// TODO(paddle mobile)
#else
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B);
#endif
......
......@@ -11,13 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRU_OP
#include "operators/math/gru_compute.h"
#include "common/types.h"
#include "operators/math/activation_functions.h"
#include "operators/math/activation.h"
#include "operators/math/gemm.h"
#include "operators/math/gru_cpu_kernel.h"
#include "operators/math/gru_kernel.h"
namespace paddle_mobile {
namespace operators {
......@@ -43,8 +44,7 @@ struct GRUUnitFunctor<CPU, T> {
#endif
}
forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size,
batch_size, active_gate);
forward_reset_output(value, frame_size, batch_size, active_gate);
if (value.prev_out_value) {
#ifdef _OPENMP
......@@ -60,8 +60,7 @@ struct GRUUnitFunctor<CPU, T> {
#endif
}
forward_final_output(forward::gru_finalOutput<T>(), value, frame_size,
batch_size, active_node);
forward_final_output(value, frame_size, batch_size, active_node);
}
};
......
......@@ -11,7 +11,7 @@ limitations under the License. */
#ifdef GRU_OP
#pragma once
#include "operators/math/activation_functions.h"
#include "operators/math/activation.h"
namespace paddle_mobile {
namespace operators {
......
......@@ -11,21 +11,22 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRU_OP
#pragma once
#include <type_traits>
#include "operators/math/activation_functions.h"
#include "operators/math/activation.h"
#include "operators/math/gru_compute.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <class OpResetOutput, typename T>
void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
T *gate_value, T *reset_output_value,
T *prev_output_value, int frame_size,
ActivationType active_gate) {
template <typename T, ActivationType Act>
void hl_naive_gru_forward_reset_output(T *gate_value, T *reset_output_value,
T *prev_output_value, int frame_size) {
T r_value_update_gate;
T r_value_reset_gate;
T r_value_reset_output;
......@@ -33,27 +34,57 @@ void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
T *update_gate = gate_value;
T *reset_gate = gate_value + frame_size;
for (int i = 0; i < frame_size; i++) {
int remain = frame_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain >> 3;
remain = remain & 0x7;
float32x4_t prev0 = vdupq_n_f32(0.f);
float32x4_t prev1 = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t update0 = vld1q_f32(update_gate);
float32x4_t update1 = vld1q_f32(update_gate + 4);
float32x4_t reset0 = vld1q_f32(reset_gate);
float32x4_t reset1 = vld1q_f32(reset_gate + 4);
if (prev_output_value) {
prev0 = vld1q_f32(prev_output_value);
prev1 = vld1q_f32(prev_output_value + 4);
prev_output_value += 8;
}
update0 = vActiveq_f32<Act>(update0);
update1 = vActiveq_f32<Act>(update1);
reset0 = vActiveq_f32<Act>(reset0);
reset1 = vActiveq_f32<Act>(reset1);
float32x4_t output0 = vmulq_f32(prev0, reset0);
float32x4_t output1 = vmulq_f32(prev1, reset1);
vst1q_f32(update_gate, update0);
vst1q_f32(update_gate + 4, update1);
vst1q_f32(reset_gate, reset0);
vst1q_f32(reset_gate + 4, reset1);
vst1q_f32(reset_output_value, output0);
vst1q_f32(reset_output_value + 4, output1);
update_gate += 8;
reset_gate += 8;
reset_output_value += 8;
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; i++) {
r_value_update_gate = update_gate[i];
r_value_reset_gate = reset_gate[i];
if (prev_output_value) {
r_prev_out = prev_output_value[i];
}
op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
&r_value_reset_output, active_gate);
r_value_update_gate = Active<Act>(r_value_update_gate);
r_value_reset_gate = Active<Act>(r_value_reset_gate);
r_value_reset_output = r_prev_out * r_value_reset_gate;
update_gate[i] = r_value_update_gate;
reset_gate[i] = r_value_reset_gate;
reset_output_value[i] = r_value_reset_output;
}
}
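// The final-output step below computes h_t = (1 - u) * h_prev + u * act(candidate);
// the NEON path does this with vmlsq/vmlaq over 8 lanes per iteration.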
template <class OpFinalOutput, typename T>
void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
T *gate_value, T *prev_output_value,
T *output_value, int frame_size,
ActivationType active_node) {
template <typename T, ActivationType Act>
void hl_naive_gru_forward_final_output(T *gate_value, T *prev_output_value,
T *output_value, int frame_size) {
T r_value_update_gate;
T r_value_frame_state;
T r_prev_out = 0;
......@@ -61,30 +92,73 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
T *update_gate = gate_value;
T *frame_state = gate_value + frame_size * 2;
for (int i = 0; i < frame_size; i++) {
int remain = frame_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain >> 3;
remain = remain & 0x7;
float32x4_t prev0 = vdupq_n_f32(0.f);
float32x4_t prev1 = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t update0 = vld1q_f32(update_gate);
float32x4_t update1 = vld1q_f32(update_gate + 4);
float32x4_t state0 = vld1q_f32(frame_state);
float32x4_t state1 = vld1q_f32(frame_state + 4);
if (prev_output_value) {
prev0 = vld1q_f32(prev_output_value);
prev1 = vld1q_f32(prev_output_value + 4);
prev_output_value += 8;
}
state0 = vActiveq_f32<Act>(state0);
state1 = vActiveq_f32<Act>(state1);
float32x4_t output0 = vmlsq_f32(prev0, update0, prev0);
float32x4_t output1 = vmlsq_f32(prev1, update1, prev1);
output0 = vmlaq_f32(output0, update0, state0);
output1 = vmlaq_f32(output1, update1, state1);
vst1q_f32(frame_state, state0);
vst1q_f32(frame_state + 4, state1);
vst1q_f32(output_value, output0);
vst1q_f32(output_value + 4, output1);
update_gate += 8;
frame_state += 8;
output_value += 8;
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; i++) {
r_value_update_gate = update_gate[i];
r_value_frame_state = frame_state[i];
if (prev_output_value) {
r_prev_out = prev_output_value[i];
}
op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
&r_output, active_node);
r_value_frame_state = Active<Act>(r_value_frame_state);
r_output = r_prev_out - r_value_update_gate * r_prev_out +
r_value_update_gate * r_value_frame_state;
frame_state[i] = r_value_frame_state;
output_value[i] = r_output;
}
}
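// The macros below bind the runtime ActivationType to the compile-time template
// parameter, so the chosen activation is inlined into the vectorized loops.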
template <class OpResetOutput, typename T>
inline void forward_reset_output(OpResetOutput op_reset_output,
GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_gate) {
for (int b = 0; b < batch_size; b++) {
hl_naive_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate);
#define FORWARD_RESET_OUTPUT(active_type, value, frame_size) \
hl_naive_gru_forward_reset_output<float, active_type>( \
value.gate_value, value.reset_output_value, value.prev_out_value, \
frame_size);
template <typename T>
inline void forward_reset_output(GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; ++b) {
switch (active_node) {
case RELU:
FORWARD_RESET_OUTPUT(RELU, value, frame_size);
break;
case SIGMOID:
FORWARD_RESET_OUTPUT(SIGMOID, value, frame_size);
break;
case TANH:
FORWARD_RESET_OUTPUT(TANH, value, frame_size);
break;
default:
FORWARD_RESET_OUTPUT(IDENTITY, value, frame_size);
}
value.gate_value += frame_size * 3;
value.reset_output_value += frame_size;
if (value.prev_out_value) {
......@@ -93,15 +167,27 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
}
}
template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput op_final_output,
GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; b++) {
hl_naive_gru_forward_final_output(op_final_output, value.gate_value,
value.prev_out_value, value.output_value,
frame_size, active_node);
#define FORWARD_FINAL_OUTPUT(active_type, value, frame_size) \
hl_naive_gru_forward_final_output<float, active_type>( \
value.gate_value, value.prev_out_value, value.output_value, frame_size)
template <typename T>
inline void forward_final_output(GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; ++b) {
switch (active_node) {
case RELU:
FORWARD_FINAL_OUTPUT(RELU, value, frame_size);
break;
case SIGMOID:
FORWARD_FINAL_OUTPUT(SIGMOID, value, frame_size);
break;
case TANH:
FORWARD_FINAL_OUTPUT(TANH, value, frame_size);
break;
default:
FORWARD_FINAL_OUTPUT(IDENTITY, value, frame_size);
}
value.gate_value += frame_size * 3;
value.output_value += frame_size;
if (value.prev_out_value) {
......@@ -113,4 +199,5 @@ inline void forward_final_output(OpFinalOutput op_final_output,
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRU_OP
#pragma once
#include <type_traits>
#include "operators/math/activation_functions.h"
namespace paddle_mobile {
namespace operators {
namespace math {
namespace forward {
template <typename T>
class gru_resetOutput {
public:
void operator()(T *value_update_gate, T *value_reset_gate, T *prev_out,
T *value_reset_output, ActivationType act_gate) {
*value_update_gate = activation(*value_update_gate, act_gate);
*value_reset_gate = activation(*value_reset_gate, act_gate);
*value_reset_output = (*prev_out) * (*value_reset_gate);
}
};
template <typename T>
class gru_finalOutput {
public:
void operator()(T *value_update_gate, T *value_frame_state, T *prev_out,
T *value_output, ActivationType act_input) {
*value_frame_state = activation(*value_frame_state, act_input);
*value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
((*value_update_gate) * (*value_frame_state));
}
};
} // namespace forward
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -41,10 +41,10 @@ void set_constant(framework::Tensor *tensor, float value) {
}
template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu,
float *bias) {
void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out,
float beta, bool relu, float *bias) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......
......@@ -24,24 +24,24 @@ namespace math {
void set_constant(framework::Tensor *tensor, float value);
template <typename T>
template <typename Itype, typename Otype>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
float *bias = nullptr);
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu = false,
Otype *bias = nullptr);
template <typename T, typename S>
template <typename Itype, typename Otype>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
S *bias = nullptr, bool addOnRow = false);
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu, Otype *bias,
bool addOnRow);
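// Itype/Otype name the element types of the input matrices and of the bias/output
// accumulator, e.g. matmul<float, float> and matmul<int8_t, int32_t> as used by the
// kernels above, replacing the old single-type template parameter.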
template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias,
int group, float *bias = nullptr);
int group, T *bias = nullptr);
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
......
......@@ -22,10 +22,11 @@ namespace operators {
namespace math {
template <>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu, int32_t *bias,
bool addOnRow) {
void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out,
float beta, bool relu, int32_t *bias,
bool addOnRow) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -93,6 +94,16 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
#endif
}
}
template <>
void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out,
float beta, bool relu, int32_t *bias) {
matmul<int8_t, int32_t>(matrix_a, trans_a, matrix_b, trans_b, alpha,
matrix_out, beta, relu, bias, false);
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "operators/math/pool_2x2.h"
#include <algorithm>
#include <vector>
namespace paddle_mobile {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int w1 = input_width / 16;
int _w1 = input_width % 16;
int w2 = _w1 / 4;
int _w2 = _w1 % 4;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < input_height; ph += 2) {
const float *in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width;
const float *in_ptr2 = in_ptr1 + input_width;
if (ph != input_height && ph + 1 >= input_height) {
in_ptr2 = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * input_width));
memset(static_cast<void *>(const_cast<float *>(in_ptr2)), -FLT_MAX,
sizeof(float) * input_width);
}
float *out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width;
#if __ARM_NEON
#if __aarch64__
#else
asm volatile(
"subs %[w1], %[w1], #1 \n\t"
"blt end_w1_%= \n\t"
"loop_w1_%=: \n\t"
"pld [%[in_ptr1], #64] \n\t"
"pld [%[in_ptr2], #64] \n\t"
"vld1.f32 {q0, q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2, q3}, [%[in_ptr2]]! \n\t"
"vld1.f32 {q6, q7}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q8, q9}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q2 \n\t"
"vmax.f32 q1, q1, q3 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q9 \n\t"
"vpmax.f32 d8, d0, d1 \n\t"
"vpmax.f32 d9, d2, d3 \n\t"
"vpmax.f32 d10, d12, d13 \n\t"
"vpmax.f32 d11, d14, d15 \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"subs %[w1], %[w1], #1 \n\t"
"bge loop_w1_%= \n\t"
"end_w1_%=: \n\t"
"subs %[w2], %[w2], #1 \n\t"
"blt end_w2_%= \n\t"
"loop_w2_%=: \n\t"
"vld1.f32 {q0}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q1}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q1 \n\t"
"vpmax.f32 d4, d0, d1 \n\t"
"vst1.32 {d4}, [%[out_ptr]]! \n\t"
"subs %[w2], %[w2], #1 \n\t"
"bge loop_w2_%= \n\t"
"end_w2_%=: \n\t"
:
: [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
[in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9");
#endif
#endif
if (_w2 != 0) {
in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width + 16 * w1 +
4 * w2;
in_ptr2 = in_ptr1 + input_width;
out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
2 * w2;
if (_w2 == 1) {
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
} else if (_w2 == 2) {
float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
*out_ptr = (temp > temp1) ? temp : temp1;
} else if (_w2 == 3) {
float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
*out_ptr = (temp > temp1) ? temp : temp1;
out_ptr++;
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
}
}
}
}
}
}
void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int w1 = input_width / 16;
int _w1 = input_width % 16;
int w2 = _w1 / 4;
int _w2 = _w1 % 4;
float quarter = 0.25;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < input_height; ph += 2) {
const float *in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width;
const float *in_ptr2 = in_ptr1 + input_width;
if (ph + 1 >= input_height) {
in_ptr2 = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * input_width));
memset(static_cast<void *>(const_cast<float *>(in_ptr2)), 0,
sizeof(float) * input_width);
}
float *out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width;
#if __ARM_NEON
#if __aarch64__
#else
asm volatile(
"subs %[w1], %[w1], #1 \n\t"
"blt end_w1_%= \n\t"
"loop_w1_%=: \n\t"
"pld [%[in_ptr1], #64] \n\t"
"pld [%[in_ptr2], #64] \n\t"
"vmov.f32 d0[0], %[quarter] \n\t"
"vld1.f32 {q1, q2}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q3, q4}, [%[in_ptr2]]! \n\t"
"vld1.f32 {q7, q8}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q9, q10}, [%[in_ptr2]]! \n\t"
"vadd.f32 q1, q1, q3 \n\t"
"vadd.f32 q2, q2, q4 \n\t"
"vadd.f32 q7, q7, q9 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"vpadd.f32 d10, d2, d3 \n\t"
"vpadd.f32 d11, d4, d5 \n\t"
"vpadd.f32 d12, d14, d15 \n\t"
"vpadd.f32 d13, d16, d17 \n\t"
"vmul.f32 q5, q5, d0[0] \n\t"
"vmul.f32 q6, q6, d0[0] \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"subs %[w1], %[w1], #1 \n\t"
"bge loop_w1_%= \n\t"
"end_w1_%=: \n\t"
"subs %[w2], %[w2], #1 \n\t"
"blt end_w2_%= \n\t"
"loop_w2_%=: \n\t"
"vld1.f32 {q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2}, [%[in_ptr2]]! \n\t"
"vadd.f32 q1, q1, q2 \n\t"
"vpadd.f32 d4, d2, d3 \n\t"
"vmul.f32 d4, d4, d0[0] \n\t"
"vst1.32 {d4}, [%[out_ptr]]! \n\t"
"subs %[w2], %[w2], #1 \n\t"
"bge loop_w2_%= \n\t"
"end_w2_%=: \n\t"
:
: [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
[in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr),
[quarter] "r"(quarter)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10");
#endif
#endif
if (_w2 != 0) {
in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width + 16 * w1 +
4 * w2;
in_ptr2 = in_ptr1 + input_width;
out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
2 * w2;
if (_w2 == 1) {
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
} else if (_w2 == 2) {
float temp = 0;
temp += *in_ptr1;
temp += *in_ptr2;
in_ptr1++;
in_ptr2++;
temp += *in_ptr1;
temp += *in_ptr2;
*out_ptr = 0.25 * temp;
} else if (_w2 == 3) {
float temp = 0;
temp += *in_ptr1++;
temp += *in_ptr2++;
temp += *in_ptr1++;
temp += *in_ptr2++;
*out_ptr = 0.25 * temp;
out_ptr++;
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
}
}
}
}
}
}
//}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "framework/tensor.h"
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::vector;
void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output);
void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
const Tensor *in_x, Tensor *out);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -22,10 +22,7 @@ namespace operators {
template <typename DeviceType, typename T>
void QuantizeOp<DeviceType, T>::InferShape() const {
auto input_dims = this->param_.input_->dims();
const std::vector<int> &paddings = this->param_.paddings_;
input_dims[2] += 2 * paddings[0];
input_dims[3] += 2 * paddings[1];
const auto &input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
auto scale_dims = framework::make_ddim(std::vector<int>{1});
this->param_.online_scale_->Resize(scale_dims);
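// The output shape now simply mirrors the input dims: the quantize op no longer
// folds conv paddings into its output shape (presumably the padding is applied
// later by the convolution itself; inferred from the removed lines above).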
......
......@@ -324,10 +324,6 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-relu-int8-op operators/test_fusion_conv_add_relu_int8_op.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-add-relu-int8-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
......