Commit 982531ef authored by Ray Liu, committed by GitHub

Merge pull request #1344 from hjchen2/dev-latest

Refactor pooling and dequant fusion implementation, fix some code style
...@@ -16,9 +16,9 @@ limitations under the License. */
#ifdef ENABLE_EXCEPTION
#include <stdio.h>
#include <stdlib.h>
#include <exception>
#include <string>
#endif
namespace paddle_mobile {
......
...@@ -24,7 +24,6 @@ const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8 = "fusion_conv_add_relu_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
...@@ -32,7 +31,6 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
const char *G_OP_TYPE_FC_INT8 = "fusion_fc_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
...@@ -41,6 +39,7 @@ const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform";
const char *G_OP_TYPE_POOL2D = "pool2d";
const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
const char *G_OP_TYPE_RELU = "relu";
const char *G_OP_TYPE_RELU6 = "relu6";
const char *G_OP_TYPE_RESHAPE = "reshape";
const char *G_OP_TYPE_RESHAPE2 = "reshape2";
const char *G_OP_TYPE_SIGMOID = "sigmoid";
...@@ -73,9 +72,14 @@ const char *G_OP_TYPE_SUM = "sum";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
const char *G_OP_TYPE_FUSION_DEQUANT_BN = "fusion_dequant_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN = "fusion_dequant_add_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU = "fusion_dequant_bn_relu";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT =
"fusion_dequant_add_bn_quant";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT =
"fusion_dequant_add_bn_relu_quant";
const char *G_OP_TYPE_TANH = "tanh"; const char *G_OP_TYPE_TANH = "tanh";
const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu"; const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
...@@ -91,6 +95,7 @@ std::unordered_map< ...@@ -91,6 +95,7 @@ std::unordered_map<
{G_OP_TYPE_PRELU, {{"X", "Alpha"}, {"Out"}}}, {G_OP_TYPE_PRELU, {{"X", "Alpha"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_RELU6, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}}, {G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, {G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
...@@ -112,13 +117,11 @@ std::unordered_map< ...@@ -112,13 +117,11 @@ std::unordered_map<
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}}, {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_FC_INT8, {{"X", "Y", "Z", "Scale"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}}, {G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}}, {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}, {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
...@@ -142,9 +145,14 @@ std::unordered_map< ...@@ -142,9 +145,14 @@ std::unordered_map<
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}}, {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}, {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Y"}}}, {G_OP_TYPE_FUSION_DEQUANT_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}}, {G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}}, {G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT,
{{"X", "Scale"}, {"Out", "OutScale"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT,
{{"X", "Scale"}, {"Out", "OutScale"}}},
{G_OP_TYPE_TANH, {{"X"}, {"Out"}}}, {G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_ADD, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_DECONV_ADD, {{"Input"}, {"Out"}}},
......
...@@ -87,10 +87,24 @@ enum PMStatus {
};
enum RoundType {
ROUND_UNK = 0,
ROUND_NEAREST_AWAY_ZERO = 1,
ROUND_NEAREST_TOWARDS_ZERO = 2,
ROUND_NEAREST_TO_EVEN = 3
};
enum RoundType {
ROUND_NEAREST_AWAY_ZERO = 0,
ROUND_NEAREST_TOWARDS_ZERO = 1,
ROUND_NEAREST_TO_EVEN = 2
};
enum ActivationType {
IDENTITY = 0,
RELU = 1,
RELU6 = 2,
PRELU = 3,
LEAKY_RELU = 4,
TANH = 5,
SIGMOID = 6,
};
enum PoolingType {
MAX = 0,
AVG = 1,
};
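The renumbered RoundType enum drops the old ROUND_UNK entry, and the quantization kernels later in this diff dispatch on these three modes. For reference only, the following minimal sketch (not part of the patch; round_scalar is a hypothetical helper that assumes the RoundType enum above) shows the scalar rounding behaviour each mode corresponds to:

// Illustrative sketch only: scalar behaviour of the three RoundType modes.
// round_scalar is a hypothetical helper, not code from this patch.
#include <cmath>
#include <cstdint>

inline int32_t round_scalar(float x, RoundType r) {
  switch (r) {
    case ROUND_NEAREST_AWAY_ZERO:     // 2.5 -> 3, -2.5 -> -3
      return static_cast<int32_t>(x > 0.f ? x + 0.5f : x - 0.5f);
    case ROUND_NEAREST_TOWARDS_ZERO:  // 2.5 -> 2, -2.5 -> -2 (plain cast)
      return static_cast<int32_t>(x);
    case ROUND_NEAREST_TO_EVEN:       // 2.5 -> 2, 3.5 -> 4, assuming the
      return static_cast<int32_t>(std::nearbyint(x));  // default FE_TONEAREST mode
    default:
      return static_cast<int32_t>(x);
  }
}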
extern const char *G_OP_TYPE_CONV;
...@@ -99,11 +113,9 @@ extern const char *G_OP_TYPE_BOX_CODER;
extern const char *G_OP_TYPE_CONCAT;
extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FC_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;
...@@ -116,6 +128,7 @@ extern const char *G_OP_TYPE_MULTICLASS_NMS;
extern const char *G_OP_TYPE_POOL2D;
extern const char *G_OP_TYPE_PRIOR_BOX;
extern const char *G_OP_TYPE_RELU;
extern const char *G_OP_TYPE_RELU6;
extern const char *G_OP_TYPE_RESHAPE;
extern const char *G_OP_TYPE_SIGMOID;
extern const char *G_OP_TYPE_SOFTMAX;
...@@ -140,9 +153,12 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT;
extern const char *G_OP_TYPE_TANH;
extern const char *G_OP_TYPE_FUSION_DECONV_RELU;
......
...@@ -302,7 +302,15 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost;
if (ops[i]->Type() == "conv2d") {
auto inputs = ops[i]->Inputs();
auto *filter = framework::GetVarValue<framework::LoDTensor>(
"Filter", inputs, *(program_.scope));
int kernel_size = filter->dims()[2];
_tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
} else {
_tp[ops[i]->Type()] += timeCost;
}
}
printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>;
...@@ -372,6 +380,14 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
if (ops[i]->Type() == "conv2d") {
auto inputs = ops[i]->Inputs();
auto input_keys = ops[i]->GetInputKeys();
auto *filter = framework::GetVarValue<framework::LoDTensor>(
input_keys[1], inputs, *(program_.scope));
int kernel_size = filter->dims()[2];
printf("kernel size: %d\n", kernel_size);
}
_tp[ops[i]->Type()] += timeCost;
}
printf("====================[ profile ]======================\n");
......
...@@ -191,6 +191,7 @@ LOAD_OP2(mul, CPU, MALI_GPU);
#endif
#ifdef RELU_OP
LOAD_OP2(relu, CPU, MALI_GPU);
LOAD_OP1(relu6, CPU);
#endif
#ifdef IM2SEQUENCE_OP
LOAD_OP1(im2sequence, CPU);
...@@ -233,6 +234,10 @@ LOAD_OP1(quantize, CPU);
#ifdef DEQUANT_OP
LOAD_OP1(dequantize, CPU);
#endif
#ifdef FUSION_DEQUANT_BN_OP
LOAD_OP1(fusion_dequant_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_bn);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_OP
LOAD_OP1(fusion_dequant_add_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn);
...@@ -245,3 +250,11 @@ LOAD_FUSION_MATCHER(fusion_dequant_bn_relu);
LOAD_OP1(fusion_dequant_add_bn_relu, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
LOAD_OP1(fusion_dequant_add_bn_quant, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_quant);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
LOAD_OP1(fusion_dequant_add_bn_relu_quant, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu_quant);
#endif
...@@ -98,24 +98,6 @@ class OpRegistry {
}
};
#define REGISTER_OPERATOR_INT8(op_type, op_class, device_name, device_type) \
template class op_class<device_type, int8_t>; \
template <typename Dtype, typename T> \
class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \
public: \
DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class); \
}; \
static paddle_mobile::framework::OperatorRegistrar< \
device_type, _OpClass_##op_type##_##device_name<device_type, int8_t>> \
__op_registrar_##op_type##_##device_name(#op_type); \
int TouchOpRegistrar_##op_type##_##device_name() { \
__op_registrar_##op_type##_##device_name.Touch(); \
return 0; \
}
#define REGISTER_OPERATOR_CPU_INT8(op_type, op_class) \
REGISTER_OPERATOR_INT8(op_type, op_class, cpu, paddle_mobile::CPU);
#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \
template class op_class<device_type, float>; \
template <typename Dtype, typename T> \
......
...@@ -220,7 +220,16 @@ void Node::Folder(
}
} else {
for (auto &op_output : this->op_desc_->outputs_) {
op_desc->outputs_.emplace(op_output.first, op_output.second);
auto output_key = op_output.first;
if (change->find(this->type_) != change->end()) {
const auto change_pairs = (*change)[this->type_];
for (const auto &target : change_pairs) {
if (target.first == output_key) {
output_key = target.second;
}
}
}
op_desc->outputs_.emplace(output_key, op_output.second);
}
for (auto &output : this->outputs_) {
......
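The Node::Folder change above now passes each output key of the folded node through the same `change` map that already renames input keys, so a fused op can expose, for example, batch norm's "Y" output as its own "Out". A minimal standalone sketch of that lookup (the names and the ChangeMap alias are illustrative, not the framework's actual types):

// Illustrative only: how the added lookup renames an output key when the
// folded node's op type has an entry in the change map (e.g. {"Y", "Out"}).
#include <map>
#include <string>
#include <utility>
#include <vector>

using ChangeMap =
    std::map<std::string, std::vector<std::pair<std::string, std::string>>>;

std::string RenameOutputKey(const std::string &node_type,
                            const std::string &output_key,
                            const ChangeMap &change) {
  auto it = change.find(node_type);
  if (it == change.end()) return output_key;  // no remapping for this op type
  for (const auto &target : it->second) {
    if (target.first == output_key) return target.second;
  }
  return output_key;
}
// Example: RenameOutputKey("batch_norm", "Y", {{"batch_norm", {{"Y", "Out"}}}})
// returns "Out".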
...@@ -143,12 +143,10 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
int t1 = 1;
int t2 = 1;
for (int i = 0; i < m * k; ++i) {
unsigned int seed = 100;
a[i] = t1 + rand_r(&seed) % t2;
a[i] = t1 + rand() % t2;  // NOLINT
}
for (int i = 0; i < k * n; ++i) {
unsigned int seed = 200;
b[i] = t1 + rand_r(&seed) % t2;
b[i] = t1 + rand() % t2;  // NOLINT
}
paddle_mobile::operators::math::Gemm gemm;
auto time1 = paddle_mobile::time();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include "operators/fusion_conv_add_relu_int8_op.h"
#include <vector>
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddReluInt8Op<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_conv_add_relu_int8,
ops::FusionConvAddReluInt8Op);
#endif
#endif // FUSION_CONVADDRELU_INT8_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionConvAddReluInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>> {
public:
FusionConvAddReluInt8Op(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDRELU_INT8_OP
...@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_add_bn_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
...@@ -43,7 +43,8 @@ class FusionDequantAddBNMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
...@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
...@@ -44,7 +44,8 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
...@@ -54,7 +55,7 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
template <typename DeviceType, typename T>
class FusionDequantAddBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>> {
public:
FusionDequantAddBNReluOp(const std::string &type,
...@@ -63,7 +64,7 @@ class FusionDequantAddBNReluOp
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
......
...@@ -12,50 +12,51 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionFcInt8Op<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
auto y_dims = this->param_.InputY()->dims();
int x_num_col_dims = this->param_.XNumColDims();
int y_num_col_dims = this->param_.YNumColDims();
assert(x_dims.size() > x_num_col_dims);
assert(y_dims.size() > y_num_col_dims);
/// (1,2,3,4) , x_num_col_dims = 2 -> (2,12)
auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
assert(x_mat_dims[1] == y_mat_dims[0]);
std::vector<int64_t> output_dims;
output_dims.reserve(
static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
for (int i = 0; i < x_num_col_dims; ++i) {
output_dims.push_back(x_dims[i]);
}
for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
output_dims.push_back(y_dims[i]);
}
framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op);
#endif
#endif  // FUSION_FC_INT8_OP
#include "operators/fusion_dequant_add_bn_relu_quant_op.h"
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionDequantAddBNReluQuantOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_relu_quant,
ops::FusionDequantAddBNReluQuantMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_relu_quant,
ops::FusionDequantAddBNReluQuantOp);
#endif
#endif  // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionDequantAddBNQuantOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_quant,
ops::FusionDequantAddBNQuantMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_quant,
ops::FusionDequantAddBNQuantOp);
#endif
#endif  // FUSION_DEQUANT_ADD_BN_QUANT_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
class FusionDequantAddBNReluQuantMatcher : public framework::FusionOpMatcher {
public:
FusionDequantAddBNReluQuantMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU) >
std::make_shared<framework::Node>(G_OP_TYPE_QUANTIZE);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT; }
};
template <typename DeviceType, typename T>
class FusionDequantAddBNReluQuantOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>,
operators::FusionDequantAddBNReluQuantKernel<DeviceType, T>> {
public:
FusionDequantAddBNReluQuantOp(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>,
operators::FusionDequantAddBNReluQuantKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
class FusionDequantAddBNQuantMatcher : public framework::FusionOpMatcher {
public:
FusionDequantAddBNQuantMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_QUANTIZE);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT; }
};
template <typename DeviceType, typename T>
class FusionDequantAddBNQuantOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNQuantParam<DeviceType>,
operators::FusionDequantAddBNQuantKernel<DeviceType, T>> {
public:
FusionDequantAddBNQuantOp(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNQuantParam<DeviceType>,
operators::FusionDequantAddBNQuantKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
} // namespace operators
} // namespace paddle_mobile
...@@ -12,28 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_BN_RELU_OP
#include "operators/fusion_dequant_bn_relu_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionDequantBNReluOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu,
ops::FusionDequantBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp);
#endif
#endif
#include "operators/fusion_dequant_bn_op.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_BN_OP
template <typename Dtype, typename T>
void FusionDequantBNOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
#endif  // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <typename Dtype, typename T>
void FusionDequantBNReluOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
#endif  // FUSION_DEQUANT_BN_RELU_OP
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef FUSION_DEQUANT_BN_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn, ops::FusionDequantBNMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn, ops::FusionDequantBNOp);
#endif  // PADDLE_MOBILE_CPU
#endif  // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu,
ops::FusionDequantBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp);
#endif  // PADDLE_MOBILE_CPU
#endif  // FUSION_DEQUANT_BN_RELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_BN_RELU_OP)
class FusionDequantBNMatcher : public framework::FusionOpMatcher {
public:
FusionDequantBNMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
}
virtual void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() override { return G_OP_TYPE_FUSION_DEQUANT_BN; }
};
#endif // FUSION_DEQUANT_BN_OP || FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_BN_OP
template <typename DeviceType, typename T>
class FusionDequantBNOp : public framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNKernel<DeviceType, T>> {
public:
FusionDequantBNOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
class FusionDequantBNReluMatcher : public FusionDequantBNMatcher {
public:
FusionDequantBNReluMatcher() : FusionDequantBNMatcher() {
node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
virtual std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionDequantBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>> {
public:
FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
...@@ -42,7 +42,8 @@ class FusionDequantBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionFcInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>> {
public:
FusionFcInt8Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_FC_INT8_OP
...@@ -32,20 +32,6 @@ void ConvAddReluKernel<CPU, float>::Compute(
}
template class ConvAddReluKernel<CPU, float>;
#ifdef FUSION_CONVADDRELU_INT8_OP
template <>
bool ConvAddReluKernel<CPU, int8_t>::Init(FusionConvAddReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddReluKernel<CPU, int8_t>::Compute(
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<int8_t, int32_t>(param);
}
template class ConvAddReluKernel<CPU, int8_t>;
#endif
}  // namespace operators
}  // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "operators/kernel/dequant_add_bn_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
// elementwise add params
const Tensor *bias = param->bias_;
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *bias_ptr = bias->data<float>();
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
}
return true;
}
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
const int32_t *input = param.input_->data<int32_t>();
const float *bn_scale = param.bn_scale_->data<float>();
const float *bn_bias = param.bn_bias_->data<float>();
// dequantize params
const float activation_scale = param.activation_scale_->data<float>()[0];
const float weight_scale = param.weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param.output_->mutable_data<float>();
int batch_size = param.input_->dims()[0];
int channels = param.input_->dims()[1];
size_t spatial_size = param.input_->dims()[2] * param.input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
float scale = bn_scale[c] * dequant_scale;
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = scale * x[k] + bias;
}
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_DEQUANT_ADD_BN_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param->output_->mutable_data<float>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
float scale = bn_scale[c] * dequant_scale;
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
float32x4_t __zero = vdupq_n_f32(0.f);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = vmaxq_f32(__zero, f0);
f1 = vmaxq_f32(__zero, f1);
f2 = vmaxq_f32(__zero, f2);
f3 = vmaxq_f32(__zero, f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = std::max(scale * x[k] + bias, 0.f);
}
}
}
}
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <>
bool FusionDequantBNReluKernel<CPU, float>::Init(
FusionDequantBNReluParam<CPU> *param) {
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = bn_bias_ptr[c] - inv_scale * mean_ptr[c];
}
return true;
}
template <>
void FusionDequantBNReluKernel<CPU, float>::Compute(
const FusionDequantBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <>
bool FusionDequantAddBNReluKernel<CPU, float>::Init(
FusionDequantAddBNReluParam<CPU> *param) {
// elementwise add params
const Tensor *bias = param->bias_;
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *bias_ptr = bias->data<float>();
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
}
return true;
}
template <>
void FusionDequantAddBNReluKernel<CPU, float>::Compute(
const FusionDequantAddBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/math/activation.h"
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
void PublicFusionDequantBNInitParam(FusionDequantBNParam<CPU> *param,
const framework::Tensor *bias) {
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = 1.f / (std::sqrt(var_ptr[c] + epsilon));
float val = bias ? bias->data<float>()[c] : 0;
bn_bias_ptr[c] =
inv_scale * bn_scale_ptr[c] * (val - mean_ptr[c]) + bn_bias_ptr[c];
bn_scale_ptr[c] = inv_scale * bn_scale_ptr[c];
}
}
#endif
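PublicFusionDequantBNInitParam folds the batch-norm statistics (and the optional elementwise-add bias) into one per-channel scale/bias pair, so the compute kernels below only apply y = bn_scale * x + bn_bias per element. A scalar restatement of that algebra, for clarity only (a sketch under the same definitions, not code from the patch):

// Illustrative only: the scalar form of the folding done in
// PublicFusionDequantBNInitParam. Struct and function names are hypothetical.
#include <cmath>

struct FoldedBN {
  float scale;  // gamma / sqrt(variance + epsilon)
  float bias;   // gamma * (elementwise_bias - mean) / sqrt(variance + epsilon) + beta
};

inline FoldedBN FoldBatchNorm(float gamma, float beta, float mean,
                              float variance, float epsilon,
                              float elementwise_bias /* 0 if absent */) {
  float inv_std = 1.f / std::sqrt(variance + epsilon);
  FoldedBN folded;
  folded.scale = gamma * inv_std;
  folded.bias = gamma * inv_std * (elementwise_bias - mean) + beta;
  return folded;
}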
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
template <ActivationType Act>
void DequantBNCompute(const FusionDequantBNParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param->output_->mutable_data<float>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
// not fuse bn and dequant scale to minimize precision difference
// float scale = bn_scale[c] * dequant_scale;
float scale = bn_scale[c];
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmulq_f32(__dequant_scale, f0);
f1 = vmulq_f32(__dequant_scale, f1);
f2 = vmulq_f32(__dequant_scale, f2);
f3 = vmulq_f32(__dequant_scale, f3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = math::vActiveq_f32<Act>(f0);
f1 = math::vActiveq_f32<Act>(f1);
f2 = math::vActiveq_f32<Act>(f2);
f3 = math::vActiveq_f32<Act>(f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
}
}
}
}
#endif
#ifdef FUSION_DEQUANT_BN_OP
template <>
bool FusionDequantBNKernel<CPU, float>::Init(FusionDequantBNParam<CPU> *param) {
PublicFusionDequantBNInitParam(param, nullptr);
return true;
}
template <>
void FusionDequantBNKernel<CPU, float>::Compute(
const FusionDequantBNParam<CPU> &param) {
DequantBNCompute<IDENTITY>(&param);
}
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <>
bool FusionDequantBNReluKernel<CPU, float>::Init(
FusionDequantBNParam<CPU> *param) {
PublicFusionDequantBNInitParam(param, nullptr);
return true;
}
template <>
void FusionDequantBNReluKernel<CPU, float>::Compute(
const FusionDequantBNParam<CPU> &param) {
DequantBNCompute<RELU>(&param);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_OP
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
DequantBNCompute<IDENTITY>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <>
bool FusionDequantAddBNReluKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNReluKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
DequantBNCompute<RELU>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_OP
#if defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
template <ActivationType Act, RoundType R>
void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
// quantize params
Tensor *output_scale = param->online_scale_;
float max_abs = 0.f;
int8_t *output = param->output_->mutable_data<int8_t>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
// if (param->is_static_) {
if (true) {
max_abs = param->static_scale_;
float quant_scale = 127.f / max_abs;
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
// not fuse bn and dequant scale to minimize precision difference
// float scale = bn_scale[c] * dequant_scale;
float scale = bn_scale[c];
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
int8_t *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
float32x4_t __quant_scale = vdupq_n_f32(quant_scale);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmulq_f32(__dequant_scale, f0);
f1 = vmulq_f32(__dequant_scale, f1);
f2 = vmulq_f32(__dequant_scale, f2);
f3 = vmulq_f32(__dequant_scale, f3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = math::vActiveq_f32<Act>(f0);
f1 = math::vActiveq_f32<Act>(f1);
f2 = math::vActiveq_f32<Act>(f2);
f3 = math::vActiveq_f32<Act>(f3);
f0 = vmulq_f32(__quant_scale, f0);
f1 = vmulq_f32(__quant_scale, f1);
f2 = vmulq_f32(__quant_scale, f2);
f3 = vmulq_f32(__quant_scale, f3);
int32x4_t q0 = math::vRoundq_f32<R>(f0);
int32x4_t q1 = math::vRoundq_f32<R>(f1);
int32x4_t q2 = math::vRoundq_f32<R>(f2);
int32x4_t q3 = math::vRoundq_f32<R>(f3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
int16x4_t d3 = vmovn_s32(q3);
int16x8_t q5 = vcombine_s16(d0, d1);
int16x8_t q6 = vcombine_s16(d2, d3);
int8x8_t d5 = vmovn_s16(q5);
int8x8_t d6 = vmovn_s16(q6);
vst1_s8(y, d5);
vst1_s8(y + 8, d6);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
float x_temp =
math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
y[k] = math::Round<R>(x_temp * quant_scale);
}
}
}
} else {
// TODO(hjchen2)
max_abs = std::max(max_abs, 1e-6f);
}
param->online_scale_->mutable_data<float>()[0] = max_abs;
}
template <>
bool FusionDequantAddBNQuantKernel<CPU, float>::Init(
FusionDequantAddBNQuantParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
}
}
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
template <>
bool FusionDequantAddBNReluQuantKernel<CPU, float>::Init(
FusionDequantAddBNQuantParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNReluQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNQuantCompute<RELU, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNQuantCompute<RELU, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNQuantCompute<RELU, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
}
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
} // namespace operators
} // namespace paddle_mobile
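
The fused kernel above collapses dequantization, the folded batch-norm affine transform, the optional activation and the re-quantization into a single pass over the int32 accumulators. A minimal scalar reference of the per-element math, handy when checking the NEON path, could look like the sketch below; the function name and the fixed round-half-away-from-zero are illustrative only, since the kernel actually dispatches on param.round_type_.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch of one element of dequant + BN + (ReLU) + requantize.
// x             : int32 accumulator from the int8 convolution
// dequant_scale : activation_scale / weight_scale, as in the kernel
// bn_scale/bias : per-channel batch-norm scale and bias
// max_abs       : static output scale, so quant_scale = 127 / max_abs
inline int8_t DequantBNRequantRef(int32_t x, float dequant_scale,
                                  float bn_scale, float bn_bias,
                                  float max_abs, bool relu) {
  float v = bn_scale * (dequant_scale * static_cast<float>(x)) + bn_bias;
  if (relu) {
    v = std::max(v, 0.f);
  }
  return static_cast<int8_t>(std::round(v * (127.f / max_abs)));
}
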
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "operators/kernel/quantize_kernel.h" #include "operators/kernel/quantize_kernel.h"
#include <cmath> #include <cmath>
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h> #include <arm_neon.h>
...@@ -32,81 +33,68 @@ inline float32_t vmaxvq_f32(float32x4_t r) { ...@@ -32,81 +33,68 @@ inline float32_t vmaxvq_f32(float32x4_t r) {
} }
#endif #endif
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO> template <RoundType R>
inline int32x4_t vround_f32(float32x4_t r) { inline void QuantizeOffline(const Tensor *input, const float scale,
return vcvtq_s32_f32(r); const float max_abs, Tensor *output) {
} const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
template <> size_t remain = input->numel();
inline int32x4_t vround_f32<ROUND_NEAREST_AWAY_ZERO>(float32x4_t r) { #if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4_t plus = vdupq_n_f32(0.5); size_t loop = remain >> 4;
float32x4_t minus = vdupq_n_f32(-0.5); remain = remain & 0xF;
float32x4_t zero = vdupq_n_f32(0); float32x4_t __scale = vdupq_n_f32(scale);
uint32x4_t more_than_zero = vcgtq_f32(r, zero); float32x4_t __postive_max = vdupq_n_f32(max_abs);
float32x4_t temp = vbslq_f32(more_than_zero, plus, minus); float32x4_t __negtive_max = vdupq_n_f32(-max_abs);
temp = vaddq_f32(r, temp); #pragma omp parallel for
int32x4_t ret = vcvtq_s32_f32(temp); for (size_t i = 0; i < loop; ++i) {
return ret; const float *local_x = x + (i << 4);
} int8_t *local_y = y + (i << 4);
float32x4_t r0 = vld1q_f32(local_x);
template <> float32x4_t r1 = vld1q_f32(local_x + 4);
inline int32x4_t vround_f32<ROUND_NEAREST_TO_EVEN>(float32x4_t r) { float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t point5 = vdupq_n_f32(0.5); float32x4_t r3 = vld1q_f32(local_x + 12);
int32x4_t one = vdupq_n_s32(1); r0 = vmaxq_f32(vminq_f32(r0, __postive_max), __negtive_max);
int32x4_t zero = vdupq_n_s32(0); r1 = vmaxq_f32(vminq_f32(r1, __postive_max), __negtive_max);
r2 = vmaxq_f32(vminq_f32(r2, __postive_max), __negtive_max);
int32x4_t rnd = vround_f32<ROUND_NEAREST_AWAY_ZERO>(r); r3 = vmaxq_f32(vminq_f32(r3, __postive_max), __negtive_max);
float32x4_t frnd = vcvtq_f32_s32(rnd); r0 = vmulq_f32(r0, __scale);
frnd = vsubq_f32(frnd, r); r1 = vmulq_f32(r1, __scale);
frnd = vabsq_f32(frnd); r2 = vmulq_f32(r2, __scale);
uint32x4_t equal_point5 = vceqq_f32(frnd, point5); r3 = vmulq_f32(r3, __scale);
int32x4_t abs_rnd = vabsq_s32(rnd); int32x4_t q0 = math::vRoundq_f32<R>(r0);
abs_rnd = vandq_s32(abs_rnd, one); int32x4_t q1 = math::vRoundq_f32<R>(r1);
uint32x4_t not_mod2 = vreinterpretq_u32_s32(abs_rnd); int32x4_t q2 = math::vRoundq_f32<R>(r2);
uint32x4_t mask = vandq_u32(equal_point5, not_mod2); int32x4_t q3 = math::vRoundq_f32<R>(r3);
uint32x4_t more_than_zero = vcgtq_s32(rnd, zero); int16x4_t d0 = vmovn_s32(q0);
more_than_zero = vandq_u32(more_than_zero, vreinterpretq_u32_s32(one)); int16x4_t d1 = vmovn_s32(q1);
mask = veorq_u32(more_than_zero, mask); int16x4_t d2 = vmovn_s32(q2);
more_than_zero = veorq_u32(more_than_zero, vreinterpretq_u32_s32(one)); int16x4_t d3 = vmovn_s32(q3);
mask = vaddq_u32(more_than_zero, mask); int16x8_t q5 = vcombine_s16(d0, d1);
int32x4_t smask = vreinterpretq_s32_u32(mask); int16x8_t q6 = vcombine_s16(d2, d3);
smask = vsubq_s32(smask, one); int8x8_t d5 = vmovn_s16(q5);
rnd = vaddq_s32(rnd, smask); int8x8_t d6 = vmovn_s16(q6);
return rnd; vst1_s8(local_y, d5);
} vst1_s8(local_y + 8, d6);
}
x += (loop << 4);
y += (loop << 4);
#endif #endif
for (size_t i = 0; i < remain; ++i) {
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO> float x_temp = std::max(std::min(x[i], max_abs), -max_abs);
inline int8_t Round(const float &x) { y[i] = math::Round<R>(x_temp * scale);
return static_cast<int8_t>(x);
}
template <>
inline int8_t Round<ROUND_NEAREST_AWAY_ZERO>(const float &x) {
return std::round(x);
}
template <>
inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
float v = std::round(x);
int32_t q = static_cast<int32_t>(v);
if (std::abs(std::abs(q - v) - 0.5) <= 0) {
if (std::abs(q) % 2 != 0) {
q = q + ((q > 0) ? -1 : 1);
}
} }
return static_cast<int8_t>(q);
} }
template <RoundType R> template <RoundType R>
static void Quantize(const Tensor *input, const float scale, Tensor *output) { inline void QuantizeOnline(const Tensor *input, const float scale,
Tensor *output) {
const float *x = input->data<const float>(); const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>(); int8_t *y = output->mutable_data<int8_t>();
size_t remain = input->numel(); size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = remain >> 4; size_t loop = remain >> 4;
remain = remain & 0xF; remain = remain & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
#pragma omp parallel for #pragma omp parallel for
for (size_t i = 0; i < loop; ++i) { for (size_t i = 0; i < loop; ++i) {
const float *local_x = x + (i << 4); const float *local_x = x + (i << 4);
...@@ -115,14 +103,14 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) { ...@@ -115,14 +103,14 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
float32x4_t r1 = vld1q_f32(local_x + 4); float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8); float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12); float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = vmulq_n_f32(r0, scale); r0 = vmulq_f32(r0, __scale);
r1 = vmulq_n_f32(r1, scale); r1 = vmulq_f32(r1, __scale);
r2 = vmulq_n_f32(r2, scale); r2 = vmulq_f32(r2, __scale);
r3 = vmulq_n_f32(r3, scale); r3 = vmulq_f32(r3, __scale);
int32x4_t q0 = vround_f32<R>(r0); int32x4_t q0 = math::vRoundq_f32<R>(r0);
int32x4_t q1 = vround_f32<R>(r1); int32x4_t q1 = math::vRoundq_f32<R>(r1);
int32x4_t q2 = vround_f32<R>(r2); int32x4_t q2 = math::vRoundq_f32<R>(r2);
int32x4_t q3 = vround_f32<R>(r3); int32x4_t q3 = math::vRoundq_f32<R>(r3);
int16x4_t d0 = vmovn_s32(q0); int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1); int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2); int16x4_t d2 = vmovn_s32(q2);
...@@ -138,7 +126,18 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) { ...@@ -138,7 +126,18 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
y += (loop << 4); y += (loop << 4);
#endif #endif
for (size_t i = 0; i < remain; ++i) { for (size_t i = 0; i < remain; ++i) {
y[i] = Round<R>(x[i] * scale); y[i] = math::Round<R>(x[i] * scale);
}
}
template <RoundType R>
static void Quantize(const Tensor *input, const float max_abs,
const bool offline, Tensor *output) {
float scale = 127.f / max_abs;
if (offline) {
QuantizeOffline<R>(input, scale, max_abs, output);
} else {
QuantizeOnline<R>(input, scale, output);
} }
} }
...@@ -173,6 +172,13 @@ float find_abs_max(const Tensor *input) { ...@@ -173,6 +172,13 @@ float find_abs_max(const Tensor *input) {
return max_abs; return max_abs;
} }
} // namespace operators
} // namespace paddle_mobile
#endif // __ARM_NEON__
namespace paddle_mobile {
namespace operators {
template <> template <>
bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) { bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
return true; return true;
...@@ -184,24 +190,23 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) { ...@@ -184,24 +190,23 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
Tensor *output = param.output_; Tensor *output = param.output_;
Tensor *output_scale = param.online_scale_; Tensor *output_scale = param.online_scale_;
float max_abs = 0.f; float max_abs = 0.f;
if (param.is_static_) { if (param.offline_) {
max_abs = param.static_scale_; max_abs = param.offline_scale_->data<float>()[0];
} else { } else {
max_abs = find_abs_max(input); max_abs = find_abs_max(input);
} }
max_abs = std::max(max_abs, 1e-6f); max_abs = std::max(max_abs, 1e-6f);
// only support int8 currently
float scale = 127 / max_abs;
param.online_scale_->mutable_data<float>()[0] = max_abs; param.online_scale_->mutable_data<float>()[0] = max_abs;
switch (param.round_type_) { switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN: case ROUND_NEAREST_TO_EVEN:
Quantize<ROUND_NEAREST_TO_EVEN>(input, scale, output); Quantize<ROUND_NEAREST_TO_EVEN>(input, max_abs, param.offline_, output);
break; break;
case ROUND_NEAREST_TOWARDS_ZERO: case ROUND_NEAREST_TOWARDS_ZERO:
Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, scale, output); Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, max_abs, param.offline_,
output);
break; break;
case ROUND_NEAREST_AWAY_ZERO: case ROUND_NEAREST_AWAY_ZERO:
Quantize<ROUND_NEAREST_AWAY_ZERO>(input, scale, output); Quantize<ROUND_NEAREST_AWAY_ZERO>(input, max_abs, param.offline_, output);
break; break;
default: default:
LOG(kLOG_ERROR) << "round type is not supported."; LOG(kLOG_ERROR) << "round type is not supported.";
...@@ -212,4 +217,4 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) { ...@@ -212,4 +217,4 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif // QUANT_OP
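
As a reading aid, the offline and online paths above differ only in whether the input is saturated to the pre-computed scale before rounding. The scalar sketch below restates that; the names are illustrative, and std::round stands in for the configurable round type read from param.offline_scale_ / param.round_type_.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Offline: max_abs comes from a pre-computed scale, so inputs are clipped
// into [-max_abs, max_abs] before being scaled to int8.
inline int8_t QuantizeOfflineRef(float x, float max_abs) {
  float clipped = std::max(std::min(x, max_abs), -max_abs);
  return static_cast<int8_t>(std::round(clipped * (127.f / max_abs)));
}

// Online: max_abs is the measured |x| maximum of the current tensor, so no
// clipping is needed before scaling.
inline int8_t QuantizeOnlineRef(float x, float max_abs) {
  return static_cast<int8_t>(std::round(x * (127.f / max_abs)));
}
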
...@@ -15,11 +15,56 @@ limitations under the License. */ ...@@ -15,11 +15,56 @@ limitations under the License. */
#ifdef RELU_OP #ifdef RELU_OP
#include "operators/kernel/relu_kernel.h" #include "operators/kernel/relu_kernel.h"
#include "operators/kernel/central-arm-func/relu_arm_func.h" #include "common/types.h"
#include "operators/math/activation.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Dtype, ActivationType Act>
struct ReluCompute {
void operator()(const Tensor *input, Tensor *output) {}
};
template <ActivationType Act>
struct ReluCompute<float, Act> {
void operator()(const Tensor *input, Tensor *output) {
const float *x = input->data<float>();
float *y = output->mutable_data<float>();
size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = remain >> 4;
remain = remain & 0xF;
#pragma omp parallel for
for (size_t i = 0; i < loop; ++i) {
const float *local_x = x + (i << 4);
float *local_y = y + (i << 4);
float32x4_t r0 = vld1q_f32(local_x);
float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = math::vActiveq_f32<Act>(r0);
r1 = math::vActiveq_f32<Act>(r1);
r2 = math::vActiveq_f32<Act>(r2);
r3 = math::vActiveq_f32<Act>(r3);
vst1q_f32(local_y, r0);
vst1q_f32(local_y + 4, r1);
vst1q_f32(local_y + 8, r2);
vst1q_f32(local_y + 12, r3);
}
x += (loop << 4);
y += (loop << 4);
#endif
for (size_t i = 0; i < remain; ++i) {
y[i] = math::Active<Act>(x[i]);
}
}
};
template <> template <>
bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) { bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) {
return true; return true;
...@@ -27,7 +72,21 @@ bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) { ...@@ -27,7 +72,21 @@ bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) {
template <> template <>
void ReluKernel<CPU, float>::Compute(const ReluParam<CPU> &param) { void ReluKernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
ReluCompute<float>(param); const Tensor *input = param.InputX();
Tensor *output = param.Out();
ReluCompute<float, RELU>()(input, output);
}
template <>
bool Relu6Kernel<CPU, float>::Init(ReluParam<CPU> *param) {
return true;
}
template <>
void Relu6Kernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
const Tensor *input = param.InputX();
Tensor *output = param.Out();
ReluCompute<float, RELU6>()(input, output);
} }
} // namespace operators } // namespace operators
......
...@@ -11,14 +11,111 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,14 +11,111 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef TRANSPOSE2_OP #ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h" #include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
bool IsShuffleChannel(const std::vector<int> &axis) {
bool is_shuffle_channel = true;
if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) {
for (int i = 3; i < axis.size(); ++i) {
if (axis[i] != i) {
is_shuffle_channel = false;
break;
}
}
} else {
return false;
}
return is_shuffle_channel;
}
template <typename Dtype>
void ShuffleChannelCompute(const Transpose2Param<CPU> &param) {
const std::vector<int> &axis = param.Axis();
const Tensor *input = param.InputX();
const Dtype *input_ptr = input->data<Dtype>();
Tensor *output = param.Out();
Dtype *output_ptr = output->mutable_data<Dtype>();
  // The input and output shapes must have rank >= 2 and <= 6.
const framework::DDim &in_dim = input->dims();
const framework::DDim &out_dim = output->dims();
size_t offset = 1;
for (int i = 3; i < axis.size(); ++i) {
offset *= in_dim[i];
}
#pragma omp parallel for collapse(3)
for (int batch = 0; batch < out_dim[0]; ++batch) {
for (int c1 = 0; c1 < out_dim[1]; ++c1) {
for (int c2 = 0; c2 < out_dim[2]; ++c2) {
size_t out_offset =
((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset;
size_t in_offset = ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset;
memcpy(output_ptr + out_offset, input_ptr + in_offset,
offset * sizeof(Dtype));
}
}
}
}
template <typename Dtype>
void Transpose2Compute(const Transpose2Param<CPU> &param) {
const std::vector<int> &axis = param.Axis();
const Tensor *input = param.InputX();
const Dtype *input_ptr = input->data<Dtype>();
Tensor *output = param.Out();
Dtype *output_ptr = output->mutable_data<Dtype>();
  // The input and output shapes must have rank >= 2 and <= 6.
const framework::DDim &in_dim = input->dims();
const framework::DDim &out_dim = output->dims();
// precompute inverted output dim and strides
size_t rout_dim[6], strides[6];
  int permute = axis.size();  // permute must be >= 2 and <= 6.
for (int i = 0; i < permute; ++i) {
int k = permute - 1 - i;
strides[k] = 1;
for (int j = axis[i] + 1; j < permute; ++j) {
strides[k] *= in_dim[j];
}
rout_dim[k] = out_dim[i];
}
// unroll the first 2 dimensions
int reamin_dim = 1;
for (int i = 2; i < out_dim.size(); ++i) {
reamin_dim *= out_dim[i];
}
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < out_dim[0]; ++batch) {
for (int j = 0; j < out_dim[1]; ++j) {
size_t offset = batch * strides[permute - 1] + j * strides[permute - 2];
Dtype *out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim;
int indics[4] = {0, 0, 0, 0};
for (int k = 0; k < reamin_dim; ++k) {
out_ptr[k] = input_ptr[offset];
indics[0] += 1;
offset += strides[0];
for (int p = 0; p < permute - 3; ++p) {
if (indics[p] == rout_dim[p]) {
indics[p + 1] += 1;
indics[p] = 0;
offset += strides[p + 1];
offset -= rout_dim[p] * strides[p];
} else {
break;
}
}
}
}
}
}
template <> template <>
bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) { bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
return true; return true;
...@@ -26,10 +123,24 @@ bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) { ...@@ -26,10 +123,24 @@ bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
template <> template <>
void Transpose2Kernel<CPU, float>::Compute(const Transpose2Param<CPU> &param) { void Transpose2Kernel<CPU, float>::Compute(const Transpose2Param<CPU> &param) {
Transpose2Compute<float>(param); const std::vector<int> &axis = param.Axis();
bool shuffle_channel = IsShuffleChannel(axis);
if (shuffle_channel) {
if (param.InputX()->type() == typeid(int8_t)) {
ShuffleChannelCompute<int8_t>(param);
} else {
ShuffleChannelCompute<float>(param);
}
} else {
if (param.InputX()->type() == typeid(int8_t)) {
Transpose2Compute<int8_t>(param);
} else {
Transpose2Compute<float>(param);
}
}
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif // TRANSPOSE2_OP
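
The shuffle-channel fast path triggers when axis is (0, 2, 1, 3, ...), i.e. only the two leading channel dimensions are swapped, so every inner block stays contiguous and can be moved with one memcpy. A standalone sketch of that index mapping (the name shuffle_channel_ref is illustrative, not library code):

#include <cstddef>
#include <cstring>

// Copies a tensor of shape (N, C1, C2, inner) into shape (N, C2, C1, inner),
// which is what axis = {0, 2, 1, 3} asks for. Each inner-sized block is
// contiguous in both layouts, hence the memcpy per block.
template <typename T>
void shuffle_channel_ref(const T *in, T *out, int n, int c1, int c2,
                         size_t inner) {
  for (int b = 0; b < n; ++b) {
    for (int i = 0; i < c2; ++i) {    // output channel-1 index
      for (int j = 0; j < c1; ++j) {  // output channel-2 index
        size_t dst = ((static_cast<size_t>(b) * c2 + i) * c1 + j) * inner;
        size_t src = ((static_cast<size_t>(b) * c1 + j) * c2 + i) * inner;
        std::memcpy(out + dst, in + src, inner * sizeof(T));
      }
    }
  }
}
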
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
void ConvAddBasic(const FusionConvAddParam<CPU> &param) { void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
...@@ -106,9 +107,9 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) { ...@@ -106,9 +107,9 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false, math::matmul<float, float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(1), false, biase_data); static_cast<float>(1), false, biase_data);
} }
} }
} }
......
...@@ -25,24 +25,18 @@ limitations under the License. */ ...@@ -25,24 +25,18 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename P, typename S> template <typename Itype, typename Otype>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
Tensor bias = *param.Bias(); Tensor bias = *param.Bias();
int32_t axis = param.Axis(); int32_t axis = param.Axis();
S *bias_data = bias.data<S>(); Otype *bias_data = bias.data<Otype>();
Tensor *output = param.Output(); Tensor *output = param.Output();
output->mutable_data<P>(); output->mutable_data<Otype>();
float alpha = 1.0f; float alpha = 1.0f;
float beta = 1.0f; float beta = 1.0f;
#ifdef FUSION_CONVADDRELU_INT8_OP
alpha = param.InputScale()->data<float>()[0];
beta = 0.0f;
#endif
int32_t groups = param.Groups(); int32_t groups = param.Groups();
std::vector<int32_t> strides = param.Strides(); std::vector<int32_t> strides = param.Strides();
std::vector<int32_t> paddings = param.Paddings(); std::vector<int32_t> paddings = param.Paddings();
...@@ -70,7 +64,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { ...@@ -70,7 +64,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor col; Tensor col;
Tensor col_matrix; Tensor col_matrix;
if (is_expand) { if (is_expand) {
col.mutable_data<P>(col_shape); col.mutable_data<Itype>(col_shape);
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
} }
...@@ -89,8 +83,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { ...@@ -89,8 +83,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups; int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups; int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, P> vol2col; math::Vol2ColFunctor<CPU, Itype> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col; math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;
for (int32_t i = 0; i < batch_size; i++) { for (int32_t i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
...@@ -118,8 +112,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { ...@@ -118,8 +112,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, false, col_matrix, false, alpha, &out_slice, math::matmul<Itype, Otype>(filter_slice, false, col_matrix, false, alpha,
beta, true, bias_data); &out_slice, beta, true, bias_data);
} }
} }
} }
......
...@@ -106,9 +106,10 @@ inline void GemmConv(const ConvParam<CPU> &param) { ...@@ -106,9 +106,10 @@ inline void GemmConv(const ConvParam<CPU> &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, false, col_matrix, false, math::matmul<Itype, Otype>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(0), static_cast<float>(1), &out_slice,
false, static_cast<Otype *>(nullptr)); static_cast<float>(0), false,
static_cast<Otype *>(nullptr));
} }
} }
} }
......
...@@ -93,8 +93,8 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) { ...@@ -93,8 +93,8 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) {
Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, true, in_slice, false, static_cast<P>(1.0), math::matmul<P, P>(filter_slice, true, in_slice, false,
&col_matrix, static_cast<P>(0.0)); static_cast<P>(1.0), &col_matrix, static_cast<P>(0.0));
if (data_dim == 2U) { if (data_dim == 2U) {
col2im(col, dilations, strides, col2im(col, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0], std::vector<int>{paddings[0], paddings[1], paddings[0],
......
...@@ -23,20 +23,15 @@ limitations under the License. */ ...@@ -23,20 +23,15 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename P, typename S> template <typename Itype, typename Otype>
void FusionFcCompute(const FusionFcParam<CPU> &param) { void FusionFcCompute(const FusionFcParam<CPU> &param) {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY(); const Tensor *input_y = param.InputY();
Tensor *input_z = param.InputZ(); Tensor *input_z = param.InputZ();
S *input_z_data = input_z->data<S>(); Otype *input_z_data = input_z->data<Otype>();
int axis = param.Axis(); int axis = param.Axis();
Tensor *out = param.Out(); Tensor *out = param.Out();
// int m = out->dims()[0]; auto *out_data = out->mutable_data<Itype>();
// int n = out->dims()[1];
auto *out_data = out->mutable_data<P>();
float alpha = 1.0f;
float beta = 1.0f;
const Tensor x_matrix = const Tensor x_matrix =
input_x->dims().size() > 2 input_x->dims().size() > 2
...@@ -57,28 +52,14 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) { ...@@ -57,28 +52,14 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. "); PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
  if (std::is_same<P, int8_t>::value) { // bias_data's dimension matches the second dimension of out
#ifdef FUSION_FC_INT8_OP int64_t classes = input_z->numel();
alpha = param.InputScale()->data<float>()[0]; for (int i = 0; i < out_dim[0]; i++) {
beta = 0.0f; memory::Copy(out_data + i * classes, input_z_data, sizeof(Otype) * classes);
math::matmul(x_matrix, false, y_matrix, false, alpha, out, beta, false,
input_z_data, true);
#endif
} else {
    // bias_data's dimension matches the second dimension of out
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data,
sizeof(float) * classes);
}
math::matmul<float>(x_matrix, false, y_matrix, false, alpha, out, beta,
false);
} }
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); math::matmul<Itype, Otype>(x_matrix, false, y_matrix, false,
// if (out_dim.size() != 2) { static_cast<float>(1), out, static_cast<float>(1),
// out->Resize(out_dim); false);
// }
} }
} // namespace operators } // namespace operators
......
...@@ -73,14 +73,14 @@ void MulCompute(const MulParam<CPU> &param) { ...@@ -73,14 +73,14 @@ void MulCompute(const MulParam<CPU> &param) {
} }
if (param.InputX()->type() == typeid(int8_t)) { if (param.InputX()->type() == typeid(int8_t)) {
out->mutable_data<int32_t>(); out->mutable_data<int32_t>();
math::matmul<float, int32_t>(x_matrix, false, y_matrix, false, math::matmul<int8_t, int32_t>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out, static_cast<float>(1), out,
static_cast<float>(0)); static_cast<float>(0));
} else { } else {
out->mutable_data<float>(); out->mutable_data<float>();
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1), math::matmul<float, float>(x_matrix, false, y_matrix, false,
out, static_cast<float>(0)); static_cast<float>(1), out,
static_cast<float>(0));
} }
if (out_dim.size() != 2) { if (out_dim.size() != 2) {
out->Resize(out_dim); out->Resize(out_dim);
......
...@@ -17,103 +17,53 @@ limitations under the License. */ ...@@ -17,103 +17,53 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "common/types.h"
#include "operators/math/pooling.h" #include "operators/math/pooling.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using framework::Tensor;
template <typename T, typename S>
void PoolBasic(std::string pooling_type, std::vector<int> ksize,
std::vector<int> strides, std::vector<int> paddings,
const Tensor *in_x, Tensor *out) {
if (pooling_type == "max") {
math::PoolFunctor<CPU, math::MaxPool<T>, T> pool2d_forward;
math::MaxPool<T> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
} else if (pooling_type == "avg") {
math::PoolFunctor<CPU, math::AvgPool<T, S>, T> pool2d_forward;
math::AvgPool<T, S> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
}
}
template <typename P> template <typename P>
void PoolCompute(const PoolParam<CPU> &param) { void PoolCompute(const PoolParam<CPU> &param) {
const Tensor *in_x = param.Input(); const framework::Tensor *input = param.Input();
Tensor *out = param.Output(); framework::Tensor *output = param.Output();
std::string pooling_type = param.PoolingType(); const std::string &pooling_type = param.PoolingType();
std::vector<int> ksize = param.Ksize(); std::vector<int> ksize = param.Ksize();
std::vector<int> strides = param.Strides(); std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings(); std::vector<int> paddings = param.Paddings();
if (ksize.size() != 2) {
LOG(paddle_mobile::LogLevel::kLOG_ERROR)
<< "Pool op only supports 2D and 3D input.";
}
if (param.isGlobalPooling()) { if (param.isGlobalPooling()) {
for (size_t i = 0; i < ksize.size(); ++i) { for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0; paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]); ksize[i] = static_cast<int>(input->dims()[i + 2]);
} }
} }
if (in_x->type() == typeid(int8_t)) { if (ksize[0] == 3 && ksize[0] == ksize[1]) {
if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) { if (pooling_type == "max" && strides[0] == strides[1]) {
if (strides[0] == strides[1] && strides[0] == 1) { if (strides[0] == 1) {
math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]); math::Pooling3x3<MAX, 1>()(*input, paddings, output);
} else if (strides[0] == strides[1] && strides[0] == 2) { } else if (strides[0] == 2) {
math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]); math::Pooling3x3<MAX, 2>()(*input, paddings, output);
} else { } else {
math::Pool3x3Max_int8(strides, paddings, in_x, out); math::Pooling<MAX>()(*input, ksize, strides, paddings, output);
}
} else if (pooling_type == "avg" && strides[0] == strides[1]) {
if (strides[0] == 1) {
math::Pooling3x3<AVG, 1>()(*input, paddings, output);
} else if (strides[0] == 2) {
math::Pooling3x3<AVG, 2>()(*input, paddings, output);
} else {
math::Pooling<AVG>()(*input, ksize, strides, paddings, output);
} }
} else { } else {
PoolBasic<int8_t, int32_t>(pooling_type, ksize, strides, paddings, in_x, // Others
out);
} }
} else { } else {
if (ksize[0] == 3 && ksize[0] == ksize[1]) { if (pooling_type == "max") {
if (pooling_type == "max") { math::Pooling<MAX>()(*input, ksize, strides, paddings, output);
if (strides[0] == strides[1] && strides[0] == 1 && } else if (pooling_type == "avg") {
paddings[0] == paddings[1] && paddings[1] == 1) { math::Pooling<AVG>()(*input, ksize, strides, paddings, output);
math::Pool3x3Maxs1p1(in_x, out);
} else {
math::Pool3x3Max(strides, paddings, in_x, out);
}
} else if (pooling_type == "avg") {
if (strides[0] == strides[1] && strides[0] == 1 &&
paddings[0] == paddings[1] && paddings[1] == 1) {
math::Pool3x3Avgs1p1(in_x, out);
} else {
math::Pool3x3Avg(strides, paddings, in_x, out);
}
}
} else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
strides[0] == strides[1] && paddings[0] == paddings[1] &&
paddings[1] == 0) {
#if __ARM_NEON
#if __aarch64__
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
out);
#else
/// todo: fix bug in Pool2x2
if (pooling_type == "max") {
math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
}
#endif
#else
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
out);
#endif // __ARM_NEON
} else { } else {
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x, // Others
out);
} }
} }
} }
......
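
The refactored dispatch above routes 3x3 pooling with stride 1 or 2 to the specialized math::Pooling3x3 kernels and everything else to the generic math::Pooling functor. For orientation, a plain scalar 3x3 max pool over one feature map — roughly what the specialized kernels compute, without the NEON vectorization — can be sketched as follows (illustrative code, not the library implementation):

#include <algorithm>
#include <limits>
#include <vector>

// Naive 3x3 max pooling over a single (H, W) feature map with the given
// stride and symmetric padding; out-of-range taps are skipped. The output
// size follows the usual floor((H + 2*pad - 3) / stride) + 1 rule.
std::vector<float> max_pool3x3_ref(const std::vector<float> &in, int h, int w,
                                   int stride, int pad) {
  int oh = (h + 2 * pad - 3) / stride + 1;
  int ow = (w + 2 * pad - 3) / stride + 1;
  std::vector<float> out(static_cast<size_t>(oh) * ow);
  for (int i = 0; i < oh; ++i) {
    for (int j = 0; j < ow; ++j) {
      float m = -std::numeric_limits<float>::infinity();
      for (int ki = 0; ki < 3; ++ki) {
        for (int kj = 0; kj < 3; ++kj) {
          int y = i * stride - pad + ki;
          int x = j * stride - pad + kj;
          if (y >= 0 && y < h && x >= 0 && x < w) {
            m = std::max(m, in[static_cast<size_t>(y) * w + x]);
          }
        }
      }
      out[static_cast<size_t>(i) * ow + j] = m;
    }
  }
  return out;
}
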
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include <operators/math/transform.h>
#include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
template <typename T>
struct ReluFunctor {
inline T operator()(T in) const { return in > 0 ? in : 0; }
};
/*
 * @b Platform-specific implementation; param is passed in from the op layer
* */
template <typename P>
void ReluCompute(const ReluParam<CPU> &param) {
const auto *input_x = param.InputX();
auto *input_x_ptr = input_x->data<float>();
auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#if __aarch64__
if (numel > 0) {
int loop = numel >> 0x4;
int remain = numel & 0xF;
float32x4_t zero = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t r0 = vld1q_f32(input_x_ptr);
float32x4_t r1 = vld1q_f32(input_x_ptr + 4);
float32x4_t r2 = vld1q_f32(input_x_ptr + 8);
float32x4_t r3 = vld1q_f32(input_x_ptr + 12);
r0 = vmaxq_f32(r0, zero);
r1 = vmaxq_f32(r1, zero);
r2 = vmaxq_f32(r2, zero);
r3 = vmaxq_f32(r3, zero);
vst1q_f32(out_ptr, r0);
vst1q_f32(out_ptr + 4, r1);
vst1q_f32(out_ptr + 8, r2);
vst1q_f32(out_ptr + 12, r3);
input_x_ptr += 16;
out_ptr += 16;
}
for (int i = 0; i < remain; ++i) {
out_ptr[i] = (input_x_ptr[i] > 0) * input_x_ptr[i];
}
#else
if (numel > 64) {
asm volatile(
"pld [%[input_x_ptr], #0] \n\t"
"vmov.f32 q8, #0.0 \n\t"
"subs %[num], %[num], #32 \n\t"
"blt end_num_%= \n\t"
"loop_num_%=: \n\t"
"pld [%[input_x_ptr], #1024] \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"subs %[num], %[num], #32 \n\t"
"bge loop_num_%= \n\t"
"end_num_%=: \n\t"
"cmp %[num], #0 \n\t"
"bge end_%= \n\t"
"mov r6, #4 \n\t"
"mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t"
:
:
[out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
"r6");
#endif
} else {
#endif
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
}
#endif
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#pragma once
#include <vector>
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void Transpose2Compute(const Transpose2Param<CPU>& param) {
const auto* input_x = param.InputX();
const auto input_x_dims = input_x->dims();
auto* out = param.Out();
const auto axis = param.Axis();
const auto* input_x_data = input_x->data<float>();
auto* out_data = out->mutable_data<float>();
size_t ndim = axis.size();
std::vector<int> xdim(ndim);
std::vector<int> xstride(ndim);
std::vector<int> xout(ndim);
for (int i = 0; i < ndim; i++) {
int j = ndim - 1 - i;
xdim[j] = input_x_dims[axis[i]];
xstride[j] = 1;
for (int k = axis[i] + 1; k < ndim; k++) {
xstride[j] *= input_x_dims[k];
}
xout[j] = xstride[j] * xdim[j];
}
auto numel = input_x->numel();
size_t pind = 0;
std::vector<int> ind(ndim);
for (int i = 0; i < numel; i++) {
out_data[i] = input_x_data[pind];
ind[0]++;
pind += xstride[0];
for (int j = 0; j < ndim - 1; j++) {
if (ind[j] == xdim[j]) {
ind[j + 1]++;
ind[j] = 0;
pind += xstride[j + 1];
pind -= xout[j];
} else {
break;
}
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -21,23 +21,6 @@ limitations under the License. */ ...@@ -21,23 +21,6 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
// vector<int> pos;
// template <typename T>
// void TransposeFunc(const int numel, const T* input, const vector<int> axis,
// const vector<int> old_strides, const vector<int>
// new_strides, T* output) {
// for (int i = 0; i < numel; ++i) {
// int old_idx = 0;
// int idx = i;
// for (int j = 0; j < axis.size(); ++j) {
// int order = axis[j];
// old_idx += (idx / new_strides[j]) * old_strides[order];
// idx %= new_strides[j];
// }
// output[i] = input[old_idx];
// }
// }
template <typename P> template <typename P>
void TransposeCompute(const TransposeParam<CPU>& param) { void TransposeCompute(const TransposeParam<CPU>& param) {
const auto* input_x = param.InputX(); const auto* input_x = param.InputX();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionDequantAddBNKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNParam<DeviceType> &param);
bool Init(FusionDequantAddBNParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -20,26 +20,37 @@ limitations under the License. */ ...@@ -20,26 +20,37 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
#define DECLARE_KERNEL(KernelClass, KernelParam) \
template <typename DeviceType, typename T> \
class KernelClass \
: public framework::OpKernelBase<DeviceType, KernelParam<DeviceType>> { \
public: \
bool Init(KernelParam<DeviceType> *param); \
void Compute(const KernelParam<DeviceType> &param); \
};
#ifdef FUSION_DEQUANT_BN_OP
DECLARE_KERNEL(FusionDequantBNKernel, FusionDequantBNParam);
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP #ifdef FUSION_DEQUANT_BN_RELU_OP
template <typename DeviceType, typename T> DECLARE_KERNEL(FusionDequantBNReluKernel, FusionDequantBNParam);
class FusionDequantBNReluKernel #endif
: public framework::OpKernelBase<DeviceType,
FusionDequantBNReluParam<DeviceType>> { #ifdef FUSION_DEQUANT_ADD_BN_OP
public: DECLARE_KERNEL(FusionDequantAddBNKernel, FusionDequantAddBNParam);
void Compute(const FusionDequantBNReluParam<DeviceType> &param);
bool Init(FusionDequantBNReluParam<DeviceType> *param);
};
#endif #endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP #ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <typename DeviceType, typename T> DECLARE_KERNEL(FusionDequantAddBNReluKernel, FusionDequantAddBNParam);
class FusionDequantAddBNReluKernel #endif
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNReluParam<DeviceType>> { #ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
public: DECLARE_KERNEL(FusionDequantAddBNQuantKernel, FusionDequantAddBNQuantParam);
void Compute(const FusionDequantAddBNReluParam<DeviceType> &param); #endif
bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
}; #ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
DECLARE_KERNEL(FusionDequantAddBNReluQuantKernel, FusionDequantAddBNQuantParam);
#endif #endif
} // namespace operators } // namespace operators
......
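
The DECLARE_KERNEL macro above is purely mechanical; for instance, DECLARE_KERNEL(FusionDequantAddBNKernel, FusionDequantAddBNParam) expands to the same class shape that the deleted headers spelled out by hand:

template <typename DeviceType, typename T>
class FusionDequantAddBNKernel
    : public framework::OpKernelBase<DeviceType,
                                     FusionDequantAddBNParam<DeviceType>> {
 public:
  bool Init(FusionDequantAddBNParam<DeviceType> *param);
  void Compute(const FusionDequantAddBNParam<DeviceType> &param);
};
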
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using namespace framework;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class FeedKernel class FeedKernel
: public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> { : public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> {
......
...@@ -17,7 +17,6 @@ limitations under the License. */ ...@@ -17,7 +17,6 @@ limitations under the License. */
#pragma once #pragma once
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -30,6 +29,15 @@ class ReluKernel ...@@ -30,6 +29,15 @@ class ReluKernel
void Compute(const ReluParam<DeviceType>& param); void Compute(const ReluParam<DeviceType>& param);
bool Init(ReluParam<DeviceType>* param); bool Init(ReluParam<DeviceType>* param);
}; };
template <typename DeviceType, typename T>
class Relu6Kernel
: public framework::OpKernelBase<DeviceType, ReluParam<DeviceType>> {
public:
void Compute(const ReluParam<DeviceType>& param);
bool Init(ReluParam<DeviceType>* param);
};
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -13,9 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <math.h>
#include <algorithm>
#include <cmath>
#include <string> #include <string>
#include "common/enforce.h" #include "common/enforce.h"
#include "common/types.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include "operators/math/math_func_neon.h"
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -24,68 +32,92 @@ namespace math { ...@@ -24,68 +32,92 @@ namespace math {
#define SIGMOID_THRESHOLD_MAX 13.0 #define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0 #define EXP_MAX_INPUT 40.0
enum ActivationType {
kSigmoid,
kReLU,
kTanh,
kIdentity,
};
inline ActivationType GetActivationType(const std::string &type) { inline ActivationType GetActivationType(const std::string &type) {
if (type == "sigmoid") { if (type == "sigmoid") {
return ActivationType::kSigmoid; return ActivationType::SIGMOID;
} else if (type == "relu") { } else if (type == "relu") {
return ActivationType::kReLU; return ActivationType::RELU;
} else if (type == "tanh") { } else if (type == "tanh") {
return ActivationType::kTanh; return ActivationType::TANH;
} else if (type == "identity" || type == "") { } else if (type == "identity" || type == "") {
return ActivationType::kIdentity; return ActivationType::IDENTITY;
} }
PADDLE_MOBILE_THROW_EXCEPTION("Not support activation type."); PADDLE_MOBILE_THROW_EXCEPTION("Not support activation type.");
} }
namespace forward { #if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <ActivationType Act = IDENTITY>
inline float32x4_t vActiveq_f32(const float32x4_t &x) {
return x;
}
template <typename T> template <>
T Identity(const T a) { inline float32x4_t vActiveq_f32<RELU>(const float32x4_t &x) {
return a; float32x4_t __zero = vdupq_n_f32(0.f);
return vmaxq_f32(x, __zero);
} }
template <typename T> template <>
T Relu(const T a) { inline float32x4_t vActiveq_f32<RELU6>(const float32x4_t &x) {
return a > static_cast<T>(0.0) ? a : static_cast<T>(0.0); float32x4_t __zero = vdupq_n_f32(0.f);
float32x4_t __six = vdupq_n_f32(6.f);
return vminq_f32(vmaxq_f32(x, __zero), __six);
} }
template <typename T> template <>
T Sigmoid(const T a) { inline float32x4_t vActiveq_f32<SIGMOID>(const float32x4_t &x) {
const T min = SIGMOID_THRESHOLD_MIN; float32x4_t __one = vdupq_n_f32(1.f);
const T max = SIGMOID_THRESHOLD_MAX; float32x4_t __x = vnegq_f32(x);
T tmp = (a < min) ? min : ((a > max) ? max : a); __x = exp_ps(__x);
return static_cast<T>(1.0) / (static_cast<T>(1.0) + exp(-tmp)); __x = vaddq_f32(__x, __one);
float32x4_t __out = vrecpeq_f32(__x);
return vmulq_f32(vrecpsq_f32(__x, __out), __out);
} }
template <typename T> template <>
T Tanh(const T a) { inline float32x4_t vActiveq_f32<TANH>(const float32x4_t &x) {
T tmp = -2.0 * a; float32x4_t __one = vdupq_n_f32(1.f);
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; float32x4_t __x = vnegq_f32(x);
return (2.0 / (1.0 + exp(tmp))) - 1.0; __x = vmulq_n_f32(__x, 2.f);
__x = exp_ps(__x);
__x = vaddq_f32(__x, __one);
float32x4_t __out = vrecpeq_f32(__x);
__out = vmulq_f32(vrecpsq_f32(__x, __out), __out);
__out = vmulq_n_f32(__out, 2.f);
return vsubq_f32(__out, __one);
} }
#endif
} // namespace forward template <ActivationType Act = IDENTITY>
inline float Active(const float &x) {
return x;
}
template <typename T> template <>
struct Active { inline float Active<RELU>(const float &x) {
typedef T (*Act)(T); return std::max(x, 0.f);
}; }
static Active<float>::Act kActFloat[] = { template <>
&forward::Sigmoid<float>, &forward::Relu<float>, &forward::Tanh<float>, inline float Active<RELU6>(const float &x) {
&forward::Identity<float>}; return std::min(std::max(x, 0.f), 6.f);
}
namespace forward { template <>
inline float activation(float a, int index) { return kActFloat[index](a); } inline float Active<SIGMOID>(const float &x) {
// float tmp = x > SIGMOID_THRESHOLD_MAX ? SIGMOID_THRESHOLD_MAX : x;
// tmp = x > SIGMOID_THRESHOLD_MIN ? x : SIGMOID_THRESHOLD_MIN;
// return 1.f / (1.f + exp(-tmp));
return 1.f / (1.f + exp(-x));
}
} // namespace forward template <>
inline float Active<TANH>(const float &x) {
// float tmp = -2.f * x;
// tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
// return (2.f / (1.f + exp(tmp))) - 1.f;
return 2.f / (1.f + exp(-2.f * x)) - 1.f;
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
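
The NEON specializations above approximate the reciprocal in 1 / (1 + e^-x) with vrecpeq_f32 plus one vrecpsq_f32 Newton-Raphson step, and the TANH path reuses the identity tanh(x) = 2 * sigmoid(2x) - 1, so both should track the scalar Active<> overloads closely. A small host-side sanity check of that identity (illustrative; it exercises only the scalar math, not the NEON path):

#include <cassert>
#include <cmath>

// Scalar references mirroring the Active<SIGMOID> and Active<TANH> overloads.
inline float SigmoidRef(float x) { return 1.f / (1.f + std::exp(-x)); }
inline float TanhRef(float x) { return 2.f / (1.f + std::exp(-2.f * x)) - 1.f; }

int main() {
  for (float x = -8.f; x <= 8.f; x += 0.25f) {
    // tanh(x) == 2 * sigmoid(2x) - 1, the identity the TANH path relies on.
    assert(std::fabs(TanhRef(x) - (2.f * SigmoidRef(2.f * x) - 1.f)) < 1e-5f);
  }
  return 0;
}
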
This diff has been collapsed.
...@@ -105,16 +105,15 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -105,16 +105,15 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *c, float *C, int ldc, float *p, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
/*
// Vector-matrix multiplication (M = 1) // Vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu); bool relu);
/*
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float int lda, const float *B, int ldb, float beta, float
*C, int ldc, bool relu, float *new_scale, float *new_bias); *C, int ldc, bool relu, float *new_scale, float *new_bias);
*/ */
// Compute a smaller C matrix block // Compute a smaller C matrix block
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
...@@ -149,7 +148,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -149,7 +148,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1); float *new_scale, float *new_bias, float *bias1);
/*
// Write back the vector-matrix multiplication result // Write back the vector-matrix multiplication result
// C = A * B // C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc); void VecWriteBasic(int n, float *c, float *C, int ldc);
...@@ -159,13 +157,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -159,13 +157,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void VecWriteWithAdd(int n, float *c, float *C, int ldc); void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc); void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C) /*
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, // C = A * B, batchnorm(C)
float *new_bias); void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
// C = A * B, batchnorm(C), relu(C) float *new_bias);
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, // C = A * B, batchnorm(C), relu(C)
float *new_bias); void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float
*/ *new_scale, float *new_bias);
*/
// 32-bit float matrix multiplication // 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
...@@ -392,7 +391,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -392,7 +391,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
packedB_int8 = static_cast<int8_t *>( packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC)); paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__ #if __aarch64__
// TODO() // TODO(paddle mobile)
#else #else
PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8); PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8);
#endif #endif
...@@ -414,7 +413,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -414,7 +413,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
packedA_int8 = static_cast<int8_t *>( packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC)); paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__ #if __aarch64__
// TODO() // TODO(paddle mobile)
#else #else
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8); PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8);
#endif #endif
...@@ -438,7 +437,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -438,7 +437,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
int8_t *local_A = packedA_int8 + MC * KC * local_threads; int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads; int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__ #if __aarch64__
// TODO() // TODO(paddle mobile)
#else #else
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A); PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif #endif
...@@ -468,7 +467,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -468,7 +467,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
int8_t *local_B = packedB_int8 + KC * NC * local_threads; int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads; int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__ #if __aarch64__
// TODO() // TODO(paddle mobile)
#else #else
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B); PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B);
#endif #endif
......
...@@ -11,13 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,13 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef GRU_OP #ifdef GRU_OP
#include "operators/math/gru_compute.h" #include "operators/math/gru_compute.h"
#include "common/types.h" #include "common/types.h"
#include "operators/math/activation_functions.h" #include "operators/math/activation.h"
#include "operators/math/gemm.h" #include "operators/math/gemm.h"
#include "operators/math/gru_cpu_kernel.h" #include "operators/math/gru_cpu_kernel.h"
#include "operators/math/gru_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -43,8 +44,7 @@ struct GRUUnitFunctor<CPU, T> { ...@@ -43,8 +44,7 @@ struct GRUUnitFunctor<CPU, T> {
#endif #endif
} }
forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size, forward_reset_output(value, frame_size, batch_size, active_gate);
batch_size, active_gate);
if (value.prev_out_value) { if (value.prev_out_value) {
#ifdef _OPENMP #ifdef _OPENMP
...@@ -60,8 +60,7 @@ struct GRUUnitFunctor<CPU, T> { ...@@ -60,8 +60,7 @@ struct GRUUnitFunctor<CPU, T> {
#endif #endif
} }
forward_final_output(forward::gru_finalOutput<T>(), value, frame_size, forward_final_output(value, frame_size, batch_size, active_node);
batch_size, active_node);
} }
}; };
......
...@@ -11,7 +11,7 @@ limitations under the License. */ ...@@ -11,7 +11,7 @@ limitations under the License. */
#ifdef GRU_OP #ifdef GRU_OP
#pragma once #pragma once
#include "operators/math/activation_functions.h" #include "operators/math/activation.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -11,21 +11,22 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,21 +11,22 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef GRU_OP #ifdef GRU_OP
#pragma once #pragma once
#include <type_traits> #include <type_traits>
#include "operators/math/activation_functions.h" #include "operators/math/activation.h"
#include "operators/math/gru_compute.h" #include "operators/math/gru_compute.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
template <class OpResetOutput, typename T> template <typename T, ActivationType Act>
void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, void hl_naive_gru_forward_reset_output(T *gate_value, T *reset_output_value,
T *gate_value, T *reset_output_value, T *prev_output_value, int frame_size) {
T *prev_output_value, int frame_size,
ActivationType active_gate) {
T r_value_update_gate; T r_value_update_gate;
T r_value_reset_gate; T r_value_reset_gate;
T r_value_reset_output; T r_value_reset_output;
...@@ -33,27 +34,57 @@ void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, ...@@ -33,27 +34,57 @@ void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
T *update_gate = gate_value; T *update_gate = gate_value;
T *reset_gate = gate_value + frame_size; T *reset_gate = gate_value + frame_size;
for (int i = 0; i < frame_size; i++) { int remain = frame_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain >> 3;
remain = remain & 0x7;
float32x4_t prev0 = vdupq_n_f32(0.f);
float32x4_t prev1 = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t update0 = vld1q_f32(update_gate);
float32x4_t update1 = vld1q_f32(update_gate + 4);
float32x4_t reset0 = vld1q_f32(reset_gate);
float32x4_t reset1 = vld1q_f32(reset_gate + 4);
if (prev_output_value) {
prev0 = vld1q_f32(prev_output_value);
prev1 = vld1q_f32(prev_output_value + 4);
prev_output_value += 8;
}
update0 = vActiveq_f32<Act>(update0);
update1 = vActiveq_f32<Act>(update1);
reset0 = vActiveq_f32<Act>(reset0);
reset1 = vActiveq_f32<Act>(reset1);
float32x4_t output0 = vmulq_f32(prev0, reset0);
float32x4_t output1 = vmulq_f32(prev1, reset1);
vst1q_f32(update_gate, update0);
vst1q_f32(update_gate + 4, update1);
vst1q_f32(reset_gate, reset0);
vst1q_f32(reset_gate + 4, reset1);
vst1q_f32(reset_output_value, output0);
vst1q_f32(reset_output_value + 4, output1);
update_gate += 8;
reset_gate += 8;
reset_output_value += 8;
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; i++) {
r_value_update_gate = update_gate[i]; r_value_update_gate = update_gate[i];
r_value_reset_gate = reset_gate[i]; r_value_reset_gate = reset_gate[i];
if (prev_output_value) { if (prev_output_value) {
r_prev_out = prev_output_value[i]; r_prev_out = prev_output_value[i];
} }
r_value_update_gate = Active<Act>(r_value_update_gate);
op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, r_value_reset_gate = Active<Act>(r_value_reset_gate);
&r_value_reset_output, active_gate); r_value_reset_output = r_prev_out * r_value_reset_gate;
update_gate[i] = r_value_update_gate; update_gate[i] = r_value_update_gate;
reset_gate[i] = r_value_reset_gate; reset_gate[i] = r_value_reset_gate;
reset_output_value[i] = r_value_reset_output; reset_output_value[i] = r_value_reset_output;
} }
} }
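Both the NEON block and the scalar tail above perform the same per-element reset-output step; with the loop's variable names:

// u            = Act(u);        // activate the update gate in place
// r            = Act(r);        // activate the reset gate in place
// reset_output = h_prev * r;    // gated previous hidden state (0 when prev_output_value is null)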
template <class OpFinalOutput, typename T> template <typename T, ActivationType Act>
void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, void hl_naive_gru_forward_final_output(T *gate_value, T *prev_output_value,
T *gate_value, T *prev_output_value, T *output_value, int frame_size) {
T *output_value, int frame_size,
ActivationType active_node) {
T r_value_update_gate; T r_value_update_gate;
T r_value_frame_state; T r_value_frame_state;
T r_prev_out = 0; T r_prev_out = 0;
...@@ -61,30 +92,73 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, ...@@ -61,30 +92,73 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
T *update_gate = gate_value; T *update_gate = gate_value;
T *frame_state = gate_value + frame_size * 2; T *frame_state = gate_value + frame_size * 2;
for (int i = 0; i < frame_size; i++) { int remain = frame_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain >> 3;
remain = remain & 0x7;
float32x4_t prev0 = vdupq_n_f32(0.f);
float32x4_t prev1 = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t update0 = vld1q_f32(update_gate);
float32x4_t update1 = vld1q_f32(update_gate + 4);
float32x4_t state0 = vld1q_f32(frame_state);
float32x4_t state1 = vld1q_f32(frame_state + 4);
if (prev_output_value) {
prev0 = vld1q_f32(prev_output_value);
prev1 = vld1q_f32(prev_output_value + 4);
prev_output_value += 8;
}
state0 = vActiveq_f32<Act>(state0);
state1 = vActiveq_f32<Act>(state1);
float32x4_t output0 = vmlsq_f32(prev0, update0, prev0);
float32x4_t output1 = vmlsq_f32(prev1, update1, prev1);
output0 = vmlaq_f32(output0, update0, state0);
output1 = vmlaq_f32(output1, update1, state1);
vst1q_f32(frame_state, state0);
vst1q_f32(frame_state + 4, state1);
vst1q_f32(output_value, output0);
vst1q_f32(output_value + 4, output1);
update_gate += 8;
frame_state += 8;
output_value += 8;
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; i++) {
r_value_update_gate = update_gate[i]; r_value_update_gate = update_gate[i];
r_value_frame_state = frame_state[i]; r_value_frame_state = frame_state[i];
if (prev_output_value) { if (prev_output_value) {
r_prev_out = prev_output_value[i]; r_prev_out = prev_output_value[i];
} }
r_value_frame_state = Active<Act>(r_value_frame_state);
op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, r_output = r_prev_out - r_value_update_gate * r_prev_out +
&r_output, active_node); r_value_update_gate * r_value_frame_state;
frame_state[i] = r_value_frame_state; frame_state[i] = r_value_frame_state;
output_value[i] = r_output; output_value[i] = r_output;
} }
} }
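The vmlsq_f32 / vmlaq_f32 pair computes exactly what the scalar tail does. Per element:

// c = Act(c);                        // activate the candidate (frame state) in place
// h = h_prev - u * h_prev + u * c;   // equivalently (1 - u) * h_prev + u * c
// vmlsq_f32(prev, update, prev)  ->  prev - update * prev
// vmlaq_f32(acc,  update, state) ->  acc  + update * state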
template <class OpResetOutput, typename T> #define FORWARD_RESET_OUTPUT(active_type, value, frame_size) \
inline void forward_reset_output(OpResetOutput op_reset_output, hl_naive_gru_forward_reset_output<float, active_type>( \
GRUMetaValue<T> value, int frame_size, value.gate_value, value.reset_output_value, value.prev_out_value, \
int batch_size, ActivationType active_gate) { frame_size);
for (int b = 0; b < batch_size; b++) {
hl_naive_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate);
template <typename T>
inline void forward_reset_output(GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; ++b) {
switch (active_node) {
case RELU:
FORWARD_RESET_OUTPUT(RELU, value, frame_size);
break;
case SIGMOID:
FORWARD_RESET_OUTPUT(SIGMOID, value, frame_size);
break;
case TANH:
FORWARD_RESET_OUTPUT(TANH, value, frame_size);
break;
default:
FORWARD_RESET_OUTPUT(IDENTITY, value, frame_size);
}
value.gate_value += frame_size * 3; value.gate_value += frame_size * 3;
value.reset_output_value += frame_size; value.reset_output_value += frame_size;
if (value.prev_out_value) { if (value.prev_out_value) {
...@@ -93,15 +167,27 @@ inline void forward_reset_output(OpResetOutput op_reset_output, ...@@ -93,15 +167,27 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
} }
} }
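The switch exists to turn the runtime ActivationType into a compile-time template argument, so Active<Act> and vActiveq_f32<Act> can be inlined inside the NEON loop instead of branching per element. For example, FORWARD_RESET_OUTPUT(SIGMOID, value, frame_size) expands to:

hl_naive_gru_forward_reset_output<float, SIGMOID>(
    value.gate_value, value.reset_output_value, value.prev_out_value, frame_size);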
template <class OpFinalOutput, typename T> #define FORWARD_FINAL_OUTPUT(active_type, value, frame_size) \
inline void forward_final_output(OpFinalOutput op_final_output, hl_naive_gru_forward_final_output<float, active_type>( \
GRUMetaValue<T> value, int frame_size, value.gate_value, value.prev_out_value, value.output_value, frame_size)
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; b++) {
hl_naive_gru_forward_final_output(op_final_output, value.gate_value,
value.prev_out_value, value.output_value,
frame_size, active_node);
template <typename T>
inline void forward_final_output(GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; ++b) {
switch (active_node) {
case RELU:
FORWARD_FINAL_OUTPUT(RELU, value, frame_size);
break;
case SIGMOID:
FORWARD_FINAL_OUTPUT(SIGMOID, value, frame_size);
break;
case TANH:
FORWARD_FINAL_OUTPUT(TANH, value, frame_size);
break;
default:
FORWARD_FINAL_OUTPUT(IDENTITY, value, frame_size);
}
value.gate_value += frame_size * 3; value.gate_value += frame_size * 3;
value.output_value += frame_size; value.output_value += frame_size;
if (value.prev_out_value) { if (value.prev_out_value) {
...@@ -113,4 +199,5 @@ inline void forward_final_output(OpFinalOutput op_final_output, ...@@ -113,4 +199,5 @@ inline void forward_final_output(OpFinalOutput op_final_output,
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRU_OP
#pragma once
#include <type_traits>
#include "operators/math/activation_functions.h"
namespace paddle_mobile {
namespace operators {
namespace math {
namespace forward {
template <typename T>
class gru_resetOutput {
public:
void operator()(T *value_update_gate, T *value_reset_gate, T *prev_out,
T *value_reset_output, ActivationType act_gate) {
*value_update_gate = activation(*value_update_gate, act_gate);
*value_reset_gate = activation(*value_reset_gate, act_gate);
*value_reset_output = (*prev_out) * (*value_reset_gate);
}
};
template <typename T>
class gru_finalOutput {
public:
void operator()(T *value_update_gate, T *value_frame_state, T *prev_out,
T *value_output, ActivationType act_input) {
*value_frame_state = activation(*value_frame_state, act_input);
*value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
((*value_update_gate) * (*value_frame_state));
}
};
} // namespace forward
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
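These per-element functors are the pre-refactor counterparts of the templated kernels above; the arithmetic is identical. A small worked example, under the assumption that activation(x, IDENTITY) simply returns x:

forward::gru_finalOutput<float> final_op;
float u = 0.5f, c = 0.2f, h_prev = 1.0f, h = 0.0f;
final_op(&u, &c, &h_prev, &h, IDENTITY);
// h = h_prev - u * h_prev + u * c = 1.0 - 0.5 + 0.1 = 0.6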
...@@ -41,10 +41,10 @@ void set_constant(framework::Tensor *tensor, float value) { ...@@ -41,10 +41,10 @@ void set_constant(framework::Tensor *tensor, float value) {
} }
template <> template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float beta, bool relu, float alpha, framework::Tensor *matrix_out,
float *bias) { float beta, bool relu, float *bias) {
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
......
...@@ -24,24 +24,24 @@ namespace math { ...@@ -24,24 +24,24 @@ namespace math {
void set_constant(framework::Tensor *tensor, float value); void set_constant(framework::Tensor *tensor, float value);
template <typename T> template <typename Itype, typename Otype>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, T beta, bool relu = false, framework::Tensor *matrix_out, float beta, bool relu = false,
float *bias = nullptr); Otype *bias = nullptr);
template <typename T, typename S> template <typename Itype, typename Otype>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, T beta, bool relu = false, framework::Tensor *matrix_out, float beta, bool relu, Otype *bias,
S *bias = nullptr, bool addOnRow = false); bool addOnRow);
template <typename T> template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, T beta, bool relu, framework::Tensor *matrix_out, float beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias, framework::Tensor *new_scale, framework::Tensor *new_bias,
int group, float *bias = nullptr); int group, T *bias = nullptr);
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
......
...@@ -22,10 +22,11 @@ namespace operators { ...@@ -22,10 +22,11 @@ namespace operators {
namespace math { namespace math {
template <> template <>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float beta, bool relu, int32_t *bias, float alpha, framework::Tensor *matrix_out,
bool addOnRow) { float beta, bool relu, int32_t *bias,
bool addOnRow) {
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -93,6 +94,16 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, ...@@ -93,6 +94,16 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
#endif #endif
} }
} }
template <>
void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out,
float beta, bool relu, int32_t *bias) {
matmul<int8_t, int32_t>(matrix_a, trans_a, matrix_b, trans_b, alpha,
matrix_out, beta, relu, bias, false);
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
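The new 9-argument specialization above simply forwards to the full version with addOnRow = false, so existing int8 call sites keep compiling. A hypothetical call site (shapes and tensor names invented for illustration):

framework::Tensor a, b, c;
a.Resize(framework::make_ddim(std::vector<int>{8, 16}));
a.mutable_data<int8_t>();
b.Resize(framework::make_ddim(std::vector<int>{16, 4}));
b.mutable_data<int8_t>();
c.Resize(framework::make_ddim(std::vector<int>{8, 4}));
c.mutable_data<int32_t>();
// int8 inputs, int32 output, alpha = 1, beta = 0, no relu, no bias:
math::matmul<int8_t, int32_t>(a, false, b, false, 1.f, &c, 0.f, false, nullptr);
// equivalent to calling the full overload with addOnRow = false:
// math::matmul<int8_t, int32_t>(a, false, b, false, 1.f, &c, 0.f, false, nullptr, false);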
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "operators/math/pool_2x2.h"
#include <algorithm>
#include <vector>
namespace paddle_mobile {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int w1 = input_width / 16;
int _w1 = input_width % 16;
int w2 = _w1 / 4;
int _w2 = _w1 % 4;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < input_height; ph += 2) {
const float *in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width;
const float *in_ptr2 = in_ptr1 + input_width;
if (ph != input_height && ph + 1 >= input_height) {
in_ptr2 = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * input_width));
memset(static_cast<void *>(const_cast<float *>(in_ptr2)), -FLT_MAX,
sizeof(float) * input_width);
}
float *out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width;
#if __ARM_NEON
#if __aarch64__
#else
asm volatile(
"subs %[w1], %[w1], #1 \n\t"
"blt end_w1_%= \n\t"
"loop_w1_%=: \n\t"
"pld [%[in_ptr1], #64] \n\t"
"pld [%[in_ptr2], #64] \n\t"
"vld1.f32 {q0, q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2, q3}, [%[in_ptr2]]! \n\t"
"vld1.f32 {q6, q7}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q8, q9}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q2 \n\t"
"vmax.f32 q1, q1, q3 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q9 \n\t"
"vpmax.f32 d8, d0, d1 \n\t"
"vpmax.f32 d9, d2, d3 \n\t"
"vpmax.f32 d10, d12, d13 \n\t"
"vpmax.f32 d11, d14, d15 \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"subs %[w1], %[w1], #1 \n\t"
"bge loop_w1_%= \n\t"
"end_w1_%=: \n\t"
"subs %[w2], %[w2], #1 \n\t"
"blt end_w2_%= \n\t"
"loop_w2_%=: \n\t"
"vld1.f32 {q0}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q1}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q1 \n\t"
"vpmax.f32 d4, d0, d1 \n\t"
"vst1.32 {d4}, [%[out_ptr]]! \n\t"
"subs %[w2], %[w2], #1 \n\t"
"bge loop_w2_%= \n\t"
"end_w2_%=: \n\t"
:
: [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
[in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9");
#endif
#endif
if (_w2 != 0) {
in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width + 16 * w1 +
4 * w2;
in_ptr2 = in_ptr1 + input_width;
out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
2 * w2;
if (_w2 == 1) {
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
} else if (_w2 == 2) {
float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
*out_ptr = (temp > temp1) ? temp : temp1;
} else if (_w2 == 3) {
float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
*out_ptr = (temp > temp1) ? temp : temp1;
out_ptr++;
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
}
}
}
}
}
}
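The inline assembly above consumes 16 (then 4) input columns per iteration; for reference, a plain scalar version of the same 2x2, stride-2 max pooling over one pair of input rows would look like this (a sketch with hypothetical names, ignoring the width tail and odd-height handling done above):

void max_pool_2x2_s2_row(const float *row0, const float *row1,
                         float *out, int input_width) {
  // Each output element is the max over a 2x2 window spanning the two rows.
  for (int ow = 0; ow < input_width / 2; ++ow) {
    const int iw = 2 * ow;
    float m0 = row0[iw] > row1[iw] ? row0[iw] : row1[iw];                  // column-wise max (vmax)
    float m1 = row0[iw + 1] > row1[iw + 1] ? row0[iw + 1] : row1[iw + 1];
    out[ow] = m0 > m1 ? m0 : m1;                                           // pairwise max (vpmax)
  }
}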
void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int w1 = input_width / 16;
int _w1 = input_width % 16;
int w2 = _w1 / 4;
int _w2 = _w1 % 4;
float quarter = 0.25;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < input_height; ph += 2) {
const float *in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width;
const float *in_ptr2 = in_ptr1 + input_width;
if (ph + 1 >= input_height) {
in_ptr2 = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * input_width));
memset(static_cast<void *>(const_cast<float *>(in_ptr2)), 0,
sizeof(float) * input_width);
}
float *out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width;
#if __ARM_NEON
#if __aarch64__
#else
asm volatile(
"subs %[w1], %[w1], #1 \n\t"
"blt end_w1_%= \n\t"
"loop_w1_%=: \n\t"
"pld [%[in_ptr1], #64] \n\t"
"pld [%[in_ptr2], #64] \n\t"
"vmov.f32 d0[0], %[quarter] \n\t"
"vld1.f32 {q1, q2}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q3, q4}, [%[in_ptr2]]! \n\t"
"vld1.f32 {q7, q8}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q9, q10}, [%[in_ptr2]]! \n\t"
"vadd.f32 q1, q1, q3 \n\t"
"vadd.f32 q2, q2, q4 \n\t"
"vadd.f32 q7, q7, q9 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"vpadd.f32 d10, d2, d3 \n\t"
"vpadd.f32 d11, d4, d5 \n\t"
"vpadd.f32 d12, d14, d15 \n\t"
"vpadd.f32 d13, d16, d17 \n\t"
"vmul.f32 q5, q5, d0[0] \n\t"
"vmul.f32 q6, q6, d0[0] \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"subs %[w1], %[w1], #1 \n\t"
"bge loop_w1_%= \n\t"
"end_w1_%=: \n\t"
"subs %[w2], %[w2], #1 \n\t"
"blt end_w2_%= \n\t"
"loop_w2_%=: \n\t"
"vld1.f32 {q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2}, [%[in_ptr2]]! \n\t"
"vadd.f32 q1, q1, q2 \n\t"
"vpadd.f32 d4, d2, d3 \n\t"
"vmul.f32 d4, d4, d0[0] \n\t"
"vst1.32 {d4}, [%[out_ptr]]! \n\t"
"subs %[w2], %[w2], #1 \n\t"
"bge loop_w2_%= \n\t"
"end_w2_%=: \n\t"
:
: [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
[in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr),
[quarter] "r"(quarter)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10");
#endif
#endif
if (_w2 != 0) {
in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width + 16 * w1 +
4 * w2;
in_ptr2 = in_ptr1 + input_width;
out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
2 * w2;
if (_w2 == 1) {
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
} else if (_w2 == 2) {
float temp = 0;
temp += *in_ptr1;
temp += *in_ptr2;
in_ptr1++;
in_ptr2++;
temp += *in_ptr1;
temp += *in_ptr2;
*out_ptr = 0.25 * temp;
} else if (_w2 == 3) {
float temp = 0;
temp += *in_ptr1++;
temp += *in_ptr2++;
temp += *in_ptr1++;
temp += *in_ptr2++;
*out_ptr = 0.25 * temp;
out_ptr++;
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
}
}
}
}
}
}
//}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
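Pool2x2Avgs2p0 mirrors the max-pooling routine, with vadd/vpadd plus a multiply by 0.25 in place of vmax/vpmax. The scalar equivalent per output element (again a hypothetical helper, tails aside):

void avg_pool_2x2_s2_row(const float *row0, const float *row1,
                         float *out, int input_width) {
  for (int ow = 0; ow < input_width / 2; ++ow) {
    const int iw = 2 * ow;
    // Mean of the 2x2 window: sum the four taps, scale by the 0.25 kept in d0[0] above.
    out[ow] = 0.25f * (row0[iw] + row0[iw + 1] + row1[iw] + row1[iw + 1]);
  }
}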
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "framework/tensor.h"
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::vector;
void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output);
void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
const Tensor *in_x, Tensor *out);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -22,10 +22,7 @@ namespace operators { ...@@ -22,10 +22,7 @@ namespace operators {
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
void QuantizeOp<DeviceType, T>::InferShape() const { void QuantizeOp<DeviceType, T>::InferShape() const {
auto input_dims = this->param_.input_->dims(); const auto &input_dims = this->param_.input_->dims();
const std::vector<int> &paddings = this->param_.paddings_;
input_dims[2] += 2 * paddings[0];
input_dims[3] += 2 * paddings[1];
this->param_.output_->Resize(input_dims); this->param_.output_->Resize(input_dims);
auto scale_dims = framework::make_ddim(std::vector<int>{1}); auto scale_dims = framework::make_ddim(std::vector<int>{1});
this->param_.online_scale_->Resize(scale_dims); this->param_.online_scale_->Resize(scale_dims);
......
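After this change the quantize op's output keeps exactly the input's shape; the 2*padding growth that used to be baked into InferShape is gone. For an NCHW input of {1, 3, 224, 224}:

// this->param_.output_->Resize(input_dims);   // -> {1, 3, 224, 224}, no padding added
// previously: {1, 3, 224 + 2 * paddings[0], 224 + 2 * paddings[1]}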
This diff is collapsed.
This diff is collapsed.
...@@ -324,10 +324,6 @@ if (NOT FOUND_MATCH) ...@@ -324,10 +324,6 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile) target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-relu-int8-op operators/test_fusion_conv_add_relu_int8_op.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-add-relu-int8-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile) target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.