Commit 59bc1423 authored by xiebaiyuan, committed by GitHub

Merge pull request #1334 from wzzju/add_pooling_int8

Add pooling int8
...@@ -24,6 +24,7 @@ const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8 = "fusion_conv_add_relu_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
...@@ -115,6 +116,7 @@ std::unordered_map<
    {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
    {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
    {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
    {G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input"}, {"Out"}}},
    {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
    {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
    {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
......
...@@ -99,6 +99,7 @@ extern const char *G_OP_TYPE_BOX_CODER;
extern const char *G_OP_TYPE_CONCAT;
extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
extern const char *G_OP_TYPE_FC;
......
...@@ -98,6 +98,24 @@ class OpRegistry {
  }
};
#define REGISTER_OPERATOR_INT8(op_type, op_class, device_name, device_type) \
template class op_class<device_type, int8_t>; \
template <typename Dtype, typename T> \
class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \
public: \
DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class); \
}; \
static paddle_mobile::framework::OperatorRegistrar< \
device_type, _OpClass_##op_type##_##device_name<device_type, int8_t>> \
__op_registrar_##op_type##_##device_name(#op_type); \
int TouchOpRegistrar_##op_type##_##device_name() { \
__op_registrar_##op_type##_##device_name.Touch(); \
return 0; \
}
#define REGISTER_OPERATOR_CPU_INT8(op_type, op_class) \
REGISTER_OPERATOR_INT8(op_type, op_class, cpu, paddle_mobile::CPU);
#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \
  template class op_class<device_type, float>; \
  template <typename Dtype, typename T> \
......
...@@ -153,7 +153,8 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
  paddle_mobile::operators::math::Gemm gemm;
  auto time1 = paddle_mobile::time();
  gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
             static_cast<float>(0), c, ldc, false,
             static_cast<float *>(nullptr));
  auto time2 = paddle_mobile::time();
  double cost = paddle_mobile::time_diff(time1, time2);
  paddle_mobile::memory::Free(a);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include "operators/fusion_conv_add_relu_int8_op.h"
#include <vector>
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddReluInt8Op<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_conv_add_relu_int8,
ops::FusionConvAddReluInt8Op);
#endif
#endif // FUSION_CONVADDRELU_INT8_OP
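Note on the shape computation above: math::ConvOutputSize is expected to apply the
standard convolution output-size formula. A minimal illustrative sketch (not the
library implementation):

  // Output extent of one spatial dimension for a dilated, padded, strided conv.
  inline int ConvOutputSizeSketch(int input, int filter, int dilation, int padding,
                                  int stride) {
    const int dkernel = dilation * (filter - 1) + 1;  // effective kernel extent
    return (input + 2 * padding - dkernel) / stride + 1;
  }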
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionConvAddReluInt8Op
: public framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam<DeviceType>,
operators::ConvAddReluKernel<DeviceType, T>> {
public:
FusionConvAddReluInt8Op(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam<DeviceType>,
operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDRELU_INT8_OP
...@@ -28,10 +28,24 @@ bool ConvAddReluKernel<CPU, float>::Init(FusionConvAddReluParam<CPU> *param) {
template <>
void ConvAddReluKernel<CPU, float>::Compute(
    const FusionConvAddReluParam<CPU> &param) {
  ConvAddReluCompute<float, float>(param);
}
template class ConvAddReluKernel<CPU, float>;
#ifdef FUSION_CONVADDRELU_INT8_OP
template <>
bool ConvAddReluKernel<CPU, int8_t>::Init(FusionConvAddReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddReluKernel<CPU, int8_t>::Compute(
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<int8_t, int32_t>(param);
}
template class ConvAddReluKernel<CPU, int8_t>;
#endif
}  // namespace operators
}  // namespace paddle_mobile
......
...@@ -25,21 +25,31 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <typename P, typename S>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
  int32_t axis = param.Axis();
  S *bias_data = bias.data<S>();
  Tensor *output = param.Output();
  output->mutable_data<P>();
  float alpha = 1.0f;
  float beta = 1.0f;

#ifdef FUSION_CONVADDRELU_INT8_OP
  Tensor scale = *param.InputScale();
  alpha = scale.data<float>()[0];
  beta = 0.0f;
#endif
  int32_t groups = param.Groups();
  std::vector<int32_t> strides = param.Strides();
  std::vector<int32_t> paddings = param.Paddings();
  std::vector<int32_t> dilations = param.Dilations();
  const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
...@@ -61,13 +71,13 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<P>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }
  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int32_t>(input->dims().size()));
  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
...@@ -77,17 +87,17 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
      output->numel() / (output->dims()[0] * output->dims()[1])};
  // convolution operator: im2col(or vol2col) + gemm
  int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
  int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
  math::Vol2ColFunctor<CPU, P> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col;
  for (int32_t i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    for (int32_t g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
      if (!is_expand) {
...@@ -97,7 +107,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
                                    paddings[1]},
               &col);
      } else if (data_dim == 3U) {
...@@ -108,9 +118,9 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      math::matmul(filter_slice, false, col_matrix, false, alpha, &out_slice,
                   beta, true, bias_data);
    }
  }
}
......
...@@ -106,10 +106,9 @@ inline void GemmConv(const ConvParam<CPU> &param) {
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      math::matmul(filter_slice, false, col_matrix, false,
                   static_cast<float>(1), &out_slice, static_cast<float>(0),
                   false, static_cast<Otype *>(nullptr));
    }
  }
}
......
...@@ -73,8 +73,9 @@ void MulCompute(const MulParam<CPU> &param) {
  }
  if (param.InputX()->type() == typeid(int8_t)) {
    out->mutable_data<int32_t>();
    math::matmul<float, int32_t>(x_matrix, false, y_matrix, false,
                                 static_cast<float>(1), out,
                                 static_cast<float>(0));
  } else {
    out->mutable_data<float>();
......
...@@ -23,20 +23,22 @@ namespace paddle_mobile {
namespace operators {
using framework::Tensor;
template <typename T, typename S>
void PoolBasic(std::string pooling_type, std::vector<int> ksize,
               std::vector<int> strides, std::vector<int> paddings,
               const Tensor *in_x, Tensor *out) {
  if (pooling_type == "max") {
    math::PoolFunctor<CPU, math::MaxPool<T>, T> pool2d_forward;
    math::MaxPool<T> pool_process;
    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
  } else if (pooling_type == "avg") {
    math::PoolFunctor<CPU, math::AvgPool<T, S>, T> pool2d_forward;
    math::AvgPool<T, S> pool_process;
    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
  }
}

template <typename P>
void PoolCompute(const PoolParam<CPU> &param) {
  const Tensor *in_x = param.Input();
...@@ -52,13 +54,26 @@ void PoolCompute(const PoolParam<CPU> &param) {
    LOG(paddle_mobile::LogLevel::kLOG_ERROR)
        << "Pool op only supports 2D and 3D input.";
  }
  if (param.isGlobalPooling()) {
    for (size_t i = 0; i < ksize.size(); ++i) {
      paddings[i] = 0;
      ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
    }
  }
if (in_x->type() == typeid(int8_t)) {
if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) {
if (strides[0] == strides[1] && strides[0] == 1) {
math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]);
} else if (strides[0] == strides[1] && strides[0] == 2) {
math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]);
} else {
math::Pool3x3Max_int8(strides, paddings, in_x, out);
}
} else {
PoolBasic<int8_t, int32_t>(pooling_type, ksize, strides, paddings, in_x,
out);
}
} else {
    if (ksize[0] == 3 && ksize[0] == ksize[1]) {
      if (pooling_type == "max") {
        if (strides[0] == strides[1] && strides[0] == 1 &&
...@@ -81,7 +96,8 @@ void PoolCompute(const PoolParam<CPU> &param) {
          paddings[1] == 0) {
#if __ARM_NEON
#if __aarch64__
        PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
                                out);
#else
        /// todo: fix bug in Pool2x2
        if (pooling_type == "max") {
...@@ -91,11 +107,14 @@ void PoolCompute(const PoolParam<CPU> &param) {
        }
#endif
#else
        PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
                                out);
#endif  // __ARM_NEON
      } else {
        PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
                                out);
      }
    }
  }
......
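The int8 branch above instantiates PoolBasic<int8_t, int32_t>, i.e. math::AvgPool<T, S>
with an int32 accumulator. A minimal sketch of the assumed rationale (the helper below
is illustrative, not library code): summing a k*k window of int8 values can overflow
int8, so the window is accumulated in 32-bit and only the final average is narrowed.

  #include <cstdint>
  // Illustrative only: average one k*k window of int8 data with an int32 accumulator.
  inline int8_t AvgPoolWindowInt8(const int8_t *window, int k) {
    int32_t sum = 0;  // 32-bit accumulator, as in math::AvgPool<int8_t, int32_t>
    for (int i = 0; i < k * k; ++i) sum += window[i];
    int32_t avg = sum / (k * k);  // truncating average
    if (avg > 127) avg = 127;     // clamp into the int8 range
    if (avg < -128) avg = -128;
    return static_cast<int8_t>(avg);
  }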
...@@ -2924,6 +2924,7 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#endif  // __ARM_NEON

// 32-bit float matrix multiplication
template <>
void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
                 const float *B, int ldb, float beta, float *C, int ldc,
                 bool relu, float *bias) {
...@@ -3146,6 +3147,7 @@ void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
}

// 32-bit float matrix multiplication
template <>
void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
                     const float *B, int ldb, float beta, float *C, int ldc,
                     bool relu, float *bias) {
......
...@@ -15,6 +15,10 @@ limitations under the License. */
#pragma once
#include <string>
#include "common/log.h"
#include "memory/t_malloc.h"
#ifdef _OPENMP
#include <omp.h>
#endif
// Matrix element access macros, assuming row-major storage
#define A(i, j) A[(i)*lda + (j)]
...@@ -23,10 +27,12 @@ limitations under the License. */
#if __aarch64__
#define MR_INT8 4
#define NR_INT8 2
#define MR 6
#define NR 16
#else
#define MR_INT8 4
#define NR_INT8 2
#define MR 6
#define NR 8
#endif
...@@ -161,11 +167,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                 float *new_bias);
*/
// 32-bit float matrix multiplication, applying batchnorm to the result
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                 const float *B, int ldb, float beta, float *C, int ldc,
...@@ -174,11 +175,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                 const float *B, int ldb, float *C, int ldc, float *p,
                 std::string mode, float *bias, float *bias1);
// 32-bit float matrix multiplication with batchnorm (OpenMP multi-threaded version)
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
                     int lda, const float *B, int ldb, float beta, float *C,
...@@ -193,52 +189,67 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 8 bits int small block inner product
void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
               int32_t ldc);
void AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
               int32_t ldc);

// 8 bits int inner product
template <typename Otype>
void InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
                 const int8_t *b, float beta, int32_t *c, Otype *C,
                 int32_t ldc, bool relu);
template <typename Otype>
void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a,
const int8_t *b, float beta, int32_t *c, Otype *C,
int32_t ldc, bool relu, int32_t *bias);
// 8 bits int pack function
void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
                    int32_t lda, int8_t *buffer);
void PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
                    int32_t lda, int8_t *buffer);
void PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
                    int32_t ldb, int8_t *buffer);
void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
                        int32_t lda, int8_t *buffer);
void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
                        int32_t ldb, int8_t *buffer);
void PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer);
void PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer);
// 8 bits int matrix product
template <typename Itype, typename Btype, typename Otype>
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
               int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
               int32_t ldc, bool relu, Btype *bias);
template <typename Otype>
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
Otype *C, int32_t ldc, bool relu, int32_t *bias);
template <typename Itype, typename Btype, typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, Btype *bias);
template <typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, int32_t *bias);
// 8 bits int write back
// C = A * B
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
// C = A * B + bias, scale * relu(C)
void WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
                           int32_t ldc, int32_t *bias, float scale);
// C = A * B + bias, scale * C
void WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
                       int32_t ldc, int32_t *bias, float scale);

 private:
  int MC = 0;
...@@ -254,10 +265,200 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
  // 8 bits int
  int8_t *packedA_int8;
  int8_t *packedB_int8;
  int32_t *packedC_int32;
  int8_t *zero_int8;
};
// 8 bits int matrix product (m*k x k*n)
template <typename Otype>
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
Otype *C, int32_t ldc, bool relu, int32_t *bias) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024;
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
NC = NR_INT8;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
}
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) {
InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu);
} else {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + i);
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32);
paddle_mobile::memory::Free(zero_int8);
}
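// Usage sketch (illustrative values; in the conv/mul kernels this entry point is
// reached through math::matmul): with int8 inputs A (m x k) and B (k x n), a per-row
// int32 bias, and the output type deduced from C, e.g.
//   Gemm gemm;
//   int8_t *C8 = ...;  // quantized output, written back as scale * relu(A*B + bias)
//   gemm.Sgemm(m, n, k, /*alpha=*/scale, A, /*lda=*/k, B, /*ldb=*/n,
//              /*beta=*/0.f, C8, /*ldc=*/n, /*relu=*/true, bias);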
// 8 bits int matrix product (m*k x k*n), omp version
template <typename Otype>
void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
float beta, Otype *C, int32_t ldc, bool relu,
int32_t *bias) {
#ifdef _OPENMP
int32_t max_threads = omp_get_max_threads();
#else
int32_t max_threads = 1;
#endif
int32_t L1 = 64 / max_threads * 1024;
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
if (m > n) {
// partition A into blocks
MC = L1 / (KC * sizeof(int8_t));
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// pad B up to a multiple of NR_INT8
NC = (n + NR_INT8 - 1) / NR_INT8 * NR_INT8;
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__
// TODO()
#else
PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8);
#endif
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
} else {
// partition B into blocks
NC = L1 / (KC * sizeof(int8_t));
if (NC == 0) {
NC = NR_INT8;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
}
// pad A up to a multiple of MR_INT8
MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__
// TODO()
#else
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8);
#endif
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
}
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
if (m > n) {
#pragma omp parallel for
for (int32_t i = 0; i < m; i += MC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t mc;
mc = s_min(m - i, MC);
int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
#else
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif
if (bias == nullptr) {
InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu);
} else {
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu, bias + i);
}
}
} else {
#pragma omp parallel for
for (int32_t j = 0; j < n; j += NC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t nc;
nc = s_min(n - j, NC);
int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
#else
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B);
#endif
if (bias == nullptr) {
InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu);
} else {
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu, bias);
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32);
paddle_mobile::memory::Free(zero_int8);
}
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
...@@ -14,10 +14,11 @@ limitations under the License. */
#include <string.h>
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h" #include "operators/math/gemm.h"
#if __ARM_NEON #if __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#include <iostream>
#endif
#ifdef _OPENMP
#include <omp.h>
...@@ -30,7 +31,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                     int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO()
#else
  const int8_t *a_ptr, *b_ptr;
  a_ptr = a;
...@@ -62,7 +63,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
      "pld [%[b_ptr], #128] \n\t"
      "vld1.s8 {d0-d3}, [%[a_ptr]]! \n\t"   // load A 8 cols
      "vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t"  // load B first 4 rows
      "vmovl.s8 q2, d0 \n\t"                // process B first
                                            // rows
      "vmovl.s8 q3, d8 \n\t"
      "vmlal.s16 q8, d6, d4[0] \n\t"
...@@ -241,12 +242,141 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
#endif  // __ARM_NEON
}
// The core idea of the AddDot4x2 function is borrowed from Google's gemmlowp
// open source library (https://github.com/google/gemmlowp).
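// Roughly: the inner loop consumes A in 4x16 panels and B in 16x2 panels. Each
// vmull.s8 multiplies eight int8 lanes into int16 products, the paired vmlal.s8
// accumulates the other eight lanes, and vpadal.s16 folds the int16 pairs into
// the int32 accumulators q8-q15; the trailing vpadd.s32 steps then reduce them
// to the final 4x2 block of int32 results stored to c.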
void Gemm::AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
#define PADDLE_LABEL_LOOP "1"
#define PADDLE_LABEL_AFTER_LOOP "2"
asm volatile(
"lsl %[ldc], %[ldc], #2 \n\t" // sizeof(int32) == 4
"vldr d0, [%[b], #0] \n\t"
"vmov.s32 q8, #0 \n\t"
"vldr d4, [%[a], #0] \n\t"
"vmov.s32 q9, q8 \n\t"
"vldr d2, [%[b], #16] \n\t"
"vmov.s32 q10, q8 \n\t"
"vldr d6, [%[a], #16] \n\t"
"vmov.s32 q11, q8 \n\t"
"vldr d1, [%[b], #8]\n\t"
"vmov.s32 q12, q8 \n\t"
"vldr d5, [%[a], #8]\n"
"vmov.s32 q13, q8 \n\t"
"vldr d3, [%[b], #24]\n\t"
"vmov.s32 q14, q8 \n\t"
"vldr d7, [%[a], #24]\n"
"vmov.s32 q15, q8 \n\t"
PADDLE_LABEL_LOOP
": \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"add %[b], %[b], #32 \n\t"
"vmull.s8 q5, d2, d4 \n\t"
"vldr d4, [%[a], #32] \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vldr d6, [%[a], #48] \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vmlal.s8 q5, d3, d5 \n\t"
"vldr d5, [%[a], #40] \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vldr d7, [%[a], #56] \n\t"
"vpadal.s16 q8, q4 \n\t" // pairwise-add
"add %[a], %[a], #64 \n\t"
"vpadal.s16 q9, q5 \n\t"
"subs %[k], %[k], #16 \n\t"
"vpadal.s16 q10, q6 \n\t"
"vpadal.s16 q11, q7 \n\t"
"beq " PADDLE_LABEL_AFTER_LOOP
"f \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"vmull.s8 q5, d2, d4 \n\t"
"vldr d4, [%[a], #0] \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vldr d0, [%[b], #0] \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vldr d2, [%[b], #16] \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vldr d6, [%[a], #16] \n\t"
"vmlal.s8 q5, d3, d5 \n\t"
"vldr d5, [%[a], #8] \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vldr d1, [%[b], #8] \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vldr d3, [%[b], #24] \n\t"
"vpadal.s16 q12, q4 \n\t" // pairwise-add
"vldr d7, [%[a], #24] \n\t"
"vpadal.s16 q13, q5 \n\t"
"vpadal.s16 q14, q6 \n\t"
"vpadal.s16 q15, q7 \n\t"
"b " PADDLE_LABEL_LOOP "b \n\t"
PADDLE_LABEL_AFTER_LOOP
": \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"vmull.s8 q5, d2, d4 \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vmlal.s8 q5, d3, d5 \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vpadal.s16 q12, q4 \n\t" // pairwise-add
"vpadal.s16 q13, q5 \n\t"
"vpadal.s16 q14, q6 \n\t"
"vpadal.s16 q15, q7 \n\t"
"vpadd.s32 d0, d16, d17 \n\t" // reduce to int32
"vpadd.s32 d1, d18, d19 \n\t"
"vpadd.s32 d2, d20, d21 \n\t"
"vpadd.s32 d3, d22, d23 \n\t"
"vpadd.s32 d4, d24, d25 \n\t"
"vpadd.s32 d5, d26, d27 \n\t"
"vpadd.s32 d6, d28, d29 \n\t"
"vpadd.s32 d7, d30, d31 \n\t"
"vpadd.s32 d8, d0, d1 \n\t" // reduce to int32 again
"vpadd.s32 d9, d2, d3 \n\t"
"vpadd.s32 d10, d4, d5 \n\t"
"vpadd.s32 d11, d6, d7 \n\t"
"vst1.32 {d8}, [%[c]], %[ldc] \n\t"
"vst1.32 {d9}, [%[c]], %[ldc] \n\t"
"vst1.32 {d10}, [%[c]], %[ldc] \n\t"
"vst1.32 {d11}, [%[c]] \n\t"
: [k] "+r"(k), [a] "+r"(a), [b] "+r"(b), [c] "+r"(c)
: [ldc] "r"(ldc)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10", "q11", "q12", "q13", "q14", "q15");
#undef PADDLE_LABEL_AFTER_LOOP
#undef PADDLE_LABEL_LOOP
#endif // __aarch64__
#endif // __ARM_NEON
}
// 8 bits int small block inner product
void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                     int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
  const int8_t *a_ptr, *b_ptr;
  a_ptr = a;
...@@ -539,51 +669,225 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
}

// 8 bits int inner product
template <>
void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
                       const int8_t *b, float beta, int32_t *c, int8_t *C,
                       int32_t ldc, bool relu) {}

template <>
void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
                       const int8_t *b, float beta, int32_t *c, int32_t *C,
                       int32_t ldc, bool relu) {
#pragma omp parallel for
  for (int32_t j = 0; j < nc; j += NR_INT8) {
    for (int32_t i = 0; i < mc; i += MR_INT8) {
#if __aarch64__
      // TODO
#else
      // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif  // __aarch64__
    }
  }
  if (!relu) {
    WriteBasic(mc, nc, c, C, ldc);
    return;
  }
}

template <>
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
                               const int8_t *a, const int8_t *b, float beta,
                               int32_t *c, int8_t *C, int32_t ldc, bool relu,
                               int32_t *bias) {
#pragma omp parallel for
  for (int32_t j = 0; j < nc; j += NR_INT8) {
    for (int32_t i = 0; i < mc; i += MR_INT8) {
#if __aarch64__
      // TODO
#else
      // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif  // __aarch64__
    }
  }
  if (relu) {
    WriteWithAddReluScale(mc, nc, c, C, ldc, bias, alpha);
    return;
  } else {
    WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha);
  }
}
template <>
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
const int8_t *a, const int8_t *b, float beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int32_t *bias) {}
// 8 bits int PackMatrixA_4r
void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
const int8_t *a3 = A + (i + 3) * lda;
int8_t *local_buffer = buffer + i * KC;
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (m_tail != 0) {
const int8_t *a0 = &A(i_length, 0);
const int8_t *a1 = a0 + lda;
const int8_t *a2 = a0 + 2 * lda;
const int8_t *a3 = a0 + 3 * lda;
int8_t *local_buffer = buffer + i_length * KC;
switch (m_tail) {
case 1:
a1 = zero_int8;
case 2:
a2 = zero_int8;
case 3:
a3 = zero_int8;
break;
default:
break;
}
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
    }
  }
}
// 8 bits int PackMatrixA_4r
void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
                          int32_t lda, int8_t *buffer) {
  const int8_t *a0, *a1, *a2, *a3;
  for (int32_t i = 0; i < m - m_tail; i += 4) {
    a0 = A + i * lda;
    a1 = A + (i + 1) * lda;
    a2 = A + (i + 2) * lda;
...@@ -625,7 +929,7 @@ void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
                          int32_t lda, int8_t *buffer) {
  const int32_t i_length = m - m_tail;
  for (int32_t i = 0; i < i_length; i += 6) {
    const int8_t *a0 = A + i * lda;
    const int8_t *a1 = A + (i + 1) * lda;
    const int8_t *a2 = A + (i + 2) * lda;
...@@ -676,17 +980,85 @@ void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
  }
}
// 8 bits int PackMatrixB
void Gemm::PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
for (int32_t j = 0; j < j_length; j += 2) {
int8_t *local_buffer = buffer + j * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j);
const int8_t *b1 = &B((i << 4), j + 1);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b1;
b1 += ldb;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j);
const int8_t *b1 = &B((k_count << 4), j + 1);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b1;
b1 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (n_tail != 0) {
int8_t *local_buffer = buffer + j_length * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j_length);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = 0;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j_length);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
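// Layout note (descriptive comment added for readability): PackMatrixB_2c_16
// stores B in panels of 2 columns, each panel holding 16 consecutive k-values of
// column j followed by 16 of column j+1, zero-padded up to KC; this matches the
// 16-deep, 2-column consumption pattern of AddDot4x2 above.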
// 8 bits int PackMatrixB
void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
                          int32_t ldb, int8_t *buffer) {
  const int32_t j_length = n - n_tail;
  for (int32_t j = 0; j < j_length; j += 8) {
    int8_t *local_buffer = buffer + j * k;
    for (int32_t i = 0; i < k; ++i) {
      const int8_t *b0 = &B(i, j);
#if __ARM_NEON
#if __aarch64__
      // TODO
#else
      asm volatile(
          // "pld [%[b0]] \n\t"
...@@ -715,94 +1087,27 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
    for (int32_t j = j_length; j < n; ++j) {
      *local_buffer++ = *b0++;
    }
    for (int32_t j = n; j < j_length + 8; ++j) {
      *local_buffer++ = 0;
    }
  }
}
// 8 bits int matrix product (m*k x k*n)
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, int8_t beta,
int32_t *C, int32_t ldc, bool relu, int8_t *bias) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024;
KC = k;
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
NC = NR;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR - 1) / NR * NR;
}
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int8 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
// PackMatrixA_6r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int8, &C(i, j), ldc, relu, nullptr);
} else {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int8, &C(i, j), ldc, relu, bias + i);
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int8);
paddle_mobile::memory::Free(zero_int8);
}
// 8 bits int write back
// C = A * B
void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
                      int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
  int32_t nc1 = nc >> 4;
  int32_t _nc1 = nc & 15;
  int32_t step = sizeof(int32_t) * ldc;
  int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 4));
  int32_t volatile m = mc;
  int32_t volatile n = nc1;
  int32_t *volatile c_ptr, *volatile C_ptr;
  int32_t *C0, *c0;
  c_ptr = c;
...@@ -836,7 +1141,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
        "end_mc_%=: \n\t"
        :
        : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
          [step] "r"(step), [step1] "r"(step1)
        : "memory", "r5", "r6", "q0", "q1", "q2", "q3");
  }
...@@ -854,20 +1159,254 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
#endif  // __ARM_NEON
}
// C = A * B + bias, scale * C
void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
                             int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int32_t zero = 0;
int8_t narrow = -128;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"loop_mc_%=: \n\t"
"vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t"
"vdup.32 q13, d26[0] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q13 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vceq.s8 d15, d14, d24 \n\t"
"vsub.s8 d14, d14, d15 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [zero] "r"(zero), [narrow] "r"(narrow)
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q7", "q12", "q13", "q14", "q15");
}
int32_t nc_left;
int32_t *c0;
int8_t *C0;
int32_t bias_v;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
bias_v = *(bias_ptr + i);
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"vdup.32 q13, %[bias_v] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0),
[bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero),
[narrow] "r"(narrow)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
"q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
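A minimal scalar sketch of the per-element requantization the NEON block above performs (an assumed equivalent, requiring <algorithm> and <cstdint>; the helper name is illustrative): saturating bias add, float scaling, truncation toward zero by vcvt.s32.f32, saturating narrow to int8, and a final remap of -128 to -127 so the output range stays symmetric.

static inline int8_t requantize_sketch(int32_t acc, int32_t bias, float scale) {
  int64_t sum = static_cast<int64_t>(acc) + bias;                        // vqadd.s32 (saturating add)
  sum = std::max<int64_t>(INT32_MIN, std::min<int64_t>(INT32_MAX, sum));
  int32_t q = static_cast<int32_t>(static_cast<float>(sum) * scale);     // vcvt.f32.s32, vmul.f32, vcvt.s32.f32
  q = std::max(-128, std::min(127, q));                                  // vqmovn.s32 then vqmovn.s16
  return static_cast<int8_t>(q == -128 ? -127 : q);                      // vceq.s8/vsub.s8 map -128 to -127
}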
// C = A * B + bias, scale * relu(C)
void Gemm::WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int32_t zero = 0;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"loop_mc_%=: \n\t"
"vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t"
"vdup.32 q13, d26[0] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vmax.s32 q1, q1, q14 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
// C = A * B + bias "add %[C_ptr], %[C_ptr], %[step] \n\t"
void Gemm::WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C, "add %[c_ptr], %[c_ptr], %[step1] \n\t"
int32_t ldc, int8_t *bias) {} "subs %[mc], %[mc], #1 \n\t"
// C = A * B + C, relu(C) "bge loop_mc_%= \n\t"
void Gemm::WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C, "end_mc_%=: \n\t"
int32_t ldc) {}
// C = A * B + bias, relu(C) :
void Gemm::WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C, : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
int32_t ldc, int8_t *bias) {} [step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [zero] "r"(zero)
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q7", "q13", "q14", "q15");
}
int32_t nc_left;
int32_t *c0;
int8_t *C0;
int32_t bias_v;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
bias_v = *(bias_ptr + i);
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"vdup.32 q13, %[bias_v] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0),
[bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q13", "q14", "q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
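The relu variant above differs only in clamping the biased sum at zero (the vmax.s32 against q14) before scaling, so results are non-negative and the -128 remap is unnecessary; under the same assumptions and headers as the sketch above:

static inline int8_t requantize_relu_sketch(int32_t acc, int32_t bias, float scale) {
  int64_t sum = std::max<int64_t>(0, static_cast<int64_t>(acc) + bias);  // vqadd.s32, then vmax.s32 vs. zero
  int32_t q = static_cast<int32_t>(static_cast<float>(sum) * scale);     // scale and truncate toward zero
  return static_cast<int8_t>(std::min<int32_t>(127, q));                 // vqmovn saturation (only the top end can clip)
}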
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -27,130 +27,17 @@ namespace paddle_mobile { ...@@ -27,130 +27,17 @@ namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
// 8 bits int matrix product (m*k x k*n)
void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha,
const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
int8_t beta, int32_t *C, int32_t ldc, bool relu,
int8_t *bias) {
#ifdef _OPENMP
int32_t max_threads = omp_get_max_threads();
#else
int32_t max_threads = 1;
#endif
int32_t L1 = 64 / max_threads * 1024;
KC = k;
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
if (m > n) {
    // tile matrix A
MC = L1 / (KC * sizeof(int8_t));
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
    // pad matrix B
NC = (n + NR - 1) / NR * NR;
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixB_omp_8c(KC, n, n % NR, B, ldb, packedB_int8);
#endif
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
} else {
    // tile matrix B
NC = L1 / (KC * sizeof(int8_t));
if (NC == 0) {
NC = NR;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR - 1) / NR * NR;
}
    // pad matrix A
MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixA_omp_4r(m, KC, m % MR_INT8, A, lda, packedA_int8);
#endif
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
}
packedC_int8 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
if (m > n) {
#pragma omp parallel for
for (int32_t i = 0; i < m; i += MC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t mc;
mc = s_min(m - i, MC);
int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int8 + MC * NC * local_threads;
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu, bias + i);
}
} else {
#pragma omp parallel for
for (int32_t j = 0; j < n; j += NC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t nc;
nc = s_min(n - j, NC);
int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int8 + MC * NC * local_threads;
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, local_B);
#endif
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu, bias);
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int8);
paddle_mobile::memory::Free(zero_int8);
}
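For a concrete picture of the cache blocking above, here is the arithmetic for one assumed configuration (4 threads, m = 512 > n, k = 256, and a 4-row MR_INT8 micro-kernel; none of these numbers come from the source, and the snippet only needs <cstdint>):

static void blocking_example() {
  const int32_t max_threads = 4, m = 512, k = 256;
  const int32_t mr = 4;                            // MR_INT8, assumed to be the 4-row micro-kernel
  int32_t l1 = 64 / max_threads * 1024;            // 16384-byte L1 budget per thread
  int32_t kc = k;                                  // 256
  int32_t mc = l1 / (kc * static_cast<int32_t>(sizeof(int8_t)));  // 64 rows of A fit the budget
  int32_t mblock_num = (m + mc - 1) / mc;          // ceil(512 / 64) = 8 blocks of A
  mc = (m + mblock_num - 1) / mblock_num;          // 64 rows per block after balancing
  mc = (mc + mr - 1) / mr * mr;                    // round up to a multiple of the micro-kernel rows
  (void)mc;
}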
void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) { const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail; const int32_t j_length = n - n_tail;
#pragma omp parallel for #pragma omp parallel for
for (int32_t j = 0; j < j_length; j += NR) { for (int32_t j = 0; j < j_length; j += 8) {
int8_t *local_buffer = buffer + j * k; int8_t *local_buffer = buffer + j * k;
for (int32_t i = 0; i < k; ++i) { for (int32_t i = 0; i < k; ++i) {
const int8_t *b0 = &B(i, j); const int8_t *b0 = &B(i, j);
#if __ARM_NEON #if __ARM_NEON
#if __aarch64__ #if __aarch64__
// TODO(wzzju) // TODO
#else #else
asm volatile( asm volatile(
// "pld [%[b0]] \n\t" // "pld [%[b0]] \n\t"
...@@ -179,7 +66,7 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, ...@@ -179,7 +66,7 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
for (int32_t j = j_length; j < n; ++j) { for (int32_t j = j_length; j < n; ++j) {
*local_buffer++ = *b0++; *local_buffer++ = *b0++;
} }
for (int32_t j = n; j < j_length + NR; ++j) { for (int32_t j = n; j < j_length + 8; ++j) {
*local_buffer++ = 0; *local_buffer++ = 0;
} }
} }
...@@ -188,9 +75,9 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, ...@@ -188,9 +75,9 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) { const int8_t *A, int32_t lda, int8_t *buffer) {
const int i_length = m - m_tail; const int32_t i_length = m - m_tail;
#pragma omp parallel for #pragma omp parallel for
for (int32_t i = 0; i < i_length; i += MR_INT8) { for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda; const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda; const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda; const int8_t *a2 = A + (i + 2) * lda;
...@@ -221,12 +108,238 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, ...@@ -221,12 +108,238 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
default: default:
break; break;
} }
for (int j = 0; j < k; ++j) { for (int32_t j = 0; j < k; ++j) {
*local_buffer++ = *a0++;
*local_buffer++ = *a1++;
*local_buffer++ = *a2++;
*local_buffer++ = *a3++;
}
}
}
// 8 bits int PackMatrixA_omp_4r_16
void Gemm::PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
#pragma omp parallel for
for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
const int8_t *a3 = A + (i + 3) * lda;
int8_t *local_buffer = buffer + i * KC;
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (m_tail != 0) {
const int8_t *a0 = &A(i_length, 0);
const int8_t *a1 = a0 + lda;
const int8_t *a2 = a0 + 2 * lda;
const int8_t *a3 = a0 + 3 * lda;
int8_t *local_buffer = buffer + i_length * KC;
switch (m_tail) {
case 1:
a1 = zero_int8;
case 2:
a2 = zero_int8;
case 3:
a3 = zero_int8;
break;
default:
break;
}
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++; *local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++; *local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++; *local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++; *local_buffer++ = *a3++;
} }
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
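Written as a plain scalar reference, the packed layout produced above for one 4-row panel is: the four rows interleaved in 16-byte runs, with each row's k-tail zero-padded out to KC (a sketch assumed to match the NEON path; only <cstdint> is needed):

static void pack_a_4r_16_reference(const int8_t *A, int32_t lda, int32_t k,
                                   int32_t kc, int8_t *out) {
  const int32_t k_count = k >> 4;
  for (int32_t j = 0; j < k_count; ++j) {          // full 16-column chunks
    for (int32_t r = 0; r < 4; ++r) {
      for (int32_t l = 0; l < 16; ++l) {
        *out++ = A[r * lda + (j << 4) + l];        // row r, columns 16j .. 16j + 15
      }
    }
  }
  if ((k & 15) != 0) {                             // k tail: copy the rest, zero-pad to kc per row
    for (int32_t r = 0; r < 4; ++r) {
      for (int32_t j = k_count << 4; j < k; ++j) *out++ = A[r * lda + j];
      for (int32_t j = k; j < kc; ++j) *out++ = 0;
    }
  }
}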
// 8 bits int PackMatrixB_omp_2c_16
void Gemm::PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
#pragma omp parallel for
for (int32_t j = 0; j < j_length; j += 2) {
int8_t *local_buffer = buffer + j * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j);
const int8_t *b1 = &B((i << 4), j + 1);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b1;
b1 += ldb;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j);
const int8_t *b1 = &B((k_count << 4), j + 1);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b1;
b1 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (n_tail != 0) {
int8_t *local_buffer = buffer + j_length * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j_length);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = 0;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j_length);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < KC; ++j) {
*local_buffer++ = 0;
}
}
} }
} }
......
...@@ -34,12 +34,12 @@ struct GRUUnitFunctor<CPU, T> { ...@@ -34,12 +34,12 @@ struct GRUUnitFunctor<CPU, T> {
gemm.Sgemm_omp(batch_size, frame_size * 2, frame_size, 1, gemm.Sgemm_omp(batch_size, frame_size * 2, frame_size, 1,
value.prev_out_value, frame_size, value.gate_weight, value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1, value.gate_value, frame_size * 3, false, frame_size * 2, 1, value.gate_value, frame_size * 3, false,
nullptr); static_cast<float *>(nullptr));
#else #else
gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1, gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
value.prev_out_value, frame_size, value.gate_weight, value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1, value.gate_value, frame_size * 3, false, frame_size * 2, 1, value.gate_value, frame_size * 3, false,
nullptr); static_cast<float *>(nullptr));
#endif #endif
} }
...@@ -51,12 +51,12 @@ struct GRUUnitFunctor<CPU, T> { ...@@ -51,12 +51,12 @@ struct GRUUnitFunctor<CPU, T> {
gemm.Sgemm_omp(batch_size, frame_size, frame_size, 1, gemm.Sgemm_omp(batch_size, frame_size, frame_size, 1,
value.reset_output_value, frame_size, value.state_weight, value.reset_output_value, frame_size, value.state_weight,
frame_size, 1, value.gate_value + frame_size * 2, frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, nullptr); frame_size * 3, false, static_cast<float *>(nullptr));
#else #else
gemm.Sgemm(batch_size, frame_size, frame_size, 1, gemm.Sgemm(batch_size, frame_size, frame_size, 1,
value.reset_output_value, frame_size, value.state_weight, value.reset_output_value, frame_size, value.state_weight,
frame_size, 1, value.gate_value + frame_size * 2, frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, nullptr); frame_size * 3, false, static_cast<float *>(nullptr));
#endif #endif
} }
......
...@@ -28,7 +28,13 @@ template <typename T> ...@@ -28,7 +28,13 @@ template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false, framework::Tensor *matrix_out, T beta, bool relu = false,
T *bias = nullptr); float *bias = nullptr);
template <typename T, typename S>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
S *bias = nullptr);
template <typename T> template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
...@@ -20,11 +20,12 @@ limitations under the License. */ ...@@ -20,11 +20,12 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
template <> template <>
void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b, float alpha,
int8_t alpha, framework::Tensor *matrix_out, int8_t beta, framework::Tensor *matrix_out, float beta, bool relu,
bool relu, int8_t *bias) { int32_t *bias) {
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -52,21 +53,43 @@ void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -52,21 +53,43 @@ void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
} }
#ifdef _OPENMP #ifdef _OPENMP
if (bias != nullptr) {
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias);
} else {
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta, gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias); matrix_out->data<int32_t>(), N, relu, bias);
}
#else #else
if (bias != nullptr) {
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias);
} else {
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta, gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias); matrix_out->data<int32_t>(), N, relu, bias);
}
#endif #endif
} else { } else {
#ifdef _OPENMP #ifdef _OPENMP
if (bias != nullptr) {
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias);
} else {
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K, gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias); matrix_out->data<int32_t>(), N, relu, bias);
}
#else #else
if (bias != nullptr) {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K, gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N, matrix_b.data<int8_t>(), N, beta, matrix_out->data<int8_t>(),
relu, bias); N, relu, bias);
} else {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(),
N, relu, bias);
}
#endif #endif
} }
} }
......
...@@ -38,6 +38,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { ...@@ -38,6 +38,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
const int input_width = static_cast<int>(input->dims()[3]); const int input_width = static_cast<int>(input->dims()[3]);
const int output_height = static_cast<int>(output->dims()[2]); const int output_height = static_cast<int>(output->dims()[2]);
const int output_width = static_cast<int>(output->dims()[3]); const int output_width = static_cast<int>(output->dims()[3]);
output->mutable_data<float>();
const int hxw = input_height * input_width; const int hxw = input_height * input_width;
...@@ -472,7 +473,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) { ...@@ -472,7 +473,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
const int inputdata_channel_stride = h_in * w_in; const int inputdata_channel_stride = h_in * w_in;
const int input_batch_stride = output_channels * inputdata_channel_stride; const int input_batch_stride = output_channels * inputdata_channel_stride;
const int output_batch_stride = output_channels * outputdata_channel_stride; const int output_batch_stride = output_channels * outputdata_channel_stride;
float *out_data = output->data<float>(); float *out_data = output->mutable_data<float>();
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
for (int k = 0; k < batch_size; ++k) { for (int k = 0; k < batch_size; ++k) {
#pragma omp parallel for #pragma omp parallel for
......
...@@ -28,15 +28,21 @@ limitations under the License. */ ...@@ -28,15 +28,21 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
using framework::Tensor; void Pool3x3Avgs1p1(const framework::Tensor *input, framework::Tensor *output);
using std::vector; void Pool3x3Maxs1p1(const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Avgs1p1(const Tensor *input, Tensor *output); void Pool3x3Max(std::vector<int> strides, std::vector<int> paddings,
void Pool3x3Maxs1p1(const Tensor *input, Tensor *output); const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output); void Pool3x3Avg(std::vector<int> strides, std::vector<int> paddings,
const framework::Tensor *in_x, framework::Tensor *out);
void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
Tensor *out); void Pool3x3Maxs1_int8(const framework::Tensor *input,
framework::Tensor *output, int32_t pad_h, int32_t pad_w);
void Pool3x3Maxs2_int8(const framework::Tensor *input,
framework::Tensor *output, int32_t pad_h, int32_t pad_w);
void Pool3x3Max_int8(const std::vector<int> &strides,
const std::vector<int> &paddings,
const framework::Tensor *input, framework::Tensor *output);
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include <climits>
#include <iostream>
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::max;
using std::min;
using std::vector;
template <typename T>
static void make_paddings(const Tensor *input, Tensor *padded_input,
int32_t top, int32_t bottom, int32_t left,
int32_t right, T value) {
const int32_t batch_size = input->dims()[0];
const int32_t c_in = input->dims()[1];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int32_t h_padded = h_in + top + bottom;
const int32_t w_padded = w_in + left + right;
padded_input->Resize({batch_size, c_in, h_padded, w_padded});
T *padded_input_data = padded_input->mutable_data<T>();
const T *input_data = input->data<T>();
const int32_t input_channel_stride = h_in * w_in;
const int32_t input_batch_stride = c_in * input_channel_stride;
const int32_t padded_channel_stride = h_padded * w_padded;
const int32_t padded_batch_stride = c_in * padded_channel_stride;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < c_in; ++j) {
const T *img_in = input_data + j * input_channel_stride;
T *img_padded = padded_input_data + j * padded_channel_stride;
int k = 0;
for (; k < top; ++k) {
for (int l = 0; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
for (; k < top + h_in; ++k) {
int l = 0;
for (; l < left; ++l) {
img_padded[l] = value;
}
memcpy(img_padded + left, img_in, w_in * sizeof(T));
l += w_in;
img_in += w_in;
for (; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
for (; k < h_padded; ++k) {
for (int l = 0; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
}
input_data += input_batch_stride;
padded_input_data += padded_batch_stride;
}
}
void Pool3x3Maxs1_int8(const Tensor *input, Tensor *output, int32_t pad_h,
int32_t pad_w) {
Tensor padded_input;
if (pad_h != 0 && pad_w != 0) {
int8_t value = -SCHAR_MAX;
make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
input = &padded_input;
}
const int32_t batch_size = input->dims()[0];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int8_t *input_data = input->data<int8_t>();
const int32_t output_channels = output->dims()[1];
const int32_t h_out = output->dims()[2];
const int32_t w_out = output->dims()[3];
int8_t *output_data = output->mutable_data<int8_t>();
const int32_t outputdata_channel_stride = h_out * w_out;
const int32_t inputdata_channel_stride = h_in * w_in;
const int32_t input_batch_stride = output_channels * inputdata_channel_stride;
const int32_t output_batch_stride =
output_channels * outputdata_channel_stride;
// std::cout << "h_out = " << h_out << ", w_out=" << w_out << std::endl;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < output_channels; ++j) {
const int8_t *img_in = input_data + j * inputdata_channel_stride;
int8_t *img_out = output_data + j * outputdata_channel_stride;
for (int k = 0; k < h_out; ++k) {
const int8_t *row0 = img_in + k * w_in;
const int8_t *row1 = img_in + (k + 1) * w_in;
const int8_t *row2 = img_in + (k + 2) * w_in;
#if __ARM_NEON
int32_t nw = w_out >> 4;
int32_t left_w = w_out & 0xf;
int32_t nw1 = left_w >> 3;
int32_t left_w1 = left_w & 0x7;
#if __aarch64__
// TODO
#else
if (nw > 0) {
#define LOOP_LABEL "1"
// result: q15
asm volatile(
"vld1.8 {q0}, [%[row0]]! \n\t" // q0=0-15
"vld1.8 {q2}, [%[row1]]! \n\t"
"vld1.8 {q4}, [%[row2]]! \n\t"
LOOP_LABEL
": \n\t"
"vld1.8 {q1}, [%[row0]]! \n\t" // q1=16-31
"vext.8 q6, q0, q1, #1 \n\t"
"vext.8 q7, q0, q1, #2 \n\t"
"vld1.8 {q3}, [%[row1]]! \n\t"
"vmax.s8 q15, q0, q6 \n\t"
"vmax.s8 q15, q15, q7 \n\t"
"vext.8 q6, q2, q3, #1 \n\t"
"vext.8 q7, q2, q3, #2 \n\t"
"vld1.8 {q5}, [%[row2]]! \n\t"
"vmax.s8 q14, q2, q6 \n\t"
"vmax.s8 q14, q14, q7 \n\t"
"vext.8 q6, q4, q5, #1 \n\t"
"vext.8 q7, q4, q5, #2 \n\t"
"vmax.s8 q13, q4, q6 \n\t"
"vmax.s8 q13, q13, q7 \n\t"
"vmax.s8 q15, q15, q14 \n\t"
"vmax.s8 q15, q15, q13 \n\t"
"vmov.s8 q0, q1 \n\t"
"vmov.s8 q2, q3 \n\t"
"vmov.s8 q4, q5 \n\t"
"vst1.8 {q15}, [%[img_out]]! \n\t"
"subs %[nw], #1 \n\t"
"bne " LOOP_LABEL
"b \n\t"
"sub %[row0], #16 \n\t"
"sub %[row1], #16 \n\t"
"sub %[row2], #16 \n\t"
: [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
[row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q13", "q14", "q15");
#undef LOOP_LABEL
}
if (nw1 > 0 || left_w1 > 0) {
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm volatile(
"vld1.8 {d0}, [%[row0]]! \n\t" // d0=0-8
"vld1.8 {d2}, [%[row1]]! \n\t"
"vld1.8 {d4}, [%[row2]]! \n\t"
"mov r0, #1 \n\t"
"cmp %[nw1], #0 \n\t"
"beq " PADDLE_LABEL_LESS8
"f\n\t"
"vld1.8 {d1}, [%[row0]]! \n\t" // d1=9-15
"vext.8 d6, d0, d1, #1 \n\t"
"vext.8 d7, d0, d1, #2 \n\t"
"vld1.8 {d3}, [%[row1]]! \n\t"
"vmax.s8 d15, d0, d6 \n\t"
"vmax.s8 d15, d15, d7 \n\t"
"vext.8 d6, d2, d3, #1 \n\t"
"vext.8 d7, d2, d3, #2 \n\t"
"vld1.8 {d5}, [%[row2]]! \n\t"
"vmax.s8 d14, d2, d6 \n\t"
"vmax.s8 d14, d14, d7 \n\t"
"vext.8 d6, d4, d5, #1 \n\t"
"vext.8 d7, d4, d5, #2 \n\t"
"vmax.s8 d13, d4, d6 \n\t"
"vmax.s8 d13, d13, d7 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
"vmov.s8 d0, d1 \n\t"
"vmov.s8 d2, d3 \n\t"
"vmov.s8 d4, d5 \n\t"
"vst1.8 {d15}, [%[img_out]]! \n\t"
PADDLE_LABEL_LESS8
": \n\t"
"cmp %[left_w1], #0 \n\t"
"beq " PADDLE_LABEL_OVER
"f\n\t"
"vld1.8 {d1}, [%[row0]] \n\t" // d1=9-15
"vext.8 d6, d0, d1, #1 \n\t"
"vext.8 d7, d0, d1, #2 \n\t"
"vld1.8 {d3}, [%[row1]] \n\t"
"vmax.s8 d15, d0, d6 \n\t"
"vmax.s8 d15, d15, d7 \n\t"
"vext.8 d6, d2, d3, #1 \n\t"
"vext.8 d7, d2, d3, #2 \n\t"
"vld1.8 {d5}, [%[row2]] \n\t"
"vmax.s8 d14, d2, d6 \n\t"
"vmax.s8 d14, d14, d7 \n\t"
"vext.8 d6, d4, d5, #1 \n\t"
"vext.8 d7, d4, d5, #2 \n\t"
"vmax.s8 d13, d4, d6 \n\t"
"vmax.s8 d13, d13, d7 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
PADDLE_LABEL_LESS8_SAVE
": \n\t"
"vst1.8 {d15[0]}, [%[img_out]], r0\n\t"
"add %[row0], %[row0], #1 \n\t"
"add %[row1], %[row1], #1 \n\t"
"add %[row2], %[row2], #1 \n\t"
"vext.8 d15, d15, d15, #1 \n\t"
"subs %[left_w1], #1 \n\t"
"bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"
PADDLE_LABEL_OVER ": \n\t"
: [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
[row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7", "d13", "d14", "d15");
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t left = w_out;
while (left > 0) {
const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
*img_out = std::max(std::max(max0, max1), max2);
row0 += 1;
row1 += 1;
row2 += 1;
img_out++;
left--;
}
#endif // __ARM_NEON
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
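The 16-wide NEON loop above forms the horizontal 3-tap window with vext (q6 is the row shifted by one pixel, q7 by two) and reduces the three rows with vmax, yielding 16 outputs per iteration. A scalar sketch of one such iteration follows (assumed equivalent; needs <algorithm> and <cstdint>). The stride-2 kernel below gets the same window by deinterleaving even/odd pixels with vld2.8 first, so max(q0, q1) already covers two of the three taps.

static void max3x3_s1_ref16(const int8_t *row0, const int8_t *row1,
                            const int8_t *row2, int8_t *img_out) {
  for (int x = 0; x < 16; ++x) {
    int8_t m0 = std::max(std::max(row0[x], row0[x + 1]), row0[x + 2]);  // vext #1/#2 + vmax on row0
    int8_t m1 = std::max(std::max(row1[x], row1[x + 1]), row1[x + 2]);
    int8_t m2 = std::max(std::max(row2[x], row2[x + 1]), row2[x + 2]);
    img_out[x] = std::max(std::max(m0, m1), m2);                        // reduce across the three rows
  }
}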
void Pool3x3Maxs2_int8(const Tensor *input, Tensor *output, int32_t pad_h,
int32_t pad_w) {
Tensor padded_input;
if (pad_h != 0 && pad_w != 0) {
int8_t value = -SCHAR_MAX;
make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
input = &padded_input;
}
const int32_t batch_size = input->dims()[0];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int32_t output_channels = output->dims()[1];
const int32_t h_out = output->dims()[2];
const int32_t w_out = output->dims()[3];
const int32_t outputdata_channel_stride = h_out * w_out;
const int32_t inputdata_channel_stride = h_in * w_in;
const int32_t output_batch_stride =
output_channels * outputdata_channel_stride;
const int32_t input_batch_stride = output_channels * inputdata_channel_stride;
const int8_t *input_data = input->data<int8_t>();
int8_t *output_data = output->mutable_data<int8_t>();
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < output_channels; ++j) {
const int8_t *img_in = input_data + j * inputdata_channel_stride;
int8_t *img_out = output_data + j * outputdata_channel_stride;
for (int k = 0; k < h_out; ++k) {
const int8_t *row0 = img_in + 2 * k * w_in;
const int8_t *row1 = img_in + (2 * k + 1) * w_in;
const int8_t *row2 = img_in + (2 * k + 2) * w_in;
#if __ARM_NEON
int32_t nw = w_out >> 4;
int32_t left_w = w_out & 0xf;
int32_t nw1 = left_w >> 3;
int32_t left_w1 = left_w & 0x7;
#if __aarch64__
// TODO
#else
if (nw > 0) {
#define LOOP_LABEL "1"
// result: q15
asm volatile(
"vld2.8 {q0, q1}, [%[row0]]! \n\t" // q0=0-30, q1=1-31
"vld2.8 {q2, q3}, [%[row1]]! \n\t"
"vld2.8 {q4, q5}, [%[row2]]! \n\t"
LOOP_LABEL
": \n\t"
"vmax.s8 q15, q0, q1 \n\t"
"vld2.8 {q6, q7}, [%[row0]]! \n\t" // q0=32-62, q1=33-63
"vmax.s8 q14, q2, q3 \n\t"
"vmax.s8 q13, q4, q5 \n\t"
"vld2.8 {q8, q9}, [%[row1]]! \n\t"
"vext.8 q0, q0, q6, #1 \n\t"
"vmax.s8 q15, q15, q0 \n\t"
"vld2.8 {q10, q11}, [%[row2]]! \n\t"
"vext.8 q2, q2, q8, #1 \n\t"
"vmax.s8 q14, q14, q2 \n\t"
"vext.8 q4, q4, q10, #1 \n\t"
"vmax.s8 q13, q13, q4 \n\t"
"vmax.s8 q15, q15, q14 \n\t"
"vmax.s8 q15, q15, q13 \n\t"
"vmov.s8 q0, q6 \n\t"
"vmov.s8 q1, q7 \n\t"
"vmov.s8 q2, q8 \n\t"
"vmov.s8 q3, q9 \n\t"
"vmov.s8 q4, q10 \n\t"
"vmov.s8 q5, q11 \n\t"
"vst1.8 {q15}, [%[img_out]]! \n\t"
"subs %[nw], #1 \n\t"
"bne " LOOP_LABEL
"b \n\t"
"sub %[row0], #32 \n\t"
"sub %[row1], #32 \n\t"
"sub %[row2], #32 \n\t"
: [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
[row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q13", "q14", "q15");
#undef LOOP_LABEL
}
if (nw1 > 0 || left_w1 > 0) {
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm volatile(
"vld2.8 {d0, d1}, [%[row0]]! \n\t" // d0=0-14, d1=1-15
"vld2.8 {d2, d3}, [%[row1]]! \n\t"
"vld2.8 {d4, d5}, [%[row2]]! \n\t"
"mov r0, #1 \n\t"
"cmp %[nw1], #0 \n\t"
"beq " PADDLE_LABEL_LESS8
"f\n\t"
"vmax.s8 d15, d0, d1 \n\t"
"vld2.8 {d6, d7}, [%[row0]]! \n\t" // d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3 \n\t"
"vmax.s8 d13, d4, d5 \n\t"
"vld2.8 {d8, d9}, [%[row1]]! \n\t"
"vext.8 d0, d0, d6, #1 \n\t"
"vmax.s8 d15, d15, d0 \n\t"
"vld2.8 {d10, d11}, [%[row2]]! \n\t"
"vext.8 d2, d2, d8, #1 \n\t"
"vmax.s8 d14, d14, d2 \n\t"
"vext.8 d4, d4, d10, #1 \n\t"
"vmax.s8 d13, d13, d4 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
"vmov.s8 d0, d6 \n\t"
"vmov.s8 d1, d7 \n\t"
"vmov.s8 d2, d8 \n\t"
"vmov.s8 d3, d9 \n\t"
"vmov.s8 d4, d10 \n\t"
"vmov.s8 d5, d11 \n\t"
"vst1.8 {d15}, [%[img_out]]! \n\t"
PADDLE_LABEL_LESS8
": \n\t"
"cmp %[left_w1], #0 \n\t"
"beq " PADDLE_LABEL_OVER
"f\n\t"
"vmax.s8 d15, d0, d1 \n\t"
"vld2.8 {d6, d7}, [%[row0]] \n\t" // d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3 \n\t"
"vmax.s8 d13, d4, d5 \n\t"
"vld2.8 {d8, d9}, [%[row1]] \n\t"
"vext.8 d0, d0, d6, #1 \n\t"
"vmax.s8 d15, d15, d0 \n\t"
"vld2.8 {d10, d11}, [%[row2]] \n\t"
"vext.8 d2, d2, d8, #1 \n\t"
"vmax.s8 d14, d14, d2 \n\t"
"vext.8 d4, d4, d10, #1 \n\t"
"vmax.s8 d13, d13, d4 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
PADDLE_LABEL_LESS8_SAVE
": \n\t"
"vst1.8 {d15[0]}, [%[img_out]], r0\n\t"
"add %[row0], %[row0], #2 \n\t"
"add %[row1], %[row1], #2 \n\t"
"add %[row2], %[row2], #2 \n\t"
"vext.8 d15, d15, d15, #1 \n\t"
"subs %[left_w1], #1 \n\t"
"bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"
PADDLE_LABEL_OVER ": \n\t"
: [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
[row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7", "d8", "d9", "d10", "d11", "d13", "d14", "d15");
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t left = w_out;
while (left > 0) {
const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
*img_out = std::max(std::max(max0, max1), max2);
row0 += 2;
row1 += 2;
row2 += 2;
img_out++;
left--;
}
#endif // __ARM_NEON
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
void Pool3x3Max_int8(const vector<int> &strides, const vector<int> &paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
// const int _kernel_size = 3;
const int stride = strides[0];
// const int stride_width = strides[1];
const int padding = paddings[0];
// const int padding_width = paddings[1];
const int8_t negative_max = -SCHAR_MAX;
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int8_t *input_data = input->data<int8_t>();
int8_t *output_data = output->mutable_data<int8_t>();
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const int8_t *input_seg = input_data + c * input_channel_stride;
int8_t *output_seg = output_data + c * output_channel_stride;
for (int ph = 0; ph < output_height; ph++) {
int hstart = ph * stride - padding;
int hend = min(hstart + 3, input_height);
hstart = max(hstart, 0);
for (int pw = 0; pw < output_width; pw++) {
int wstart = pw * stride - padding;
int wend = min(wstart + 3, input_width);
wstart = max(wstart, 0);
const int8_t *pos1 = input_seg + hstart * input_width + wstart;
const int8_t *pos2 = input_seg + (hstart + 1) * input_width + wstart;
const int8_t *pos3 = input_seg + (hstart + 2) * input_width + wstart;
int8_t *output_ptr = output_seg + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
int8_t max_value = -SCHAR_MAX;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
int8_t value = input_seg[h * input_width + w];
if (value > max_value) {
max_value = value;
}
}
}
output_seg[ph * output_width + pw] = max_value;
} else {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.8 {d0}, [%[pos1]] \n\t"
"vld1.8 {d1}, [%[pos2]] \n\t"
"vld1.8 {d2}, [%[pos3]] \n\t"
"vmax.s8 d3, d0, d1 \n\t"
"vmax.s8 d4, d2, d3 \n\t"
"vmov.s8 d4[3], %[negative_max] \n\t"
"vpmax.s8 d5, d4, d4 \n\t"
"vpmax.s8 d6, d5, d5 \n\t"
"vst1.8 {d6[0]},[%[output_ptr]] \n\t"
:
: [pos1] "r"(pos1), [pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q0", "q1", "q2", "q3");
#endif
#else
const int8_t max0 = std::max(std::max(pos1[0], pos1[1]), pos1[2]);
const int8_t max1 = std::max(std::max(pos2[0], pos2[1]), pos2[2]);
const int8_t max2 = std::max(std::max(pos3[0], pos3[1]), pos3[2]);
*output_ptr = std::max(std::max(max0, max1), max2);
#endif // __ARM_NEON
}
}
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -70,15 +70,15 @@ class PoolFunctor<CPU, PoolProcess, T> { ...@@ -70,15 +70,15 @@ class PoolFunctor<CPU, PoolProcess, T> {
int wend = std::min(wstart + ksize_width, input_width); int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
T ele = pool_process.initial(); auto ele = pool_process.initial();
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
pool_process.compute(input_data[h * input_width + w], &ele); pool_process.compute(input_data[h * input_width + w], &ele);
} }
} }
int pool_size = (hend - hstart) * (wend - wstart); int pool_size = (hend - hstart) * (wend - wstart);
pool_process.finalize(static_cast<T>(pool_size), &ele); pool_process.finalize(static_cast<float>(pool_size), &ele);
output_data[ph * output_width + pw] = ele; output_data[ph * output_width + pw] = static_cast<T>(ele);
} }
} }
input_data += input_stride; input_data += input_stride;
...@@ -88,8 +88,10 @@ class PoolFunctor<CPU, PoolProcess, T> { ...@@ -88,8 +88,10 @@ class PoolFunctor<CPU, PoolProcess, T> {
} }
}; };
template class PoolFunctor<CPU, math::AvgPool<float>, float>; template class PoolFunctor<CPU, math::AvgPool<float, float>, float>;
template class PoolFunctor<CPU, math::MaxPool<float>, float>; template class PoolFunctor<CPU, math::MaxPool<float>, float>;
template class PoolFunctor<CPU, math::AvgPool<int8_t, int32_t>, int8_t>;
template class PoolFunctor<CPU, math::MaxPool<int8_t>, int8_t>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#pragma once #pragma once
#include <climits>
#include <cmath>
#include "common/log.h" #include "common/log.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#include "pool_2x2.h" #include "pool_2x2.h"
...@@ -37,24 +39,42 @@ namespace math { ...@@ -37,24 +39,42 @@ namespace math {
* in pool pooling, and finally takes the average. * in pool pooling, and finally takes the average.
* MaxPoolGrad and AvgPoolGrad are gradient operations respectively. * MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
*/ */
template <class T> template <typename T>
class MaxPool { class MaxPool {
public: public:
inline T initial() { return static_cast<T>(-FLT_MAX); } inline T initial() {
if (typeid(T) == typeid(int8_t)) {
return static_cast<T>(-SCHAR_MAX);
}
return static_cast<T>(-FLT_MAX);
}
inline void compute(const T &x, T *y) { *y = *y > x ? *y : x; } inline void compute(const T &x, T *y) { *y = *y > x ? *y : x; }
inline void finalize(const T &pool_field, T *y) {} inline void finalize(const T &pool_field, T *y) {}
}; };
template <class T> template <typename Itype, typename Otype>
class AvgPool { class AvgPool {
public: public:
inline T initial() { return static_cast<T>(0); } inline Otype initial() { return static_cast<Otype>(0); }
inline void compute(const T &x, T *y) { *y += x; } inline void compute(const Itype &x, Otype *y) { *y += x; }
inline void finalize(const T &pool_field, T *y) { *y /= pool_field; } inline void finalize(const float &pool_field, Otype *y) {
if (typeid(Itype) == typeid(int8_t)) {
float tmp = *y / pool_field;
if (tmp > SCHAR_MAX) {
*y = SCHAR_MAX;
} else if (tmp < -SCHAR_MAX) {
*y = -SCHAR_MAX;
} else {
*y = static_cast<Otype>(std::round(tmp));
}
} else {
*y /= pool_field;
}
}
}; };
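A small worked example of the int8 average-pooling path (assumed usage of the functor defined above): the window sum is kept in int32, divided by the actual pool size as a float, rounded to nearest, and saturated to the symmetric ±SCHAR_MAX range.

static void avg_pool_int8_example() {
  AvgPool<int8_t, int32_t> avg;
  int32_t acc = avg.initial();                    // 0
  const int8_t window[9] = {7, -3, 4, 9, 0, 2, 1, 5, -2};
  for (int8_t v : window) avg.compute(v, &acc);   // acc = 23
  avg.finalize(9.0f, &acc);                       // round(23 / 9.0f) = 3; a mean outside ±127 would clamp
}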
template <typename DeviceType, typename PoolProcess, typename T> template <typename DeviceType, typename PoolProcess, typename T>
......
...@@ -439,7 +439,7 @@ class ConvParam : public OpParam { ...@@ -439,7 +439,7 @@ class ConvParam : public OpParam {
#endif #endif
private: protected:
RType *input_; RType *input_;
RType *output_; RType *output_;
RType *filter_; RType *filter_;
...@@ -1707,7 +1707,19 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> { ...@@ -1707,7 +1707,19 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
FusionConvAddReluParam(const VariableNameMap &inputs, FusionConvAddReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) const AttributeMap &attrs, const Scope &scope)
: FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {} : FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {
#ifdef FUSION_CONVADDRELU_INT8_OP
scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
#endif
}
#ifdef FUSION_CONVADDRELU_INT8_OP
typedef typename DtypeTensorTrait<DeviceType>::gtype GType;
typedef typename DtypeTensorTrait<DeviceType>::rtype RType;
const RType *InputScale() const { return scale_; }
protected:
RType *scale_;
#endif
}; };
#endif #endif
......
...@@ -269,8 +269,8 @@ if (NOT FOUND_MATCH) ...@@ -269,8 +269,8 @@ if (NOT FOUND_MATCH)
#gen test #gen test
ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-pool paddle-mobile) target_link_libraries(test-pool-op paddle-mobile)
#gen test #gen test
ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
...@@ -324,6 +324,10 @@ if (NOT FOUND_MATCH) ...@@ -324,6 +324,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile) target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-relu-int8-op operators/test_fusion_conv_add_relu_int8_op.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-add-relu-int8-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile) target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <cstdlib> #include <cstdlib>
#include <ctime> #include <ctime>
#include <iostream> #include <iostream>
#include <limits>
#include <random> #include <random>
#include "../test_helper.h" #include "../test_helper.h"
#include "common/log.h" #include "common/log.h"
...@@ -54,6 +55,37 @@ void print_matirx(int m, int n, int ldc, int8_t *c) { ...@@ -54,6 +55,37 @@ void print_matirx(int m, int n, int ldc, int8_t *c) {
std::cout << std::endl; std::cout << std::endl;
} }
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
// round to zero
float round2zero(float v) {
  float res = 0.f;  // initialize so v == 0 returns 0 instead of an indeterminate value
if (v > 0)
res = std::floor(v);
else if (v < 0)
res = std::ceil(v);
return res;
}
int8_t qscale_int32(int32_t v, float scale) {
float res = static_cast<float>(v) * scale;
res = round2zero(res);
if (res > 127)
return static_cast<int8_t>(127);
else if (res < -127)
return static_cast<int8_t>(-127);
else
return static_cast<int8_t>(res);
}
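A few hand-computed spot checks of these helpers with the scale used in the tests below, as a sanity aid (values are illustrative):

// qscale_int32(20000, 0.00628f)  -> round2zero(125.6)  = 125
// qscale_int32(21000, 0.00628f)  -> 131.88, saturates to 127
// qscale_int32(-21000, 0.00628f) -> -131.88, saturates to -127 (symmetric clamp, not -128)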
int do_sgemm(int m, int n, int k, bool relu, int pr) { int do_sgemm(int m, int n, int k, bool relu, int pr) {
int lda = k; int lda = k;
int ldb = n; int ldb = n;
...@@ -126,10 +158,97 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) { ...@@ -126,10 +158,97 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
return 0; return 0;
} }
int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
int lda = k;
int ldb = n;
int ldc = n;
float scale = 0.00628f;
default_random_engine e;
  // int8_t is not a valid IntType for uniform_int_distribution; draw ints and narrow.
  uniform_int_distribution<int> pixel(-127, 127);
int8_t *a = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * k));
int8_t *b = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * k * n));
int8_t *c = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n));
int8_t *c1 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n));
int32_t *bias =
static_cast<int32_t *>(paddle_mobile::memory::Alloc(sizeof(int32_t) * m));
for (int i = 0; i < m * k; ++i) {
    a[i] = static_cast<int8_t>(pixel(e));
}
for (int i = 0; i < k * n; ++i) {
    b[i] = static_cast<int8_t>(pixel(e));
}
for (int i = 0; i < m; ++i) {
bias[i] = static_cast<int32_t>(pixel(e));
}
for (int i = 0; i < m; ++i) {
int32_t bias_v = bias[i];
for (int j = 0; j < n; ++j) {
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
r = qadd_int32(r, bias_v);
if (relu) r = std::max(0, r);
c1(i, j) = qscale_int32(r, scale);
}
}
paddle_mobile::operators::math::Gemm gemm;
#ifdef _OPENMP
gemm.Sgemm_omp(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias);
#else
gemm.Sgemm(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias);
#endif
int eq = 0;
int neq = 0;
for (int i = 0; i < m * n; ++i) {
if (c[i] == c1[i]) {
++eq;
} else {
++neq;
}
}
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
std::cout << "Bias:" << std::endl;
print_matirx(m, 1, 1, bias);
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
paddle_mobile::memory::Free(c1);
paddle_mobile::memory::Free(bias);
return 0;
}
int main() { int main() {
#ifdef _OPENMP #ifdef _OPENMP
omp_set_num_threads(8); omp_set_num_threads(4);
#endif #endif
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm without bias:" << std::endl;
do_sgemm(9, 9, 9, false, 1); do_sgemm(9, 9, 9, false, 1);
do_sgemm(10, 6, 12, false, 0); do_sgemm(10, 6, 12, false, 0);
do_sgemm(512, 256, 384, false, 0); do_sgemm(512, 256, 384, false, 0);
...@@ -140,5 +259,31 @@ int main() { ...@@ -140,5 +259,31 @@ int main() {
do_sgemm(333, 797, 939, false, 0); do_sgemm(333, 797, 939, false, 0);
do_sgemm(1024, 1024, 1024, false, 0); do_sgemm(1024, 1024, 1024, false, 0);
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with bias:" << std::endl;
do_sgemm_with_bias(9, 9, 9, false, 1);
do_sgemm_with_bias(10, 6, 12, false, 0);
do_sgemm_with_bias(512, 256, 384, false, 0);
do_sgemm_with_bias(1366, 768, 256, false, 0);
do_sgemm_with_bias(1255, 755, 333, false, 0);
do_sgemm_with_bias(599, 1133, 393, false, 0);
do_sgemm_with_bias(777, 555, 999, false, 0);
do_sgemm_with_bias(333, 797, 939, false, 0);
do_sgemm_with_bias(1024, 1024, 1024, false, 0);
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with relu and bias:" << std::endl;
do_sgemm_with_bias(9, 9, 9, true, 1);
do_sgemm_with_bias(10, 6, 12, true, 0);
do_sgemm_with_bias(512, 256, 384, true, 0);
do_sgemm_with_bias(1366, 768, 256, true, 0);
do_sgemm_with_bias(1255, 755, 333, true, 0);
do_sgemm_with_bias(599, 1133, 393, true, 0);
do_sgemm_with_bias(777, 555, 999, true, 0);
do_sgemm_with_bias(333, 797, 939, true, 0);
do_sgemm_with_bias(1024, 1024, 1024, true, 0);
return 0;
}
@@ -28,7 +28,7 @@ limitations under the License. */
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
Tensor aa, bb, cc;
auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n});
@@ -44,10 +44,12 @@ int main() {
ccptr[i] = 2;
}
Tensor aa_int8, bb_int8, cc_int32, cc_int8;
auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
auto ccptr_int32 = cc_int32.mutable_data<int32_t>({m, n});
auto ccptr_int8 = cc_int8.mutable_data<int8_t>({m, n});
int32_t* bias_data = new int32_t[m];
for (int i = 0; i < m * k; ++i) {
aaptr_int8[i] = static_cast<int8_t>(2);
@@ -56,7 +58,11 @@ int main() {
bbptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < m * n; ++i) {
ccptr_int32[i] = static_cast<int32_t>(2);
}
for (int i = 0; i < m; ++i) {
bias_data[i] = 2;
}
// float
@@ -76,22 +82,41 @@ int main() {
auto time2 = time();
std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
// int8_t without bias
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
auto time3 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
auto time4 = time();
std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
// int8_t with bias&relu
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data);
}
auto time5 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data);
}
auto time6 = time();
std::cout << "int8_t gemm_with_bias_relu cost :"
<< time_diff(time5, time6) / 10 << "ms\n";
delete[] bias_data;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/fusion_conv_add_relu_int8_op.h"
namespace paddle_mobile {
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
// round to zero
float round2zero(float v) {
float res = 0.f;  // initialized so that v == 0, handled by neither branch below, returns 0
if (v > 0)
res = std::floor(v);
else if (v < 0)
res = std::ceil(v);
return res;
}
int8_t qscale_int32(int32_t v, float scale) {
float res = static_cast<float>(v) * scale;
res = round2zero(res);
if (res > 127)
return static_cast<int8_t>(127);
else if (res < -127)
return static_cast<int8_t>(-127);
else
return static_cast<int8_t>(res);
}
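// A small worked example (values chosen here purely for illustration, not
// taken from the test) of how the helpers above combine on one accumulator:
//   qadd_int32(4000, 2)              -> 4002 (no overflow, plain sum)
//   4002 * 0.000828f = 3.313656      -> round2zero -> 3.0f
//   qscale_int32(4002, 0.000828f)    -> static_cast<int8_t>(3)
//   qscale_int32(200000, 0.000828f)  -> 127  (165.6 clamped to the int8 max)
//   qscale_int32(-200000, 0.000828f) -> -127 (clamped symmetrically; -128 is never produced)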
// Reference convolution from Caffe for checking results.
// Accumulates through explicit loops over input, output, and filters.
template <typename T>
void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
const framework::Tensor *bias, const framework::AttributeMap &attrs,
framework::Tensor *output, float scale) {
framework::AttrReader attr_reader(attrs);
std::vector<int> paddings = attr_reader.Get<std::vector<int>>("paddings");
std::vector<int> strides = attr_reader.Get<std::vector<int>>("strides");
std::vector<int> dilations = attr_reader.Get<std::vector<int>>("dilations");
int groups = attr_reader.Get<int>("groups");
int kernel_h = filter->dims()[2];
int kernel_w = filter->dims()[3];
int pad_h = paddings[0];
int pad_w = paddings[1];
int stride_h = strides[0];
int stride_w = strides[1];
int dilation_h = dilations[0];
int dilation_w = dilations[1];
auto in_shape = input->dims();
auto out_shape = output->dims();
const bool has_depth = 0;
int kernel_d, pad_d, stride_d, dilation_d;
if (has_depth) {
kernel_d = kernel_h;
stride_d = stride_h;
pad_d = pad_h;
dilation_d = dilation_h;
} else {
kernel_d = stride_d = dilation_d = 1;
pad_d = 0;
}
// Groups
int o_g = out_shape[1] / groups;
int k_g = in_shape[1] / groups;
int o_head, k_head;
// Convolution
vector<int> weight_offset(4 + has_depth);
vector<int> in_offset(4 + has_depth);
vector<int> out_offset(4 + has_depth);
auto offset = [](const framework::Tensor *input, const vector<int> &indics) {
framework::DDim shape = input->dims();
size_t count = 0;
for (int i = 0; i < indics.size(); ++i) {
count *= shape[i];
count += indics[i];
}
return count;
};
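// For reference, the offset lambda computes a plain row-major (NCHW) flat
// index; for instance, with a hypothetical filter of dims {8, 3, 5, 5} and
// indices {o, k, p, q} = {1, 2, 0, 4} it yields
// ((1 * 3 + 2) * 5 + 0) * 5 + 4 = 129.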
const T *in_data = input->data<T>();
const T *w_data = filter->data<T>();
framework::Tensor output_32;
int32_t *out_data_32 = output_32.mutable_data<int32_t>(out_shape);
memset(out_data_32, 0, output_32.numel() * sizeof(int32_t));
for (int n = 0; n < out_shape[0]; n++) {
for (int g = 0; g < groups; g++) {
o_head = o_g * g;
k_head = k_g * g;
for (int o = 0; o < o_g; o++) {
for (int k = 0; k < k_g; k++) {
for (int z = 0; z < (has_depth ? out_shape[2] : 1); z++) {
for (int y = 0; y < out_shape[2 + has_depth]; y++) {
for (int x = 0; x < out_shape[3 + has_depth]; x++) {
for (int r = 0; r < kernel_d; r++) {
for (int p = 0; p < kernel_h; p++) {
for (int q = 0; q < kernel_w; q++) {
int in_z = z * stride_d - pad_d + r * dilation_d;
int in_y = y * stride_h - pad_h + p * dilation_h;
int in_x = x * stride_w - pad_w + q * dilation_w;
if (in_z >= 0 && in_z < (has_depth ? in_shape[2] : 1) &&
in_y >= 0 && in_y < in_shape[2 + has_depth] &&
in_x >= 0 && in_x < in_shape[3 + has_depth]) {
weight_offset[0] = o + o_head;
weight_offset[1] = k;
if (has_depth) {
weight_offset[2] = r;
}
weight_offset[2 + has_depth] = p;
weight_offset[3 + has_depth] = q;
in_offset[0] = n;
in_offset[1] = k + k_head;
if (has_depth) {
in_offset[2] = in_z;
}
in_offset[2 + has_depth] = in_y;
in_offset[3 + has_depth] = in_x;
out_offset[0] = n;
out_offset[1] = o + o_head;
if (has_depth) {
out_offset[2] = z;
}
out_offset[2 + has_depth] = y;
out_offset[3 + has_depth] = x;
out_data_32[offset(output, out_offset)] +=
in_data[offset(input, in_offset)] *
w_data[offset(filter, weight_offset)];
}
}
}
}
}
}
}
}
}
}
}
T *out_data = output->mutable_data<T>();
int32_t n = out_shape[0];
int32_t c = out_shape[1];
int32_t h = out_shape[2];
int32_t w = out_shape[3];
const int32_t *bias_data = bias->data<int32_t>();
for (int i = 0; i < n; ++i) {
for (int j = 0; j < c; ++j) {
int32_t bias_v = bias_data[j];
for (int k = 0; k < h; ++k) {
for (int l = 0; l < w; ++l) {
int32_t tmp = out_data_32[i * c * h * w + j * h * w + k * w + l];
tmp = qadd_int32(tmp, bias_v);
tmp = std::max(0, tmp);
out_data[i * c * h * w + j * h * w + k * w + l] =
qscale_int32(tmp, scale);
}
}
}
}
}
template <typename T, int Kernel, int Pad, int Stride>
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
int pad_w = Pad;
int stride_h = Stride;
int stride_w = Stride;
int dilation_h = 1;
int dilation_w = 1;
int batch_size = 1;
int input_c = in_channels;
int input_h = in_height;
int input_w = in_width;
int output_c = out_channels;
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
framework::DDim filter_shape =
framework::make_ddim({output_c, input_c, kernel_h, kernel_w});
int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
int output_h = (input_h + 2 * pad_h - kernel_extent_h) / stride_h + 1;
int output_w = (input_w + 2 * pad_w - kernel_extent_w) / stride_w + 1;
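// As an illustration of the shape arithmetic above: with input_h = input_w =
// 224, kernel = 7, pad = 3, stride = 2 and dilation = 1, kernel_extent = 7 and
// output_h = output_w = (224 + 2 * 3 - 7) / 2 + 1 = 223 / 2 + 1 = 112
// (integer division).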
framework::DDim output_shape = framework::make_ddim(
std::vector<int>({batch_size, output_c, output_h, output_w}));
framework::DDim bias_shape = framework::make_ddim({output_c});
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["Input"] = std::vector<std::string>({"input"});
inputs["Filter"] = std::vector<std::string>({"filter"});
inputs["Scale"] = std::vector<std::string>({"scale"});
inputs["Y"] = std::vector<std::string>({"bias"});
outputs["Out"] = std::vector<std::string>({"output"});
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(input, input_shape, -127, 127);
auto filter_var = scope.get()->Var("filter");
auto filter = filter_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(filter, filter_shape, -127, 127);
auto scale_var = scope.get()->Var("scale");
auto scale = scale_var->template GetMutable<framework::LoDTensor>();
scale->Resize(framework::make_ddim({1}));
float scale_v = 0.000828f;
scale->mutable_data<float>()[0] = scale_v;
auto bias_var = scope.get()->Var("bias");
auto bias = bias_var->template GetMutable<framework::LoDTensor>();
SetupTensor<int32_t>(bias, bias_shape, -127, 127);
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["dilations"].Set<vector<int>>(
std::vector<int>({dilation_h, dilation_w}));
attrs["groups"].Set<int>(1);
attrs["axis"].Set<int>(0);
auto *op = new operators::FusionConvAddReluInt8Op<CPU, T>(
"fusion_conv_add_relu_int8", inputs, outputs, attrs, scope);
op->InferShape();
op->Init();
op->Run();
framework::Tensor output_cmp;
output_cmp.mutable_data<T>(output_shape);
conv2d<T>(input, filter, bias, attrs, &output_cmp, scale_v);
// compare results
int eq = 0;
int neq = 0;
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
T *output_cmp_data = output_cmp.data<T>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(
output_data[i] == output_cmp_data[i],
"The execution of test_fusion_conv_add_relu_int8_op is failed!");
if (output_data[i] == output_cmp_data[i]) {
++eq;
} else {
++neq;
}
}
std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main(int argc, char *argv[]) {
if (argc < 5) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-conv-add-relu-int8-op in_channels in_height in_width "
"out_channels\n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
<< " -in_width: int, input image's width\n"
<< " -out_channels: int, conv output channels\n";
return 1;
}
int in_channels = atoi(argv[1]);
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
int out_channels = atoi(argv[4]);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8_t, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 0, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 1, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 3, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 3, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 5, stride = 3
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
paddle_mobile::TestConvOp<int8_t, 7, 5, 3>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 4
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
paddle_mobile::TestConvOp<int8_t, 7, 3, 4>(in_channels, in_height, in_width,
out_channels);
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 5, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, 5, 2, 1>(in_channels, in_height, in_width,
out_channels);
}
#endif
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/mul_op.h"
@@ -79,14 +80,14 @@ int TestMulOP() {
PADDLE_MOBILE_ENFORCE(
output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
if (output_data[i] == c[i]) {
++eq;
} else {
++neq;
}
}
std::cout << "mnk=" << m << " " << n << " " << k << " eq=" << eq
<< " neq=" << neq << std::endl;
delete op;
return 0;
}
@@ -94,7 +95,7 @@ int TestMulOP() {
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
paddle_mobile::TestMulOP<int8_t, int32_t>();
paddle_mobile::TestMulOP<float, float>();
return 0;
@@ -12,30 +12,281 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <iostream>
#include <typeinfo>
#include "../test_include.h"
#include "operators/kernel/central-arm-func/pool_arm_func.h"
#include "operators/pool_op.h"
namespace paddle_mobile {
static int PoolOutputSize(int input_size, int filter_size, int padding,
int stride, bool ceil_mode) {
int output_size;
if (!ceil_mode) {
output_size = (input_size - filter_size + 2 * padding) / stride + 1;
} else {
output_size =
(input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
}
return output_size;
}
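// For example, with input_size = 112, filter_size = 3, padding = 1 and
// stride = 2, this gives (112 - 3 + 2) / 2 + 1 = 56 when ceil_mode is false,
// and (112 - 3 + 2 + 2 - 1) / 2 + 1 = 57 when ceil_mode is true (integer
// division in both cases).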
template <typename T>
static void PoolAvgPad0(std::vector<int> ksize, std::vector<int> strides,
const framework::Tensor *input,
framework::Tensor *out) {
const int32_t batch_size = input->dims()[0];
const int32_t input_c = input->dims()[1];
const int32_t input_h = input->dims()[2];
const int32_t input_w = input->dims()[3];
const int32_t out_c = out->dims()[1];
const int32_t out_h = out->dims()[2];
const int32_t out_w = out->dims()[3];
const int32_t kernel_h = ksize[0];
const int32_t kernel_w = ksize[1];
const int32_t stride_h = strides[0];
const int32_t stride_w = strides[1];
const int32_t inputdata_channel_stride = input_h * input_w;
const int32_t input_batch_stride = input_c * inputdata_channel_stride;
const int32_t outputdata_channel_stride = out_h * out_w;
const int32_t output_batch_stride = out_c * outputdata_channel_stride;
T *out_data = out->mutable_data<T>();
const T *input_data = input->data<T>();
const T **rows = new const T *[kernel_h];
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < out_c; ++j) {
const T *img_in = input_data + j * inputdata_channel_stride;
T *img_out = out_data + j * outputdata_channel_stride;
for (int k = 0; k < out_h; ++k) {
for (int m = 0; m < kernel_h; ++m) {
rows[m] = img_in + (stride_h * k + m) * input_w;
}
int32_t left = out_w;
while (left > 0) {
float tmp = 0;
for (int m = 0; m < kernel_h; ++m) {
for (int l = 0; l < kernel_w; ++l) {
tmp += rows[m][l];
}
}
if (typeid(T) == typeid(int8_t)) {
tmp = tmp / (kernel_h * kernel_w);
if (tmp < -127) {
*img_out = -127;
} else if (tmp > 127) {
*img_out = 127;
} else {
*img_out = static_cast<T>(std::round(tmp));
}
} else {
*img_out = static_cast<T>(tmp / (kernel_h * kernel_w));
}
for (int m = 0; m < kernel_h; ++m) {
rows[m] += stride_w;
}
img_out++;
left--;
}
}
}
input_data += input_batch_stride;
out_data += output_batch_stride;
}
delete[] rows;
}
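// A brief illustration of the int8_t branch above, with made-up window sums:
// a 3x3 window summing to 13 averages to 13 / 9 = 1.44..., which std::round
// maps to 1; a sum of 14 (1.55...) rounds to 2; and a window of nine -128
// values averages to exactly -128, which is clamped to -127 so the reference
// output stays inside the [-127, 127] range used throughout these tests.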
template <typename T, int CeilMode, int PoolType, int Kernel, int Pad,
int Stride>
int TestPoolOp(int in_channels, int in_height, int in_width) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
int pad_w = Pad;
int stride_h = Stride;
int stride_w = Stride;
bool ceil_mode = CeilMode != 0;
std::string pooling_type = (PoolType == 0 ? "max" : "avg");
int batch_size = 1;
int input_c = in_channels;
int input_h = in_height;
int input_w = in_width;
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
std::vector<int64_t> output_shape_v({batch_size, input_c});
output_shape_v.push_back(
PoolOutputSize(input_h, kernel_h, pad_h, stride_h, ceil_mode));
output_shape_v.push_back(
PoolOutputSize(input_w, kernel_w, pad_w, stride_w, ceil_mode));
framework::DDim output_shape = framework::make_ddim(output_shape_v);
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["X"] = std::vector<std::string>({"input"});
outputs["Out"] = std::vector<std::string>({"output"});
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(input, input_shape, -127, 127);
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["pooling_type"].SetString(pooling_type);
attrs["ksize"].Set<vector<int>>(std::vector<int>({kernel_h, kernel_w}));
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["ceil_mode"].Set<bool>(false);
attrs["global_pooling"].Set<bool>(false);
auto *op = new operators::PoolOp<CPU, float>("pool2d", inputs, outputs, attrs,
scope);
op->InferShape();
op->Init();
op->Run();
framework::Tensor output_cmp;
output_cmp.mutable_data<T>(output_shape);
if (pooling_type == "avg" && pad_h == 0 && pad_h == pad_w) {
PoolAvgPad0<T>(std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, input, &output_cmp);
} else {
if (typeid(T) == typeid(int8_t)) {
operators::PoolBasic<int8_t, int32_t>(
pooling_type, std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
input, &output_cmp);
} else {
operators::PoolBasic<float, float>(
pooling_type, std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
input, &output_cmp);
}
}
// compare results
int eq = 0;
int neq = 0;
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
T *output_cmp_data = output_cmp.data<T>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
"The execution of test_pool_op is failed!");
if (output_data[i] == output_cmp_data[i]) {
++eq;
} else {
++neq;
}
}
std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main(int argc, char *argv[]) {
if (argc < 4) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-pool-op in_channels in_height in_width \n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
<< " -in_width: int, input image's width\n";
return 1;
}
int in_channels = atoi(argv[1]);
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
#if __ARM_NEON
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
paddle_mobile::TestPoolOp<float, 0, 0, 3, 1, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
paddle_mobile::TestPoolOp<float, 0, 0, 3, 0, 2>(in_channels, in_height,
in_width);
#endif
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 2>(in_channels, in_height,
in_width);
// kernel = 3, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 2>(in_channels, in_height,
in_width);
// kernel = 3, pad = 2, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 2>(in_channels, in_height,
in_width);
// kernel = 3, pad = 3, stride = 3
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=3, stride=3";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 3, 3>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 1>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 2>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 3
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=3";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 3>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 3
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=3";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 3>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 1>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 4
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=4";
paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 4>(in_channels, in_height,
in_width);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1";
paddle_mobile::TestPoolOp<float, 0, 1, 5, 0, 1>(in_channels, in_height,
in_width);
}
@@ -213,6 +213,7 @@ if(NOT FOUND_MATCH)
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADDPRELU_OP ON)
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_CONVADDRELU_INT8_OP ON)
set(FUSION_FC_OP ON)
set(LRN_OP ON)
set(MUL_OP ON)
@@ -309,6 +310,9 @@ endif()
if (FUSION_CONVADDRELU_OP)
add_definitions(-DFUSION_CONVADDRELU_OP)
endif()
if (FUSION_CONVADDRELU_INT8_OP)
add_definitions(-DFUSION_CONVADDRELU_INT8_OP)
endif()
if (FUSION_CONVADDPRELU_OP)
add_definitions(-DFUSION_CONVADDPRELU_OP)
endif()