diff --git a/src/common/types.cpp b/src/common/types.cpp index cf2c4dc87613b4641d7c1126e22d2e4a45ff9594..734c0219ac2665792fee33272872cbdf325e0f08 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -32,6 +32,7 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu"; const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu"; const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu"; const char *G_OP_TYPE_FC = "fusion_fc"; +const char *G_OP_TYPE_FC_INT8 = "fusion_fc_int8"; const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add"; const char *G_OP_TYPE_LRN = "lrn"; const char *G_OP_TYPE_MUL = "mul"; @@ -111,12 +112,13 @@ std::unordered_map< {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, + {G_OP_TYPE_FC_INT8, {{"X", "Y", "Z", "Scale"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, {G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input", "Scale"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}, diff --git a/src/common/types.h b/src/common/types.h index a63d2efd23ebdef1ebb0b6d40d356c33574b3818..fb960c052e640ce7cb4bf258c2a9ef12b58c3367 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -103,6 +103,7 @@ extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8; extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU; extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; extern const char *G_OP_TYPE_FC; +extern const char *G_OP_TYPE_FC_INT8; extern const char *G_OP_TYPE_FUSION_CONV_ADD; extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU; diff --git a/src/operators/fusion_conv_add_relu_int8_op.h b/src/operators/fusion_conv_add_relu_int8_op.h index 5e4b4c08065de8111ae5511b5e9448bacda74c8b..dca92586ca4f28b43b70a3ccddfc12cfc0d55cb7 100644 --- a/src/operators/fusion_conv_add_relu_int8_op.h +++ b/src/operators/fusion_conv_add_relu_int8_op.h @@ -22,19 +22,19 @@ namespace paddle_mobile { namespace operators { template class FusionConvAddReluInt8Op - : public framework::OperatorWithKernel< - DeviceType, FusionConvAddReluParam, - operators::ConvAddReluKernel> { + : public framework::OperatorWithKernel, + ConvAddReluKernel> { public: FusionConvAddReluInt8Op(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel< - DeviceType, FusionConvAddReluParam, - operators::ConvAddReluKernel>(type, inputs, outputs, - attrs, scope) {} + : framework::OperatorWithKernel, + ConvAddReluKernel>( + type, inputs, outputs, attrs, scope) {} void InferShape() const override; }; } // namespace operators diff --git a/src/operators/fusion_fc_int8_op.cpp b/src/operators/fusion_fc_int8_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f7c621cc2c9e1a484a6f41674a65075cfeb69015 --- /dev/null +++ b/src/operators/fusion_fc_int8_op.cpp @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_FC_INT8_OP + +#include "operators/fusion_fc_int8_op.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionFcInt8Op::InferShape() const { + auto x_dims = this->param_.InputX()->dims(); + auto y_dims = this->param_.InputY()->dims(); + int x_num_col_dims = this->param_.XNumColDims(); + int y_num_col_dims = this->param_.YNumColDims(); + + assert(x_dims.size() > x_num_col_dims); + assert(y_dims.size() > y_num_col_dims); + + /// (1,2,3,4) , x_num_col_dims = 2 -> (2,12) + auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); + auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); + + assert(x_mat_dims[1] == y_mat_dims[0]); + + std::vector output_dims; + output_dims.reserve( + static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); + + for (int i = 0; i < x_num_col_dims; ++i) { + output_dims.push_back(x_dims[i]); + } + + for (int i = y_num_col_dims; i < y_dims.size(); ++i) { + output_dims.push_back(y_dims[i]); + } + + framework::DDim ddim = framework::make_ddim(output_dims); + this->param_.Out()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op); +#endif +#endif // FUSION_FC_INT8_OP diff --git a/src/operators/fusion_fc_int8_op.h b/src/operators/fusion_fc_int8_op.h new file mode 100644 index 0000000000000000000000000000000000000000..83718c0eae8b51be0107182c079cb3e918ebaf3a --- /dev/null +++ b/src/operators/fusion_fc_int8_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_FC_INT8_OP + +#pragma once + +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/fusion_fc_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class FusionFcInt8Op + : public framework::OperatorWithKernel, + FusionFcKernel> { + public: + FusionFcInt8Op(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + FusionFcKernel>( + type, inputs, outputs, attrs, scope) {} + + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // FUSION_FC_INT8_OP diff --git a/src/operators/kernel/arm/fusion_fc_kernel.cpp b/src/operators/kernel/arm/fusion_fc_kernel.cpp index c503edab643def7af0585a18d774b14ca0a3c39d..749dc9ac12158dfc174a53a923bc9a057375c3c3 100644 --- a/src/operators/kernel/arm/fusion_fc_kernel.cpp +++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp @@ -27,10 +27,27 @@ bool FusionFcKernel::Init(FusionFcParam *param) { template <> void FusionFcKernel::Compute(const FusionFcParam ¶m) { - FusionFcCompute(param); + FusionFcCompute(param); param.Out()->set_lod(param.InputX()->lod()); } +template class FusionFcKernel; + +#ifdef FUSION_FC_INT8_OP +template <> +bool FusionFcKernel::Init(FusionFcParam *param) { + return true; +} + +template <> +void FusionFcKernel::Compute(const FusionFcParam ¶m) { + FusionFcCompute(param); + param.Out()->set_lod(param.InputX()->lod()); +} + +template class FusionFcKernel; +#endif + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h index 9e46790cfe6f8d21f6c466c64853b5efc7db927c..e6b337c194f031604911b4d66a2ba8c79787d5b7 100644 --- a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h @@ -39,8 +39,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam ¶m) { float beta = 1.0f; #ifdef FUSION_CONVADDRELU_INT8_OP - Tensor scale = *param.InputScale(); - alpha = scale.data()[0]; + alpha = param.InputScale()->data()[0]; beta = 0.0f; #endif diff --git a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h index 45d5dc76d1e95668638706a252cc24d7ff2dec40..c7d4a98b36e4c398282ab3f4e98ffdfefaa676ef 100644 --- a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h +++ b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h @@ -15,23 +15,29 @@ limitations under the License. */ #ifdef FUSION_FC_OP #pragma once + +#include #include "operators/math/math_function.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { -template +template void FusionFcCompute(const FusionFcParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); - const Tensor *input_z = param.InputZ(); - auto *input_z_data = input_z->data(); + Tensor *input_z = param.InputZ(); + S *input_z_data = input_z->data(); int axis = param.Axis(); Tensor *out = param.Out(); // int m = out->dims()[0]; // int n = out->dims()[1]; - auto *out_data = out->mutable_data(); + auto *out_data = out->mutable_data

(); + + float alpha = 1.0f; + float beta = 1.0f; + const Tensor x_matrix = input_x->dims().size() > 2 ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) @@ -51,21 +57,28 @@ void FusionFcCompute(const FusionFcParam ¶m) { axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. "); - int64_t classes = input_z->numel(); - for (int i = 0; i < out_dim[0]; i++) { - memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); - } + if (std::is_same::value) { +#ifdef FUSION_FC_INT8_OP + alpha = param.InputScale()->data()[0]; + beta = 0.0f; + math::matmul(x_matrix, false, y_matrix, false, alpha, out, beta, false, + input_z_data, true); +#endif + } else { + // bias_data的维度和out的第二个维度一致 + int64_t classes = input_z->numel(); + for (int i = 0; i < out_dim[0]; i++) { + memory::Copy(out_data + i * classes, input_z_data, + sizeof(float) * classes); + } - // for (int i = 0; i < out->numel(); i++) { - // DLOG << out_data[i]; - // } - // bias_data的维度和out的维度一致 - math::matmul(x_matrix, false, y_matrix, false, static_cast(1), - out, static_cast(1), false); + math::matmul(x_matrix, false, y_matrix, false, alpha, out, beta, + false); + } PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); - // if (out_dim.size() != 2) { - // out->Resize(out_dim); - // } + // if (out_dim.size() != 2) { + // out->Resize(out_dim); + // } } } // namespace operators diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index ae324dbfd383aa2aa93b848710ff5d67c7b4893c..c17b2a5e4df0f0ca88da79a9ce55c2ecae0316b5 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -2924,7 +2924,6 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, #endif // __ARM_NEON // 32位 float 矩阵乘法 -template <> void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu, float *bias) { @@ -3147,7 +3146,6 @@ void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda, } // 32位 float 矩阵乘法 -template <> void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu, float *bias) { diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index 61e957100b35ee2bd16f03ffeec24a8b85339237..fb2c248c9b38b6ef62fe477930cf83060b95ee1d 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -167,14 +167,25 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, float *new_bias); */ + // 32位 float 矩阵乘法 + void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, bool relu, + float *bias); + // 32位 float 矩阵乘法, 并对结果进行 batchnrom void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu, float *new_scale, float *new_bias, float *bias); + void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, const float *B, int ldb, float *C, int ldc, float *p, std::string mode, float *bias, float *bias1); + // 32位 float 矩阵乘法(openmp 多线程版本) + void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *bias); + // 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本) void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int 
ldb, float beta, float *C, @@ -202,7 +213,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, template void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a, const int8_t *b, float beta, int32_t *c, Otype *C, - int32_t ldc, bool relu, int32_t *bias); + int32_t ldc, bool relu, int32_t *bias, + bool addOnRow = false); // 8 bits int pack function void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, @@ -228,28 +240,32 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, template void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A, int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C, - int32_t ldc, bool relu, Btype *bias); + int32_t ldc, bool relu, Btype *bias, bool addOnRow = false); template void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, float beta, - Otype *C, int32_t ldc, bool relu, int32_t *bias); + Otype *C, int32_t ldc, bool relu, int32_t *bias, + bool addOnRow = false); template void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A, int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C, - int32_t ldc, bool relu, Btype *bias); + int32_t ldc, bool relu, Btype *bias, bool addOnRow = false); template void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C, - int32_t ldc, bool relu, int32_t *bias); + int32_t ldc, bool relu, int32_t *bias, bool addOnRow = false); // 8 bits int write back // C = A * B void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc); // C = A * B + bias, scale * relu(C) void WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, int32_t ldc, int32_t *bias, float scale); - // C = A * B + bias, scale * C + // C = A * B + bias, scale * C, bias is added on column void WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, int32_t ldc, int32_t *bias, float scale); + // C = A * B + bias, scale * C, bias is added on row + void WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C, + int32_t ldc, int32_t *bias, float scale); private: int MC = 0; @@ -273,7 +289,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, template void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, float beta, - Otype *C, int32_t ldc, bool relu, int32_t *bias) { + Otype *C, int32_t ldc, bool relu, int32_t *bias, + bool addOnRow) { // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) int32_t L1 = 32 * 1024; @@ -322,8 +339,15 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta, packedC_int32, &C(i, j), ldc, relu); } else { - InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta, - packedC_int32, &C(i, j), ldc, relu, bias + i); + if (addOnRow) { + InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta, + packedC_int32, &C(i, j), ldc, relu, bias + j, + addOnRow); + } else { + InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta, + packedC_int32, &C(i, j), ldc, relu, bias + i, + addOnRow); + } } } } @@ -339,7 +363,7 @@ template void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C, 
int32_t ldc, bool relu, - int32_t *bias) { + int32_t *bias, bool addOnRow) { #ifdef _OPENMP int32_t max_threads = omp_get_max_threads(); #else @@ -422,8 +446,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C, &C(i, 0), ldc, relu); } else { - InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C, - &C(i, 0), ldc, relu, bias + i); + if (addOnRow) { + InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, + local_C, &C(i, 0), ldc, relu, bias, addOnRow); + } else { + InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, + local_C, &C(i, 0), ldc, relu, bias + i, addOnRow); + } } } } else { @@ -447,8 +476,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C, &C(0, j), ldc, relu); } else { - InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C, - &C(0, j), ldc, relu, bias); + if (addOnRow) { + InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, + local_C, &C(0, j), ldc, relu, bias + j, addOnRow); + } else { + InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, + local_C, &C(0, j), ldc, relu, bias, addOnRow); + } } } } diff --git a/src/operators/math/gemm_int8.cpp b/src/operators/math/gemm_int8.cpp index 1659045c3f3868412d53a578447215a91c4b2d7f..16537adfec989187397009ec88e2a633ca57241f 100644 --- a/src/operators/math/gemm_int8.cpp +++ b/src/operators/math/gemm_int8.cpp @@ -699,7 +699,7 @@ template <> void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a, const int8_t *b, float beta, int32_t *c, int8_t *C, int32_t ldc, bool relu, - int32_t *bias) { + int32_t *bias, bool addOnRow) { #pragma omp parallel for for (int32_t j = 0; j < nc; j += NR_INT8) { for (int32_t i = 0; i < mc; i += MR_INT8) { @@ -716,7 +716,11 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, WriteWithAddReluScale(mc, nc, c, C, ldc, bias, alpha); return; } else { - WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha); + if (addOnRow) { + WriteWithAddScaleT(mc, nc, c, C, ldc, bias, alpha); + } else { + WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha); + } } } @@ -724,7 +728,7 @@ template <> void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a, const int8_t *b, float beta, int32_t *c, int32_t *C, int32_t ldc, bool relu, - int32_t *bias) {} + int32_t *bias, bool addOnRow) {} // 8 bits int PackMatrixA_4r void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail, @@ -1159,14 +1163,13 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, #endif // __ARM_NEON } -// C = A * B + bias, scale * C +// C = A * B + bias, scale * C, bias is added on column void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, int32_t ldc, int32_t *bias, float scale) { #if __ARM_NEON #if __aarch64__ // TODO #else - int32_t zero = 0; int8_t narrow = -128; int32_t nc1 = nc >> 3; int32_t _nc1 = nc & 7; @@ -1184,7 +1187,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, "subs %[mc], %[mc], #1 \n\t" "blt end_mc_%= \n\t" "vdup.32 q15, %[scale] \n\t" - "vdup.32 q14, %[zero] \n\t" "vdup.8 d24, %[narrow] \n\t" "loop_mc_%=: \n\t" "vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t" @@ -1222,9 +1224,9 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, : : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), [step] "r"(step), [step1] "r"(step1), 
[bias_ptr] "r"(bias_ptr), - [scale] "r"(scale), [zero] "r"(zero), [narrow] "r"(narrow) + [scale] "r"(scale), [narrow] "r"(narrow) : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q12", "q13", "q14", "q15"); + "q7", "q12", "q13", "q15"); } int32_t nc_left; @@ -1239,7 +1241,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, nc_left = _nc1; asm volatile( "vdup.32 q15, %[scale] \n\t" - "vdup.32 q14, %[zero] \n\t" "vdup.8 d24, %[narrow] \n\t" "vdup.32 q13, %[bias_v] \n\t" "cmp %[_nc1], #4 \n\t" @@ -1260,7 +1261,7 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, "subs %[_nc1], %[_nc1], #4 \n\t" "beq process_over_%= \n\t" "less_four_%=: \n\t" - "vld1.32 {q0}, [%[c0]]! \n\t" + "vld1.32 {q0}, [%[c0]] \n\t" "vqadd.s32 q0, q0, q13 \n\t" "vcvt.f32.s32 q1, q0 \n\t" "vmul.f32 q1, q1, q15 \n\t" @@ -1277,17 +1278,138 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, "process_over_%=: \n\t" : : [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), - [bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero), - [narrow] "r"(narrow) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", - "q15"); + [bias_v] "r"(bias_v), [scale] "r"(scale), [narrow] "r"(narrow) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15"); + } + } +#endif // __aarch64__ +#endif // __ARM_NEON +} + +// C = A * B + bias, scale * C, bias is added on row +void Gemm::WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C, + int32_t ldc, int32_t *bias, float scale) { +#if __ARM_NEON +#if __aarch64__ +// TODO +#else + int8_t narrow = -128; + int32_t nc1 = nc >> 3; + int32_t _nc1 = nc & 7; + int32_t step = sizeof(int8_t) * ldc; + int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3)); + int32_t volatile m = mc; + int32_t volatile n = nc1; + int32_t *volatile c_ptr, *volatile bias_ptr; + int8_t *volatile C_ptr; + c_ptr = c; + C_ptr = C; + bias_ptr = bias; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "vdup.32 q15, %[scale] \n\t" + "vdup.8 d24, %[narrow] \n\t" + "loop_mc_%=: \n\t" + "mov r4, %[bias_ptr] \n\t" + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + "vld1.32 {q13, q14}, [r4]! \n\t" + "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" + "vqadd.s32 q0, q0, q13 \n\t" + "vqadd.s32 q1, q1, q14 \n\t" + "vcvt.f32.s32 q2, q0 \n\t" + "vcvt.f32.s32 q3, q1 \n\t" + "vmul.f32 q2, q2, q15 \n\t" + "vmul.f32 q3, q3, q15 \n\t" + "vcvt.s32.f32 q4, q2 \n\t" + "vcvt.s32.f32 q5, q3 \n\t" + "vqmovn.s32 d12, q4 \n\t" + "vqmovn.s32 d13, q5 \n\t" + "vqmovn.s16 d14, q6 \n\t" + "vceq.s8 d15, d14, d24 \n\t" + "vsub.s8 d14, d14, d15 \n\t" + "vst1.8 {d14}, [r6]! 
\n\t" + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), + [step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr), + [scale] "r"(scale), [narrow] "r"(narrow) + : "cc", "memory", "r4", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q12", "q13", "q15"); + } + + int32_t nc_left; + int32_t *c0; + int8_t *C0; + int32_t *volatile bias0 = bias_ptr + nc1 * 8; + if (_nc1 != 0) { + for (int32_t i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 8 + i * ldc; + c0 = c_ptr + nc1 * 8 + i * NC; + nc_left = _nc1; + asm volatile( + "vdup.32 q15, %[scale] \n\t" + "vdup.8 d24, %[narrow] \n\t" + "cmp %[_nc1], #4 \n\t" + "blt less_four_%= \n\t" + "vld1.32 {q0}, [%[c0]]! \n\t" + "vld1.32 {q13}, [%[bias0]]! \n\t" + "vqadd.s32 q0, q0, q13 \n\t" + "vcvt.f32.s32 q1, q0 \n\t" + "vmul.f32 q1, q1, q15 \n\t" + "vcvt.s32.f32 q2, q1 \n\t" + "vqmovn.s32 d6, q2 \n\t" + "vqmovn.s16 d8, q3 \n\t" + "vceq.s8 d9, d8, d24 \n\t" + "vsub.s8 d8, d8, d9 \n\t" + "vst1.8 {d8[0]}, [%[C0]]! \n\t" + "vst1.8 {d8[1]}, [%[C0]]! \n\t" + "vst1.8 {d8[2]}, [%[C0]]! \n\t" + "vst1.8 {d8[3]}, [%[C0]]! \n\t" + "subs %[_nc1], %[_nc1], #4 \n\t" + "beq process_over_%= \n\t" + "less_four_%=: \n\t" + "vld1.32 {q0}, [%[c0]] \n\t" + "vld1.32 {q13}, [%[bias0]] \n\t" + "vqadd.s32 q0, q0, q13 \n\t" + "vcvt.f32.s32 q1, q0 \n\t" + "vmul.f32 q1, q1, q15 \n\t" + "vcvt.s32.f32 q2, q1 \n\t" + "vqmovn.s32 d6, q2 \n\t" + "vqmovn.s16 d8, q3 \n\t" + "vceq.s8 d9, d8, d24 \n\t" + "vsub.s8 d8, d8, d9 \n\t" + "loop_save_%=: \n\t" + "vst1.8 {d8[0]}, [%[C0]]! \n\t" + "vext.8 d8, d8, d8, #1 \n\t" + "subs %[_nc1], %[_nc1], #1 \n\t" + "bgt loop_save_%= \n\t" + "process_over_%=: \n\t" + : + : [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), [bias0] "r"(bias0), + [scale] "r"(scale), [narrow] "r"(narrow) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15"); } } #endif // __aarch64__ #endif // __ARM_NEON } -// C = A * B + bias, scale * relu(C) +// C = A * B + bias, scale * relu(C), bias is added on column void Gemm::WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, int32_t ldc, int32_t *bias, float scale) { #if __ARM_NEON diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h index c58e8035940c65646851961bc2b9d12307f37e7a..16c39221db5b94dd8ed323c9cced430a58e32e47 100644 --- a/src/operators/math/math_function.h +++ b/src/operators/math/math_function.h @@ -34,7 +34,7 @@ template void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, T alpha, framework::Tensor *matrix_out, T beta, bool relu = false, - S *bias = nullptr); + S *bias = nullptr, bool addOnRow = false); template void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, diff --git a/src/operators/math/math_function_int8.cpp b/src/operators/math/math_function_int8.cpp index a407a2915dbe6c17537b85371b9426acfd4a1b2c..b7f634b36fe8d009c06008aada971c61b70b4a46 100644 --- a/src/operators/math/math_function_int8.cpp +++ b/src/operators/math/math_function_int8.cpp @@ -24,8 +24,8 @@ namespace math { template <> void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, float alpha, - framework::Tensor *matrix_out, float beta, bool relu, - int32_t *bias) { + framework::Tensor 
*matrix_out, float beta, bool relu, int32_t *bias, + bool addOnRow) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); @@ -55,18 +55,18 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, #ifdef _OPENMP if (bias != nullptr) { gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias); + matrix_out->data(), N, relu, bias, addOnRow); } else { gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias); + matrix_out->data(), N, relu, bias, addOnRow); } #else if (bias != nullptr) { gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias); + matrix_out->data(), N, relu, bias, addOnRow); } else { gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias); + matrix_out->data(), N, relu, bias, addOnRow); } #endif } else { @@ -74,21 +74,21 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, if (bias != nullptr) { gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias); + matrix_out->data(), N, relu, bias, addOnRow); } else { gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias); + matrix_out->data(), N, relu, bias, addOnRow); } #else if (bias != nullptr) { gemm.Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, beta, matrix_out->data(), - N, relu, bias); + N, relu, bias, addOnRow); } else { gemm.Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, beta, matrix_out->data(), - N, relu, bias); + N, relu, bias, addOnRow); } #endif } diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 381b66199892df9f24eca63470314e7652f5a72a..e90208b5aa9aae2a5af0c1bc00b161f00d45fd06 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -1632,6 +1632,10 @@ class FusionFcParam : public OpParam { x_num_col_dims_ = GetAttr("x_num_col_dims", attrs); y_num_col_dims_ = GetAttr("y_num_col_dims", attrs); axis_ = GetAttr("axis", attrs); + +#ifdef FUSION_FC_INT8_OP + scale_ = InputScaleFrom(inputs, scope); +#endif } GType *InputX() const { return input_x_; } @@ -1655,8 +1659,16 @@ class FusionFcParam : public OpParam { int x_num_col_dims_; int y_num_col_dims_; int axis_; -#ifdef PADDLE_MOBILE_FPGA + +#ifdef FUSION_FC_INT8_OP + public: + const RType *InputScale() const { return scale_; } + private: + RType *scale_; +#endif + +#ifdef PADDLE_MOBILE_FPGA private: fpga::SplitConvArgs fpga_conv_args; @@ -1717,7 +1729,7 @@ class FusionConvAddReluParam : public FusionConvAddParam { typedef typename DtypeTensorTrait::rtype RType; const RType *InputScale() const { return scale_; } - protected: + private: RType *scale_; #endif }; diff --git a/test/common/test_gemm_accuracy.cpp b/test/common/test_gemm_accuracy.cpp index 2a2505a86b1abab5fe6fd8e0b9905ce7ae78f292..93cea2fd362ea3be42dbc5f53392fb47ad47d1d4 100644 --- a/test/common/test_gemm_accuracy.cpp +++ b/test/common/test_gemm_accuracy.cpp @@ -25,7 +25,7 @@ limitations under the License. 
*/ #define c(i, j) c[(i)*ldc + (j)] #define c1(i, j) c1[(i)*ldc + (j)] -void print_matirx(int m, int n, int ldc, float *c) { +void print_matrix(int m, int n, int ldc, float *c) { for (int i = 0; i < m; ++i) { std::cout << c(i, 0); for (int j = 1; j < n; ++j) { @@ -98,18 +98,20 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { if (pr > 0) { std::cout << "A:" << std::endl; - print_matirx(m, k, lda, a); + print_matrix(m, k, lda, a); std::cout << "B:" << std::endl; - print_matirx(k, n, ldb, b); + print_matrix(k, n, ldb, b); std::cout << "C:" << std::endl; - print_matirx(m, n, ldc, c); + print_matrix(m, n, ldc, c); std::cout << "C1:" << std::endl; - print_matirx(m, n, ldc, c1); + print_matrix(m, n, ldc, c1); } std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu << " eq=" << eq << " neq=" << neq << std::endl; + PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm is failed!"); + paddle_mobile::memory::Free(a); paddle_mobile::memory::Free(b); paddle_mobile::memory::Free(c); diff --git a/test/common/test_gemm_int8_accuracy.cpp b/test/common/test_gemm_int8_accuracy.cpp index a1920ba2bbfd6bf50357fcf05be0cf64dfc9d1fb..493a33af953e6a225e419aaf4fef88901f781732 100644 --- a/test/common/test_gemm_int8_accuracy.cpp +++ b/test/common/test_gemm_int8_accuracy.cpp @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include "../test_helper.h" #include "common/log.h" #include "memory/t_malloc.h" @@ -33,24 +34,32 @@ limitations under the License. */ using std::default_random_engine; using std::uniform_int_distribution; -void print_matirx(int m, int n, int ldc, int32_t *c) { +template +void print_matrix(int m, int n, int ldc, T *c) { for (int i = 0; i < m; ++i) { - std::cout << c(i, 0); - for (int j = 1; j < n; ++j) { - std::cout << " | " << c(i, j); + if (std::is_same::value) { + std::cout.setf(std::ios::left); + std::cout.width(4); + std::cout << static_cast(c(i, 0)); + } else { + std::cout.setf(std::ios::left); + std::cout.width(6); + std::cout << c(i, 0); } - std::cout << std::endl; - } - std::cout << std::endl; -} - -void print_matirx(int m, int n, int ldc, int8_t *c) { - for (int i = 0; i < m; ++i) { - std::cout << static_cast(c(i, 0)); for (int j = 1; j < n; ++j) { - std::cout << " | " << static_cast(c(i, j)); + if (std::is_same::value) { + std::cout << " | "; + std::cout.setf(std::ios::left); + std::cout.width(4); + std::cout << static_cast(c(i, j)); + } else { + std::cout << " | "; + std::cout.setf(std::ios::left); + std::cout.width(6); + std::cout << c(i, j); + } } - std::cout << std::endl; + std::cout << "\n"; } std::cout << std::endl; } @@ -138,18 +147,20 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) { if (pr > 0) { std::cout << "A:" << std::endl; - print_matirx(m, k, lda, a); + print_matrix(m, k, lda, a); std::cout << "B:" << std::endl; - print_matirx(k, n, ldb, b); + print_matrix(k, n, ldb, b); std::cout << "C:" << std::endl; - print_matirx(m, n, ldc, c); + print_matrix(m, n, ldc, c); std::cout << "C1:" << std::endl; - print_matirx(m, n, ldc, c1); + print_matrix(m, n, ldc, c1); } std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu << " eq=" << eq << " neq=" << neq << std::endl; + PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm is failed!"); + paddle_mobile::memory::Free(a); paddle_mobile::memory::Free(b); paddle_mobile::memory::Free(c); @@ -158,7 +169,8 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) { return 0; } -int do_sgemm_with_bias(int m, int n, int k, bool relu, 
int pr) { +int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr, + bool addOnRow = false) { int lda = k; int ldb = n; int ldc = n; @@ -174,8 +186,14 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) { int8_t *c1 = static_cast( paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n)); - int32_t *bias = - static_cast(paddle_mobile::memory::Alloc(sizeof(int32_t) * m)); + int32_t *bias = nullptr; + if (addOnRow) { + bias = static_cast( + paddle_mobile::memory::Alloc(sizeof(int32_t) * n)); + } else { + bias = static_cast( + paddle_mobile::memory::Alloc(sizeof(int32_t) * m)); + } for (int i = 0; i < m * k; ++i) { a[i] = pixel(e); @@ -183,29 +201,48 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) { for (int i = 0; i < k * n; ++i) { b[i] = pixel(e); } - for (int i = 0; i < m; ++i) { - bias[i] = static_cast(pixel(e)); - } - for (int i = 0; i < m; ++i) { - int32_t bias_v = bias[i]; - for (int j = 0; j < n; ++j) { - int32_t r = 0; - for (int p = 0; p < k; p++) { - r += static_cast(a(i, p)) * static_cast(b(p, j)); + + if (addOnRow) { + for (int i = 0; i < n; ++i) { + bias[i] = static_cast(pixel(e)); + } + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + int32_t bias_v = bias[j]; + int32_t r = 0; + for (int p = 0; p < k; p++) { + r += static_cast(a(i, p)) * static_cast(b(p, j)); + } + r = qadd_int32(r, bias_v); + if (relu) r = std::max(0, r); + c1(i, j) = qscale_int32(r, scale); + } + } + } else { + for (int i = 0; i < m; ++i) { + bias[i] = static_cast(pixel(e)); + } + for (int i = 0; i < m; ++i) { + int32_t bias_v = bias[i]; + for (int j = 0; j < n; ++j) { + int32_t r = 0; + for (int p = 0; p < k; p++) { + r += static_cast(a(i, p)) * static_cast(b(p, j)); + } + r = qadd_int32(r, bias_v); + if (relu) r = std::max(0, r); + c1(i, j) = qscale_int32(r, scale); } - r = qadd_int32(r, bias_v); - if (relu) r = std::max(0, r); - c1(i, j) = qscale_int32(r, scale); } } paddle_mobile::operators::math::Gemm gemm; #ifdef _OPENMP gemm.Sgemm_omp(m, n, k, scale, a, lda, b, ldb, static_cast(0), c, ldc, - relu, bias); + relu, bias, addOnRow); #else gemm.Sgemm(m, n, k, scale, a, lda, b, ldb, static_cast(0), c, ldc, - relu, bias); + relu, bias, addOnRow); #endif int eq = 0; int neq = 0; @@ -219,20 +256,27 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) { if (pr > 0) { std::cout << "A:" << std::endl; - print_matirx(m, k, lda, a); + print_matrix(m, k, lda, a); std::cout << "B:" << std::endl; - print_matirx(k, n, ldb, b); + print_matrix(k, n, ldb, b); std::cout << "Bias:" << std::endl; - print_matirx(m, 1, 1, bias); + if (addOnRow) { + print_matrix(1, n, n, bias); + } else { + print_matrix(m, 1, 1, bias); + } std::cout << "C:" << std::endl; - print_matirx(m, n, ldc, c); + print_matrix(m, n, ldc, c); std::cout << "C1:" << std::endl; - print_matirx(m, n, ldc, c1); + print_matrix(m, n, ldc, c1); } std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu << " eq=" << eq << " neq=" << neq << std::endl; + PADDLE_MOBILE_ENFORCE(neq == 0, + "The execution of do_sgemm_with_bias is failed!"); + paddle_mobile::memory::Free(a); paddle_mobile::memory::Free(b); paddle_mobile::memory::Free(c); @@ -261,7 +305,7 @@ int main() { std::cout << "\n\n******************************************************\n\n" << std::endl; - std::cout << "Test gemm with bias:" << std::endl; + std::cout << "Test gemm with bias(bias is added on column):" << std::endl; do_sgemm_with_bias(9, 9, 9, false, 1); do_sgemm_with_bias(10, 6, 12, false, 0); do_sgemm_with_bias(512, 256, 384, 
false, 0); @@ -272,6 +316,19 @@ int main() { do_sgemm_with_bias(333, 797, 939, false, 0); do_sgemm_with_bias(1024, 1024, 1024, false, 0); + std::cout << "\n\n******************************************************\n\n" + << std::endl; + std::cout << "Test gemm with bias(bias is added on row):" << std::endl; + do_sgemm_with_bias(9, 9, 9, false, 1, true); + do_sgemm_with_bias(10, 6, 12, false, 0, true); + do_sgemm_with_bias(512, 256, 384, false, 0, true); + do_sgemm_with_bias(1366, 768, 256, false, 0, true); + do_sgemm_with_bias(1255, 755, 333, false, 0, true); + do_sgemm_with_bias(599, 1133, 393, false, 0, true); + do_sgemm_with_bias(777, 555, 999, false, 0, true); + do_sgemm_with_bias(333, 797, 939, false, 0, true); + do_sgemm_with_bias(1024, 1024, 1024, false, 0, true); + std::cout << "\n\n******************************************************\n\n" << std::endl; std::cout << "Test gemm with relu and bias:" << std::endl; diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp index f25a290aef6e228aff0a84d2640486235e0116bf..92b78a57e9a0236ce2e1c6627b150d4c246c5413 100644 --- a/test/common/test_gemm_perf.cpp +++ b/test/common/test_gemm_perf.cpp @@ -49,7 +49,8 @@ int main() { auto bbptr_int8 = bb_int8.mutable_data({k, n}); auto ccptr_int32 = cc_int32.mutable_data({m, n}); auto ccptr_int8 = cc_int8.mutable_data({m, n}); - int32_t* bias_data = new int32_t[m]; + int32_t* bias_data_col = new int32_t[m]; + int32_t* bias_data_row = new int32_t[n]; for (int i = 0; i < m * k; ++i) { aaptr_int8[i] = static_cast(2); @@ -62,7 +63,11 @@ int main() { } for (int i = 0; i < m; ++i) { - bias_data[i] = 2; + bias_data_col[i] = 2; + } + + for (int i = 0; i < n; ++i) { + bias_data_row[i] = 2; } // float @@ -73,14 +78,15 @@ int main() { false, nullptr); } - auto time1 = time(); + auto time_start0 = time(); for (int j = 0; j < 10; ++j) { paddle_mobile::operators::math::matmul( aa, false, bb, false, static_cast(1), &cc, static_cast(0), false, nullptr); } - auto time2 = time(); - std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n"; + auto time_end0 = time(); + std::cout << "float gemm cost :" << time_diff(time_start0, time_end0) / 10 + << "ms\n"; // int8_t without bias // warm-up 10 times @@ -90,33 +96,69 @@ int main() { static_cast(0)); } - auto time3 = time(); + auto time_start1 = time(); for (int j = 0; j < 10; ++j) { paddle_mobile::operators::math::matmul( aa_int8, false, bb_int8, false, static_cast(1), &cc_int32, static_cast(0)); } - auto time4 = time(); - std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n"; + auto time_end1 = time(); + std::cout << "int8_t gemm cost :" << time_diff(time_start1, time_end1) / 10 + << "ms\n"; + + // int8_t with bias, column element wise add + // warm-up 10 times + for (int j = 0; j < 10; ++j) { + paddle_mobile::operators::math::matmul( + aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, + static_cast(0), false, bias_data_col, false); + } + auto time_start2 = time(); + for (int j = 0; j < 10; ++j) { + paddle_mobile::operators::math::matmul( + aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, + static_cast(0), false, bias_data_col, false); + } + auto time_end2 = time(); + std::cout << "int8_t gemm_with_bias(column add) cost :" + << time_diff(time_start2, time_end2) / 10 << "ms\n"; + + // int8_t with bias, row element wise add + // warm-up 10 times + for (int j = 0; j < 10; ++j) { + paddle_mobile::operators::math::matmul( + aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, + 
static_cast(0), false, bias_data_row, true); + } + auto time_start3 = time(); + for (int j = 0; j < 10; ++j) { + paddle_mobile::operators::math::matmul( + aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, + static_cast(0), false, bias_data_row, true); + } + auto time_end3 = time(); + std::cout << "int8_t gemm_with_bias(row add) cost :" + << time_diff(time_start3, time_end3) / 10 << "ms\n"; // int8_t with bias&relu // warm-up 10 times for (int j = 0; j < 10; ++j) { paddle_mobile::operators::math::matmul( aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, - static_cast(0), true, bias_data); + static_cast(0), true, bias_data_col, false); } - auto time5 = time(); + auto time_start4 = time(); for (int j = 0; j < 10; ++j) { paddle_mobile::operators::math::matmul( aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, - static_cast(0), true, bias_data); + static_cast(0), true, bias_data_col, false); } - auto time6 = time(); + auto time_end4 = time(); std::cout << "int8_t gemm_with_bias_relu cost :" - << time_diff(time5, time6) / 10 << "ms\n"; + << time_diff(time_start4, time_end4) / 10 << "ms\n"; - delete[] bias_data; + delete[] bias_data_row; + delete[] bias_data_col; return 0; } diff --git a/test/operators/test_fusion_conv_add_relu_int8_op.cpp b/test/operators/test_fusion_conv_add_relu_int8_op.cpp index 42c68e5d04c03c143517a917e620d40636c382ec..8d00e5bf844ba6f19d9edc3a5c84e7e1c52f19fa 100644 --- a/test/operators/test_fusion_conv_add_relu_int8_op.cpp +++ b/test/operators/test_fusion_conv_add_relu_int8_op.cpp @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef FUSION_CONVADDRELU_INT8_OP - #include + +#ifdef FUSION_CONVADDRELU_INT8_OP #include #include "../test_helper.h" #include "../test_include.h" @@ -356,5 +356,9 @@ int main(int argc, char *argv[]) { paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); } - +#else +int main() { + std::cout << "FUSION_CONVADDRELU_INT8_OP is not defined!" << std::endl; + return 0; +} #endif diff --git a/test/operators/test_fusion_fc_op.cpp b/test/operators/test_fusion_fc_op.cpp index a8ec4883aab4218aa526e7b90267998754d1eb30..34de0292d0e5bc460f402c08a22fe60b02e7ab0c 100644 --- a/test/operators/test_fusion_fc_op.cpp +++ b/test/operators/test_fusion_fc_op.cpp @@ -12,147 +12,163 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include +#include +#include +#include "../test_helper.h" #include "../test_include.h" +#include "framework/operator.h" +#include "operators/fusion_fc_int8_op.h" #include "operators/fusion_fc_op.h" +#define a(i, j) a[(i)*lda + (j)] +#define b(i, j) b[(i)*ldb + (j)] +#define c(i, j) c[(i)*ldc + (j)] + namespace paddle_mobile { -namespace framework { +using framework::AttributeMap; +using framework::DDim; +using framework::Scope; +using framework::make_ddim; + +int32_t qadd_int32(int32_t l, int32_t r) { + int64_t res = static_cast(l) + static_cast(r); + if (res > std::numeric_limits::max()) + return std::numeric_limits::max(); + else if (res < std::numeric_limits::min()) + return std::numeric_limits::min(); + else + return static_cast(res); +} -template -class TestFcOp { - public: - explicit TestFcOp(const Program p) : program_(p) { - use_optimize_ = true; - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } +// round to zero +float round2zero(float v) { + float res; + if (v > 0) + res = std::floor(v); + else if (v < 0) + res = std::ceil(v); + return res; +} - const std::vector> blocks = - to_predict_program_->Blocks(); - // DLOG << " **block size " << blocks.size(); - for (int i = 0; i < blocks.size(); ++i) { - std::shared_ptr block_desc = blocks[i]; - std::vector> ops = block_desc->Ops(); - // DLOG << " ops " << ops.size(); - for (int j = 0; j < ops.size(); ++j) { - std::shared_ptr op = ops[j]; - if (op->Type() == "fc" && op->Input("X")[0] == "pool2d_13.tmp_0") { - DLOG << " fc attr size: " << op->GetAttrMap().size(); - DLOG << " inputs size: " << op->GetInputs().size(); - DLOG << " outputs size: " << op->GetOutputs().size(); - DLOG << " Input X is : " << op->Input("X")[0]; - DLOG << " Input Y is : " << op->Input("Y")[0]; - DLOG << " Input Y is : " << op->Input("Z")[0]; - DLOG << " Output Out is : " << op->Output("Out")[0]; - std::shared_ptr> testOp = - std::make_shared>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope); - ops_of_block_[*block_desc.get()].push_back(testOp); +int8_t qscale_int32(int32_t v, float scale) { + float res = static_cast(v) * scale; + res = round2zero(res); + if (res > 127) + return static_cast(127); + else if (res < -127) + return static_cast(-127); + else + return static_cast(res); +} + +template +int TestFcOP() { + int32_t m = 377; + int32_t n = 1363; + int32_t k = 577; + int32_t lda = k; + int32_t ldb = n; + int32_t ldc = n; + DDim inputA_shape = make_ddim({m, k}); + DDim inputB_shape = make_ddim({k, n}); + DDim bias_shape = make_ddim({n}); + VariableNameMap inputs; + VariableNameMap outputs; + auto scope = std::make_shared(); + inputs["X"] = std::vector({"inputA"}); + inputs["Y"] = std::vector({"inputB"}); + inputs["Z"] = std::vector({"bias"}); + inputs["Scale"] = std::vector({"scale"}); + outputs["Out"] = std::vector({"output"}); + + auto inputA_var = scope.get()->Var("inputA"); + auto inputA = inputA_var->template GetMutable(); + SetupTensor(inputA, inputA_shape, -127, 127); + auto inputB_var = scope.get()->Var("inputB"); + auto inputB = inputB_var->template GetMutable(); + SetupTensor(inputB, inputB_shape, -127, 127); + auto bias_var = scope.get()->Var("bias"); + auto bias = bias_var->template GetMutable(); + SetupTensor(bias, bias_shape, -127, 127); + + auto scale_var = scope.get()->Var("scale"); + auto scale = scale_var->template GetMutable(); + scale->Resize(framework::make_ddim({1})); + float scale_v = 0.000828f; + 
scale->mutable_data()[0] = scale_v; + + auto output_var = scope.get()->Var("output"); + AttributeMap attrs; + attrs["x_num_col_dims"].Set(1); + attrs["y_num_col_dims"].Set(1); + attrs["axis"].Set(1); + operators::OperatorBase *op = nullptr; +#ifdef FUSION_FC_INT8_OP + if (std::is_same::value) { + op = new operators::FusionFcInt8Op("fusion_fc_int8", inputs, + outputs, attrs, scope); + } else { + op = new operators::FusionFcOp("fusion_fc", inputs, outputs, attrs, + scope); + } +#else + op = new operators::FusionFcOp("fusion_fc", inputs, outputs, attrs, + scope); +#endif + op->InferShape(); + op->Run(); + auto output = output_var->template Get(); + const T *output_data = output->data(); + // compare + T *c = static_cast(memory::Alloc(sizeof(T) * m * n)); + T *a = inputA->data(); + T *b = inputB->data(); + S *bias_data = bias->data(); + for (int32_t i = 0; i < m; ++i) { + for (int32_t j = 0; j < n; ++j) { + S bias_v = bias_data[j]; + if (std::is_same::value) { + int32_t r = 0; + for (int32_t p = 0; p < k; p++) { + r += static_cast(a(i, p)) * static_cast(b(p, j)); } + r = qadd_int32(r, bias_v); + c(i, j) = qscale_int32(r, scale_v); + } else { + T r = 0; + for (int32_t p = 0; p < k; p++) { + r += a(i, p) * b(p, j); + } + r += bias_v; + c(i, j) = r; } } } - std::shared_ptr predict(const Tensor &t1, const Tensor &t2, - const Tensor &t3) { - // feed - auto scope = program_.scope; - Variable *x_feed_value = scope->Var("pool2d_13.tmp_0"); - auto tensor_x = x_feed_value->GetMutable(); - tensor_x->ShareDataWith(t1); - - Variable *y_feed_value = scope->Var("loss3_classifier-loc_weights"); - auto tensor_y = y_feed_value->GetMutable(); - tensor_y->ShareDataWith(t2); - - Variable *z_feed_value = scope->Var("loss3_classifier-loc_biases"); - auto tensor_z = z_feed_value->GetMutable(); - tensor_z->ShareDataWith(t3); - - Variable *con_output = scope->Var("loss3_classifier-loc.tmp_1"); - auto *output_tensor = con_output->GetMutable(); - output_tensor->mutable_data({3, 10}); - // DLOG << typeid(output_tensor).name(); - // DLOG << "output_tensor dims: " << output_tensor->dims(); - - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - - predict(t1, t2, t3, 0); - return out_tensor; - } - - private: - const framework::Program program_; - std::shared_ptr to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - - void predict(const Tensor &t1, const Tensor &t2, const Tensor &t3, - int block_id) { - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - DLOG << "op -> run()"; - op->Run(); + int32_t eq = 0; + int32_t neq = 0; + for (int32_t i = 0; i < m * n; ++i) { + PADDLE_MOBILE_ENFORCE(output_data[i] == c[i], + "The execution of test_fusion_fc_op is failed!"); + if (output_data[i] == c[i]) { + ++eq; + } else { + ++neq; } } -}; - -template class TestFcOp; -} // namespace framework + std::cout << "mnk=" << m << " " << n << " " << k << " eq=" << eq + << " neq=" << neq << std::endl; + delete op; + return 0; +} } // namespace paddle_mobile -int main() { - DLOG << "----------**********----------"; - DLOG << "begin to run Fc Test"; - paddle_mobile::framework::Loader loader; - // "../../../test/models/googlenet" - auto program = loader.Load(g_googlenet); - paddle_mobile::framework::ProgramOptimize optimize; - // program.originProgram->Description("origin"); - auto optimize_program = 
optimize.FusionOptimize(program.originProgram); - - program.optimizeProgram = optimize_program; - - if (optimize_program != nullptr) { - optimize_program->Description("optimize"); - } else { - LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null"; - } - - /// input x (1,3,224,224) - paddle_mobile::framework::LoDTensor inputx; - SetupTensor(&inputx, {3, 64, 1, 1}, static_cast(1), - static_cast(1)); - auto *inputx_ptr = inputx.data(); - /// input y (224,) - paddle_mobile::framework::LoDTensor inputy; - SetupTensor(&inputy, {64, 10}, static_cast(1.5), - static_cast(1.5)); - auto *inputy_ptr = inputy.data(); - - paddle_mobile::framework::LoDTensor inputz; - SetupTensor(&inputz, {10}, static_cast(0), - static_cast(1)); - auto *inputz_ptr = inputz.data(); - - paddle_mobile::framework::TestFcOp testFcOp(program); - - auto output = testFcOp.predict(inputx, inputy, inputz); - auto *output_ptr = output->data(); - for (int j = 0; j < output->numel(); ++j) { - DLOG << "value of output: " << output_ptr[j]; - } - - DLOG << "1 (3,64) * 2 (64,10) = 96(3,10)"; - DLOG << "output : 96(3,10) + bias(10)"; +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); +#ifdef FUSION_FC_INT8_OP + paddle_mobile::TestFcOP(); +#endif + paddle_mobile::TestFcOP(); return 0; } diff --git a/tools/op.cmake b/tools/op.cmake index 52d745565cedc81a0eeac49dda56dab08ffa1dc0..34e74a22e5d3db3fc94f91f713c1251c7de3122e 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -214,6 +214,7 @@ if(NOT FOUND_MATCH) set(FUSION_CONVADDPRELU_OP ON) set(FUSION_CONVADDRELU_OP ON) set(FUSION_CONVADDRELU_INT8_OP ON) + set(FUSION_FC_INT8_OP ON) set(FUSION_FC_OP ON) set(LRN_OP ON) set(MUL_OP ON) @@ -322,6 +323,9 @@ endif() if (FUSION_FC_OP) add_definitions(-DFUSION_FC_OP) endif() +if(FUSION_FC_INT8_OP) + add_definitions(-DFUSION_FC_INT8_OP) +endif() if (LRN_OP) add_definitions(-DLRN_OP) endif()
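
Usage sketch (illustrative, not part of the patch): the addOnRow flag threaded through math::matmul, Gemm::Sgemm / Sgemm_omp, InnerKernelWithBias and the new WriteWithAddScaleT write-back lets the int8 GEMM add a length-n bias along each row of the output (one value per output column), which is the layout the new fusion_fc_int8 op needs, while addOnRow = false keeps the original length-m, per-row bias used by the conv path (WriteWithAddScale). The minimal driver below mirrors the way test/common/test_gemm_int8_accuracy.cpp exercises this path; the matrix sizes, fill values and requantization scale are made up for illustration, and it assumes the armv7 NEON build in which WriteWithAddScaleT has a real implementation.

    #include <cstdint>
    #include <vector>
    #include "operators/math/gemm.h"

    int main() {
      const int32_t m = 4, n = 8, k = 16;
      const float scale = 0.05f;  // requantization scale (illustrative value)

      std::vector<int8_t> a(m * k, 1);   // A: m x k, lda = k
      std::vector<int8_t> b(k * n, 1);   // B: k x n, ldb = n
      std::vector<int8_t> c(m * n, 0);   // C: m x n int8 output, ldc = n
      std::vector<int32_t> bias(n, 2);   // one bias value per output column

      paddle_mobile::operators::math::Gemm gemm;
      // addOnRow = true: the length-n bias is broadcast along each row of C,
      // dispatching to WriteWithAddScaleT instead of WriteWithAddScale.
      gemm.Sgemm(m, n, k, scale, a.data(), k, b.data(), n,
                 static_cast<float>(0), c.data(), n, /*relu=*/false,
                 bias.data(), /*addOnRow=*/true);
      return 0;
    }

With addOnRow = false and a length-m bias, the same call falls back to the original per-row (column-wise broadcast) path, as in the existing fusion_conv_add_relu_int8 kernel.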