Commit 6ce11736 authored by Jiaying Zhao and committed by GitHub

Merge pull request #1336 from wzzju/add_fusion_fc_int8_op

add fusion fc int8_t op and its UT.
......@@ -32,6 +32,7 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
const char *G_OP_TYPE_FC_INT8 = "fusion_fc_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
......@@ -111,12 +112,13 @@ std::unordered_map<
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_FC_INT8, {{"X", "Y", "Z", "Scale"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
......
......@@ -103,6 +103,7 @@ extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FC_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;
......
......@@ -22,19 +22,19 @@ namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionConvAddReluInt8Op
: public framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam<DeviceType>,
operators::ConvAddReluKernel<DeviceType, T>> {
: public framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>> {
public:
FusionConvAddReluInt8Op(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam<DeviceType>,
operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
: framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionFcInt8Op<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
auto y_dims = this->param_.InputY()->dims();
int x_num_col_dims = this->param_.XNumColDims();
int y_num_col_dims = this->param_.YNumColDims();
assert(x_dims.size() > x_num_col_dims);
assert(y_dims.size() > y_num_col_dims);
/// (1,2,3,4) , x_num_col_dims = 2 -> (2,12)
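/// e.g. with x_dims = (1,2,3,4) and x_num_col_dims = 2, x_mat_dims = (2,12); if y_dims = (12,5)
/// and y_num_col_dims = 1, y_mat_dims = (12,5) and the inferred output dims are (1,2,5).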
auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
assert(x_mat_dims[1] == y_mat_dims[0]);
std::vector<int64_t> output_dims;
output_dims.reserve(
static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
for (int i = 0; i < x_num_col_dims; ++i) {
output_dims.push_back(x_dims[i]);
}
for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
output_dims.push_back(y_dims[i]);
}
framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op);
#endif
#endif // FUSION_FC_INT8_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionFcInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>> {
public:
FusionFcInt8Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
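// Note: FusionFcInt8Op reuses FusionFcParam and FusionFcKernel from the float fusion_fc op;
// instantiating the kernel with T = int8_t is what selects the quantized compute path.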
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_FC_INT8_OP
......@@ -27,10 +27,27 @@ bool FusionFcKernel<CPU, float>::Init(FusionFcParam<CPU> *param) {
template <>
void FusionFcKernel<CPU, float>::Compute(const FusionFcParam<CPU> &param) {
FusionFcCompute<float>(param);
FusionFcCompute<float, float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
template class FusionFcKernel<CPU, float>;
#ifdef FUSION_FC_INT8_OP
template <>
bool FusionFcKernel<CPU, int8_t>::Init(FusionFcParam<CPU> *param) {
return true;
}
template <>
void FusionFcKernel<CPU, int8_t>::Compute(const FusionFcParam<CPU> &param) {
FusionFcCompute<int8_t, int32_t>(param);
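// The <int8_t, int32_t> instantiation takes int8 X/Y, an int32 bias Z, and writes an int8
// output scaled by the Scale input (see FusionFcCompute).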
param.Out()->set_lod(param.InputX()->lod());
}
template class FusionFcKernel<CPU, int8_t>;
#endif
} // namespace operators
} // namespace paddle_mobile
......
......@@ -39,8 +39,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
float beta = 1.0f;
#ifdef FUSION_CONVADDRELU_INT8_OP
Tensor scale = *param.InputScale();
alpha = scale.data<float>()[0];
alpha = param.InputScale()->data<float>()[0];
beta = 0.0f;
#endif
......
......@@ -15,23 +15,29 @@ limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once
#include <type_traits>
#include "operators/math/math_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
template <typename P, typename S>
void FusionFcCompute(const FusionFcParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
const Tensor *input_z = param.InputZ();
auto *input_z_data = input_z->data<float>();
Tensor *input_z = param.InputZ();
S *input_z_data = input_z->data<S>();
int axis = param.Axis();
Tensor *out = param.Out();
// int m = out->dims()[0];
// int n = out->dims()[1];
auto *out_data = out->mutable_data<float>();
auto *out_data = out->mutable_data<P>();
float alpha = 1.0f;
float beta = 1.0f;
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
......@@ -51,21 +57,28 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
}
if (std::is_same<P, int8_t>::value) {
#ifdef FUSION_FC_INT8_OP
alpha = param.InputScale()->data<float>()[0];
beta = 0.0f;
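// With the quantization scale folded into alpha and beta set to zero, the matmul below
// computes Out = saturate_int8(alpha * (X * Y + Z)); the int32 bias Z is broadcast along
// each output row (addOnRow == true), so its length must match the output width.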
math::matmul(x_matrix, false, y_matrix, false, alpha, out, beta, false,
input_z_data, true);
#endif
} else {
// The size of bias_data matches the second dimension of out
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data,
sizeof(float) * classes);
}
// for (int i = 0; i < out->numel(); i++) {
// DLOG << out_data[i];
// }
// The size of bias_data matches the dimension of out
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1), false);
math::matmul<float>(x_matrix, false, y_matrix, false, alpha, out, beta,
false);
}
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
} // namespace operators
......
......@@ -2924,7 +2924,6 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#endif // __ARM_NEON
// 32-bit float matrix multiplication
template <>
void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias) {
......@@ -3147,7 +3146,6 @@ void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
}
// 32-bit float matrix multiplication
template <>
void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias) {
......
......@@ -167,14 +167,25 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *new_bias);
*/
// 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu,
float *bias);
// 32-bit float matrix multiplication, with batchnorm applied to the result
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias);
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
// 32-bit float matrix multiplication (OpenMP multi-threaded version)
void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias);
// 32-bit float matrix multiplication, with batchnorm applied to the result (OpenMP multi-threaded version)
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
......@@ -202,7 +213,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
template <typename Otype>
void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a,
const int8_t *b, float beta, int32_t *c, Otype *C,
int32_t ldc, bool relu, int32_t *bias);
int32_t ldc, bool relu, int32_t *bias,
bool addOnRow = false);
// 8 bits int pack function
void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
......@@ -228,28 +240,32 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
template <typename Itype, typename Btype, typename Otype>
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, Btype *bias);
int32_t ldc, bool relu, Btype *bias, bool addOnRow = false);
template <typename Otype>
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
Otype *C, int32_t ldc, bool relu, int32_t *bias);
Otype *C, int32_t ldc, bool relu, int32_t *bias,
bool addOnRow = false);
template <typename Itype, typename Btype, typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, Btype *bias);
int32_t ldc, bool relu, Btype *bias, bool addOnRow = false);
template <typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, int32_t *bias);
int32_t ldc, bool relu, int32_t *bias, bool addOnRow = false);
// 8 bits int write back
// C = A * B
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
// C = A * B + bias, scale * relu(C)
void WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale);
// C = A * B + bias, scale * C
// C = A * B + bias, scale * C, bias is added on column
void WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale);
// C = A * B + bias, scale * C, bias is added on row
void WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale);
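// Reference semantics of the two write-back variants (illustrative):
//   WriteWithAddScale : C(i, j) = saturate_int8(scale * (c(i, j) + bias[i]))  // one bias per row
//   WriteWithAddScaleT: C(i, j) = saturate_int8(scale * (c(i, j) + bias[j]))  // one bias per column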
private:
int MC = 0;
......@@ -273,7 +289,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
template <typename Otype>
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
Otype *C, int32_t ldc, bool relu, int32_t *bias) {
Otype *C, int32_t ldc, bool relu, int32_t *bias,
bool addOnRow) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int32_t L1 = 32 * 1024;
......@@ -322,8 +339,15 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu);
} else {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + i);
if (addOnRow) {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + j,
addOnRow);
} else {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + i,
addOnRow);
}
}
}
}
......@@ -339,7 +363,7 @@ template <typename Otype>
void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
float beta, Otype *C, int32_t ldc, bool relu,
int32_t *bias) {
int32_t *bias, bool addOnRow) {
#ifdef _OPENMP
int32_t max_threads = omp_get_max_threads();
#else
......@@ -422,8 +446,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu);
} else {
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu, bias + i);
if (addOnRow) {
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta,
local_C, &C(i, 0), ldc, relu, bias, addOnRow);
} else {
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta,
local_C, &C(i, 0), ldc, relu, bias + i, addOnRow);
}
}
}
} else {
......@@ -447,8 +476,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu);
} else {
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu, bias);
if (addOnRow) {
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta,
local_C, &C(0, j), ldc, relu, bias + j, addOnRow);
} else {
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta,
local_C, &C(0, j), ldc, relu, bias, addOnRow);
}
}
}
}
......
......@@ -699,7 +699,7 @@ template <>
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
const int8_t *a, const int8_t *b, float beta,
int32_t *c, int8_t *C, int32_t ldc, bool relu,
int32_t *bias) {
int32_t *bias, bool addOnRow) {
#pragma omp parallel for
for (int32_t j = 0; j < nc; j += NR_INT8) {
for (int32_t i = 0; i < mc; i += MR_INT8) {
......@@ -716,7 +716,11 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
WriteWithAddReluScale(mc, nc, c, C, ldc, bias, alpha);
return;
} else {
WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha);
if (addOnRow) {
WriteWithAddScaleT(mc, nc, c, C, ldc, bias, alpha);
} else {
WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha);
}
}
}
......@@ -724,7 +728,7 @@ template <>
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
const int8_t *a, const int8_t *b, float beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int32_t *bias) {}
int32_t *bias, bool addOnRow) {}
// 8 bits int PackMatrixA_4r
void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail,
......@@ -1159,14 +1163,13 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
#endif // __ARM_NEON
}
// C = A * B + bias, scale * C
// C = A * B + bias, scale * C, bias is added on column
void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int32_t zero = 0;
int8_t narrow = -128;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
......@@ -1184,7 +1187,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"loop_mc_%=: \n\t"
"vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t"
......@@ -1222,9 +1224,9 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [zero] "r"(zero), [narrow] "r"(narrow)
[scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q7", "q12", "q13", "q14", "q15");
"q7", "q12", "q13", "q15");
}
int32_t nc_left;
......@@ -1239,7 +1241,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"vdup.32 q13, %[bias_v] \n\t"
"cmp %[_nc1], #4 \n\t"
......@@ -1260,7 +1261,7 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vld1.32 {q0}, [%[c0]] \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
......@@ -1277,17 +1278,138 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0),
[bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero),
[narrow] "r"(narrow)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
"q15");
[bias_v] "r"(bias_v), [scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
// C = A * B + bias, scale * C, bias is added on row
void Gemm::WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int8_t narrow = -128;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"loop_mc_%=: \n\t"
"mov r4, %[bias_ptr] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q13, q14}, [r4]! \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q14 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vceq.s8 d15, d14, d24 \n\t"
"vsub.s8 d14, d14, d15 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "r4", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5",
"q6", "q7", "q12", "q13", "q15");
}
int32_t nc_left;
int32_t *c0;
int8_t *C0;
int32_t *volatile bias0 = bias_ptr + nc1 * 8;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vld1.32 {q13}, [%[bias0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]] \n\t"
"vld1.32 {q13}, [%[bias0]] \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), [bias0] "r"(bias0),
[scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
// C = A * B + bias, scale * relu(C)
// C = A * B + bias, scale * relu(C), bias is added on column
void Gemm::WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
......
......@@ -34,7 +34,7 @@ template <typename T, typename S>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
S *bias = nullptr);
S *bias = nullptr, bool addOnRow = false);
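// addOnRow == false: bias holds matrix_out->dims()[0] values and bias[i] is added to row i;
// addOnRow == true : bias holds matrix_out->dims()[1] values and bias[j] is added to column j.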
template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
......@@ -24,8 +24,8 @@ namespace math {
template <>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu,
int32_t *bias) {
framework::Tensor *matrix_out, float beta, bool relu, int32_t *bias,
bool addOnRow) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -55,18 +55,18 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
#ifdef _OPENMP
if (bias != nullptr) {
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias);
matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
} else {
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
}
#else
if (bias != nullptr) {
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias);
matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
} else {
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
}
#endif
} else {
......@@ -74,21 +74,21 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
if (bias != nullptr) {
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias);
matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
} else {
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
}
#else
if (bias != nullptr) {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int8_t>(),
N, relu, bias);
N, relu, bias, addOnRow);
} else {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(),
N, relu, bias);
N, relu, bias, addOnRow);
}
#endif
}
......
......@@ -1632,6 +1632,10 @@ class FusionFcParam : public OpParam {
x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
axis_ = GetAttr<int>("axis", attrs);
#ifdef FUSION_FC_INT8_OP
scale_ = InputScaleFrom<GType>(inputs, scope);
#endif
}
GType *InputX() const { return input_x_; }
......@@ -1655,8 +1659,16 @@ class FusionFcParam : public OpParam {
int x_num_col_dims_;
int y_num_col_dims_;
int axis_;
#ifdef PADDLE_MOBILE_FPGA
#ifdef FUSION_FC_INT8_OP
public:
const RType *InputScale() const { return scale_; }
private:
RType *scale_;
#endif
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
......@@ -1717,7 +1729,7 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
typedef typename DtypeTensorTrait<DeviceType>::rtype RType;
const RType *InputScale() const { return scale_; }
protected:
private:
RType *scale_;
#endif
};
......
......@@ -25,7 +25,7 @@ limitations under the License. */
#define c(i, j) c[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
void print_matirx(int m, int n, int ldc, float *c) {
void print_matrix(int m, int n, int ldc, float *c) {
for (int i = 0; i < m; ++i) {
std::cout << c(i, 0);
for (int j = 1; j < n; ++j) {
......@@ -98,18 +98,20 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
print_matrix(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
print_matrix(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
print_matrix(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
print_matrix(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm failed!");
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <iostream>
#include <limits>
#include <random>
#include <type_traits>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
......@@ -33,24 +34,32 @@ limitations under the License. */
using std::default_random_engine;
using std::uniform_int_distribution;
void print_matirx(int m, int n, int ldc, int32_t *c) {
template <typename T>
void print_matrix(int m, int n, int ldc, T *c) {
for (int i = 0; i < m; ++i) {
std::cout << c(i, 0);
for (int j = 1; j < n; ++j) {
std::cout << " | " << c(i, j);
if (std::is_same<T, int8_t>::value) {
std::cout.setf(std::ios::left);
std::cout.width(4);
std::cout << static_cast<int32_t>(c(i, 0));
} else {
std::cout.setf(std::ios::left);
std::cout.width(6);
std::cout << c(i, 0);
}
std::cout << std::endl;
}
std::cout << std::endl;
}
void print_matirx(int m, int n, int ldc, int8_t *c) {
for (int i = 0; i < m; ++i) {
std::cout << static_cast<int32_t>(c(i, 0));
for (int j = 1; j < n; ++j) {
std::cout << " | " << static_cast<int32_t>(c(i, j));
if (std::is_same<T, int8_t>::value) {
std::cout << " | ";
std::cout.setf(std::ios::left);
std::cout.width(4);
std::cout << static_cast<int32_t>(c(i, j));
} else {
std::cout << " | ";
std::cout.setf(std::ios::left);
std::cout.width(6);
std::cout << c(i, j);
}
}
std::cout << std::endl;
std::cout << "\n";
}
std::cout << std::endl;
}
......@@ -138,18 +147,20 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
print_matrix(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
print_matrix(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
print_matrix(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
print_matrix(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm failed!");
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
......@@ -158,7 +169,8 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
return 0;
}
int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr,
bool addOnRow = false) {
int lda = k;
int ldb = n;
int ldc = n;
......@@ -174,8 +186,14 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
int8_t *c1 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n));
int32_t *bias =
static_cast<int32_t *>(paddle_mobile::memory::Alloc(sizeof(int32_t) * m));
int32_t *bias = nullptr;
if (addOnRow) {
bias = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * n));
} else {
bias = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * m));
}
for (int i = 0; i < m * k; ++i) {
a[i] = pixel(e);
......@@ -183,29 +201,48 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
for (int i = 0; i < k * n; ++i) {
b[i] = pixel(e);
}
for (int i = 0; i < m; ++i) {
bias[i] = static_cast<int32_t>(pixel(e));
}
for (int i = 0; i < m; ++i) {
int32_t bias_v = bias[i];
for (int j = 0; j < n; ++j) {
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
if (addOnRow) {
for (int i = 0; i < n; ++i) {
bias[i] = static_cast<int32_t>(pixel(e));
}
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
int32_t bias_v = bias[j];
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
r = qadd_int32(r, bias_v);
if (relu) r = std::max(0, r);
c1(i, j) = qscale_int32(r, scale);
}
}
} else {
for (int i = 0; i < m; ++i) {
bias[i] = static_cast<int32_t>(pixel(e));
}
for (int i = 0; i < m; ++i) {
int32_t bias_v = bias[i];
for (int j = 0; j < n; ++j) {
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
r = qadd_int32(r, bias_v);
if (relu) r = std::max(0, r);
c1(i, j) = qscale_int32(r, scale);
}
r = qadd_int32(r, bias_v);
if (relu) r = std::max(0, r);
c1(i, j) = qscale_int32(r, scale);
}
}
paddle_mobile::operators::math::Gemm gemm;
#ifdef _OPENMP
gemm.Sgemm_omp(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias);
relu, bias, addOnRow);
#else
gemm.Sgemm(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias);
relu, bias, addOnRow);
#endif
int eq = 0;
int neq = 0;
......@@ -219,20 +256,27 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
print_matrix(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
print_matrix(k, n, ldb, b);
std::cout << "Bias:" << std::endl;
print_matirx(m, 1, 1, bias);
if (addOnRow) {
print_matrix(1, n, n, bias);
} else {
print_matrix(m, 1, 1, bias);
}
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
print_matrix(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
print_matrix(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
PADDLE_MOBILE_ENFORCE(neq == 0,
"The execution of do_sgemm_with_bias is failed!");
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
......@@ -261,7 +305,7 @@ int main() {
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with bias:" << std::endl;
std::cout << "Test gemm with bias(bias is added on column):" << std::endl;
do_sgemm_with_bias(9, 9, 9, false, 1);
do_sgemm_with_bias(10, 6, 12, false, 0);
do_sgemm_with_bias(512, 256, 384, false, 0);
......@@ -272,6 +316,19 @@ int main() {
do_sgemm_with_bias(333, 797, 939, false, 0);
do_sgemm_with_bias(1024, 1024, 1024, false, 0);
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with bias(bias is added on row):" << std::endl;
do_sgemm_with_bias(9, 9, 9, false, 1, true);
do_sgemm_with_bias(10, 6, 12, false, 0, true);
do_sgemm_with_bias(512, 256, 384, false, 0, true);
do_sgemm_with_bias(1366, 768, 256, false, 0, true);
do_sgemm_with_bias(1255, 755, 333, false, 0, true);
do_sgemm_with_bias(599, 1133, 393, false, 0, true);
do_sgemm_with_bias(777, 555, 999, false, 0, true);
do_sgemm_with_bias(333, 797, 939, false, 0, true);
do_sgemm_with_bias(1024, 1024, 1024, false, 0, true);
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with relu and bias:" << std::endl;
......
......@@ -49,7 +49,8 @@ int main() {
auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
auto ccptr_int32 = cc_int32.mutable_data<int32_t>({m, n});
auto ccptr_int8 = cc_int8.mutable_data<int8_t>({m, n});
int32_t* bias_data = new int32_t[m];
int32_t* bias_data_col = new int32_t[m];
int32_t* bias_data_row = new int32_t[n];
for (int i = 0; i < m * k; ++i) {
aaptr_int8[i] = static_cast<int8_t>(2);
......@@ -62,7 +63,11 @@ int main() {
}
for (int i = 0; i < m; ++i) {
bias_data[i] = 2;
bias_data_col[i] = 2;
}
for (int i = 0; i < n; ++i) {
bias_data_row[i] = 2;
}
// float
......@@ -73,14 +78,15 @@ int main() {
false, nullptr);
}
auto time1 = time();
auto time_start0 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
}
auto time2 = time();
std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
auto time_end0 = time();
std::cout << "float gemm cost :" << time_diff(time_start0, time_end0) / 10
<< "ms\n";
// int8_t without bias
// warm-up 10 times
......@@ -90,33 +96,69 @@ int main() {
static_cast<float>(0));
}
auto time3 = time();
auto time_start1 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
auto time4 = time();
std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
auto time_end1 = time();
std::cout << "int8_t gemm cost :" << time_diff(time_start1, time_end1) / 10
<< "ms\n";
// int8_t with bias, column element wise add
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false);
}
auto time_start2 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false);
}
auto time_end2 = time();
std::cout << "int8_t gemm_with_bias(column add) cost :"
<< time_diff(time_start2, time_end2) / 10 << "ms\n";
// int8_t with bias, row element wise add
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true);
}
auto time_start3 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true);
}
auto time_end3 = time();
std::cout << "int8_t gemm_with_bias(row add) cost :"
<< time_diff(time_start3, time_end3) / 10 << "ms\n";
// int8_t with bias&relu
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data);
static_cast<float>(0), true, bias_data_col, false);
}
auto time5 = time();
auto time_start4 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data);
static_cast<float>(0), true, bias_data_col, false);
}
auto time6 = time();
auto time_end4 = time();
std::cout << "int8_t gemm_with_bias_relu cost :"
<< time_diff(time5, time6) / 10 << "ms\n";
<< time_diff(time_start4, time_end4) / 10 << "ms\n";
delete[] bias_data;
delete[] bias_data_row;
delete[] bias_data_col;
return 0;
}
......@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <iostream>
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <limits>
#include "../test_helper.h"
#include "../test_include.h"
......@@ -356,5 +356,9 @@ int main(int argc, char *argv[]) {
paddle_mobile::TestConvOp<int8_t, 5, 2, 1>(in_channels, in_height, in_width,
out_channels);
}
#else
int main() {
std::cout << "FUSION_CONVADDRELU_INT8_OP is not defined!" << std::endl;
return 0;
}
#endif
......@@ -12,147 +12,163 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <framework/program/program-optimize/program_optimize.h>
#include <iostream>
#include <type_traits>
#include "../test_helper.h"
#include "../test_include.h"
#include "framework/operator.h"
#include "operators/fusion_fc_int8_op.h"
#include "operators/fusion_fc_op.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
namespace paddle_mobile {
namespace framework {
using framework::AttributeMap;
using framework::DDim;
using framework::Scope;
using framework::make_ddim;
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
template <typename Dtype>
class TestFcOp {
public:
explicit TestFcOp(const Program<Dtype> p) : program_(p) {
use_optimize_ = true;
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
// round to zero
float round2zero(float v) {
float res = 0.f;  // initialize so that v == 0 does not return an indeterminate value
if (v > 0)
res = std::floor(v);
else if (v < 0)
res = std::ceil(v);
return res;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "fc" && op->Input("X")[0] == "pool2d_13.tmp_0") {
DLOG << " fc attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Input Y is : " << op->Input("Y")[0];
DLOG << " Input Y is : " << op->Input("Z")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
std::shared_ptr<operators::FusionFcOp<Dtype, float>> testOp =
std::make_shared<operators::FusionFcOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(testOp);
int8_t qscale_int32(int32_t v, float scale) {
float res = static_cast<float>(v) * scale;
res = round2zero(res);
if (res > 127)
return static_cast<int8_t>(127);
else if (res < -127)
return static_cast<int8_t>(-127);
else
return static_cast<int8_t>(res);
}
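// Example behaviour of the helpers above: qadd_int32(INT32_MAX, 1) saturates to INT32_MAX;
// qscale_int32(1000, 0.1f) yields 100 (scaled then truncated toward zero); qscale_int32(2000, 0.1f)
// yields 127, clamped to the [-127, 127] range used by the int8 write-back kernels.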
template <typename T, typename S>
int TestFcOP() {
int32_t m = 377;
int32_t n = 1363;
int32_t k = 577;
int32_t lda = k;
int32_t ldb = n;
int32_t ldc = n;
DDim inputA_shape = make_ddim({m, k});
DDim inputB_shape = make_ddim({k, n});
DDim bias_shape = make_ddim({n});
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<Scope>();
inputs["X"] = std::vector<std::string>({"inputA"});
inputs["Y"] = std::vector<std::string>({"inputB"});
inputs["Z"] = std::vector<std::string>({"bias"});
inputs["Scale"] = std::vector<std::string>({"scale"});
outputs["Out"] = std::vector<std::string>({"output"});
auto inputA_var = scope.get()->Var("inputA");
auto inputA = inputA_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputA, inputA_shape, -127, 127);
auto inputB_var = scope.get()->Var("inputB");
auto inputB = inputB_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputB, inputB_shape, -127, 127);
auto bias_var = scope.get()->Var("bias");
auto bias = bias_var->template GetMutable<framework::LoDTensor>();
SetupTensor<S>(bias, bias_shape, -127, 127);
auto scale_var = scope.get()->Var("scale");
auto scale = scale_var->template GetMutable<framework::LoDTensor>();
scale->Resize(framework::make_ddim({1}));
float scale_v = 0.000828f;
scale->mutable_data<float>()[0] = scale_v;
auto output_var = scope.get()->Var("output");
AttributeMap attrs;
attrs["x_num_col_dims"].Set<int>(1);
attrs["y_num_col_dims"].Set<int>(1);
attrs["axis"].Set<int>(1);
operators::OperatorBase<CPU> *op = nullptr;
#ifdef FUSION_FC_INT8_OP
if (std::is_same<T, int8_t>::value) {
op = new operators::FusionFcInt8Op<CPU, T>("fusion_fc_int8", inputs,
outputs, attrs, scope);
} else {
op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
scope);
}
#else
op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
scope);
#endif
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
// compare
T *c = static_cast<T *>(memory::Alloc(sizeof(T) * m * n));
T *a = inputA->data<T>();
T *b = inputB->data<T>();
S *bias_data = bias->data<S>();
for (int32_t i = 0; i < m; ++i) {
for (int32_t j = 0; j < n; ++j) {
S bias_v = bias_data[j];
if (std::is_same<T, int8_t>::value) {
int32_t r = 0;
for (int32_t p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
r = qadd_int32(r, bias_v);
c(i, j) = qscale_int32(r, scale_v);
} else {
T r = 0;
for (int32_t p = 0; p < k; p++) {
r += a(i, p) * b(p, j);
}
r += bias_v;
c(i, j) = r;
}
}
}
std::shared_ptr<Tensor> predict(const Tensor &t1, const Tensor &t2,
const Tensor &t3) {
// feed
auto scope = program_.scope;
Variable *x_feed_value = scope->Var("pool2d_13.tmp_0");
auto tensor_x = x_feed_value->GetMutable<LoDTensor>();
tensor_x->ShareDataWith(t1);
Variable *y_feed_value = scope->Var("loss3_classifier-loc_weights");
auto tensor_y = y_feed_value->GetMutable<LoDTensor>();
tensor_y->ShareDataWith(t2);
Variable *z_feed_value = scope->Var("loss3_classifier-loc_biases");
auto tensor_z = z_feed_value->GetMutable<LoDTensor>();
tensor_z->ShareDataWith(t3);
Variable *con_output = scope->Var("loss3_classifier-loc.tmp_1");
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({3, 10});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<LoDTensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict(t1, t2, t3, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict(const Tensor &t1, const Tensor &t2, const Tensor &t3,
int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
int32_t eq = 0;
int32_t neq = 0;
for (int32_t i = 0; i < m * n; ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == c[i],
"The execution of test_fusion_fc_op is failed!");
if (output_data[i] == c[i]) {
++eq;
} else {
++neq;
}
}
};
template class TestFcOp<CPU>;
} // namespace framework
std::cout << "mnk=" << m << " " << n << " " << k << " eq=" << eq
<< " neq=" << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run Fc Test";
paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
// "../../../test/models/googlenet"
auto program = loader.Load(g_googlenet);
paddle_mobile::framework::ProgramOptimize optimize;
// program.originProgram->Description("origin");
auto optimize_program = optimize.FusionOptimize(program.originProgram);
program.optimizeProgram = optimize_program;
if (optimize_program != nullptr) {
optimize_program->Description("optimize");
} else {
LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null";
}
/// input x (1,3,224,224)
paddle_mobile::framework::LoDTensor inputx;
SetupTensor<float>(&inputx, {3, 64, 1, 1}, static_cast<float>(1),
static_cast<float>(1));
auto *inputx_ptr = inputx.data<float>();
/// input y (224,)
paddle_mobile::framework::LoDTensor inputy;
SetupTensor<float>(&inputy, {64, 10}, static_cast<float>(1.5),
static_cast<float>(1.5));
auto *inputy_ptr = inputy.data<float>();
paddle_mobile::framework::LoDTensor inputz;
SetupTensor<float>(&inputz, {10}, static_cast<float>(0),
static_cast<float>(1));
auto *inputz_ptr = inputz.data<float>();
paddle_mobile::framework::TestFcOp<paddle_mobile::CPU> testFcOp(program);
auto output = testFcOp.predict(inputx, inputy, inputz);
auto *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << "value of output: " << output_ptr[j];
}
DLOG << "1 (3,64) * 2 (64,10) = 96(3,10)";
DLOG << "output : 96(3,10) + bias(10)";
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
#ifdef FUSION_FC_INT8_OP
paddle_mobile::TestFcOP<int8_t, int32_t>();
#endif
paddle_mobile::TestFcOP<float, float>();
return 0;
}
......@@ -214,6 +214,7 @@ if(NOT FOUND_MATCH)
set(FUSION_CONVADDPRELU_OP ON)
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_CONVADDRELU_INT8_OP ON)
set(FUSION_FC_INT8_OP ON)
set(FUSION_FC_OP ON)
set(LRN_OP ON)
set(MUL_OP ON)
......@@ -322,6 +323,9 @@ endif()
if (FUSION_FC_OP)
add_definitions(-DFUSION_FC_OP)
endif()
if(FUSION_FC_INT8_OP)
add_definitions(-DFUSION_FC_INT8_OP)
endif()
if (LRN_OP)
add_definitions(-DLRN_OP)
endif()
......