Commit f17e8bf5 authored by ZhenWang

Update the code style according to suggestions.

Parent 4e2eaa77
......@@ -34,7 +34,7 @@ endif()
if(DEBUGING)
message(STATUS "debugging mode")
add_definitions(-DPADDLE_MOBILE_DEBUG)
# add_definitions(-DPADDLE_MOBILE_DEBUG)
else()
endif()
......
......@@ -114,7 +114,7 @@ std::unordered_map<
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
......
......@@ -153,7 +153,8 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
paddle_mobile::operators::math::Gemm gemm;
auto time1 = paddle_mobile::time();
gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
static_cast<float>(0), c, ldc, false, nullptr);
static_cast<float>(0), c, ldc, false,
static_cast<float *>(nullptr));
auto time2 = paddle_mobile::time();
double cost = paddle_mobile::time_diff(time1, time2);
paddle_mobile::memory::Free(a);
......
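The explicit cast matters because Sgemm now has variants that differ only in the bias pointer type, so a bare nullptr cannot pick one. A minimal standalone sketch of the ambiguity, using a hypothetical sgemm_like in place of the real overload set:

#include <cstdint>

void sgemm_like(float *bias) {}    // fp32-bias variant
void sgemm_like(int32_t *bias) {}  // int8 path takes an int32 bias

int main() {
  // sgemm_like(nullptr);                       // error: ambiguous call
  sgemm_like(static_cast<float *>(nullptr));    // OK: selects the float overload
  sgemm_like(static_cast<int32_t *>(nullptr));  // OK: selects the int32 overload
}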
......@@ -16,28 +16,26 @@ limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/conv_add_relu_int8_kernel.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class FusionConvAddReluInt8Op
: public framework::OperatorWithKernel<
DeviceType, FusionConvAddReluInt8Param<DeviceType>,
operators::ConvAddReluInt8Kernel<DeviceType, T>> {
DeviceType, FusionConvAddReluParam<DeviceType>,
operators::ConvAddReluKernel<DeviceType, T>> {
public:
FusionConvAddReluInt8Op(const string &type, const VariableNameMap &inputs,
FusionConvAddReluInt8Op(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddReluInt8Param<DeviceType>,
operators::ConvAddReluInt8Kernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
DeviceType, FusionConvAddReluParam<DeviceType>,
operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
......
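With this change the int8 op reuses FusionConvAddReluParam and ConvAddReluKernel instead of carrying its own param/kernel pair; per-type behavior comes from explicit specializations. A simplified sketch of that dispatch pattern (ConvAddReluKernelSketch is a stand-in, not the library's declaration):

#include <cstdint>

// One kernel template; each element type supplies its own Compute body via
// explicit specialization, so the int8 op needs no separate kernel class.
template <typename T>
struct ConvAddReluKernelSketch {
  void Compute();
};

template <>
void ConvAddReluKernelSketch<float>::Compute() { /* fp32 conv + add + relu */ }

template <>
void ConvAddReluKernelSketch<int8_t>::Compute() { /* int8 conv + add + relu */ }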
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include "operators/kernel/conv_add_relu_int8_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_relu_int8_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddReluInt8Kernel<CPU, int8_t>::Init(
FusionConvAddReluInt8Param<CPU> *param) {
return true;
}
template <>
void ConvAddReluInt8Kernel<CPU, int8_t>::Compute(
const FusionConvAddReluInt8Param<CPU> &param) {
ConvAddReluInt8Compute<int8_t>(param);
}
template class ConvAddReluInt8Kernel<CPU, int8_t>;
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDRELU_INT8_OP
......@@ -28,10 +28,24 @@ bool ConvAddReluKernel<CPU, float>::Init(FusionConvAddReluParam<CPU> *param) {
template <>
void ConvAddReluKernel<CPU, float>::Compute(
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<float>(param);
ConvAddReluCompute<float, float>(param);
}
template class ConvAddReluKernel<CPU, float>;
#ifdef FUSION_CONVADDRELU_INT8_OP
template <>
bool ConvAddReluKernel<CPU, int8_t>::Init(FusionConvAddReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddReluKernel<CPU, int8_t>::Compute(
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<int8_t, int32_t>(param);
}
template class ConvAddReluKernel<CPU, int8_t>;
#endif
} // namespace operators
} // namespace paddle_mobile
......
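In the <P, S> pair introduced here, P selects the input/output element type and S the bias type; the instantiations this commit uses are:

// ConvAddReluCompute<float,  float>    -> fp32 data with fp32 bias
// ConvAddReluCompute<int8_t, int32_t>  -> int8 data with int32 bias/accumulator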
......@@ -25,22 +25,31 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <typename P>
template <typename P, typename S>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
int axis = param.Axis();
int32_t axis = param.Axis();
S *bias_data = bias.data<S>();
Tensor *output = param.Output();
float *biase_data = bias.data<float>();
output->mutable_data<P>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
float alpha = 1.0f;
float beta = 1.0f;
const int batch_size = static_cast<int>(input->dims()[0]);
#ifdef FUSION_CONVADDRELU_INT8_OP
Tensor scale = *param.InputScale();
alpha = scale.data<float>()[0];
beta = 0.0f;
#endif
int32_t groups = param.Groups();
std::vector<int32_t> strides = param.Strides();
std::vector<int32_t> paddings = param.Paddings();
std::vector<int32_t> dilations = param.Dilations();
const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
......@@ -62,13 +71,13 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col.mutable_data<P>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
input->dims(), 1, static_cast<int32_t>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
......@@ -78,17 +87,17 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
math::Vol2ColFunctor<CPU, P> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col;
for (int i = 0; i < batch_size; i++) {
for (int32_t i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
for (int32_t g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
......@@ -98,8 +107,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
......@@ -109,9 +118,9 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), true, biase_data);
math::matmul(filter_slice, false, col_matrix, false, alpha, &out_slice,
beta, true, bias_data);
}
}
}
......
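The alpha/beta choice encodes the epilogue: the float path keeps alpha = beta = 1 and adds the fp32 bias, while the int8 path sets alpha to the input scale and beta to 0. A standalone sketch of the int8 write-back this implies; requantize is a hypothetical helper, the real epilogue lives inside math::matmul:

#include <algorithm>
#include <cstdint>

// Hedged sketch, assuming the int32 bias is added to the accumulator before
// scaling and relu is applied last (alpha = scale, beta = 0).
inline float requantize(int32_t acc, int32_t bias, float scale, bool relu) {
  float out = scale * static_cast<float>(acc + bias);  // beta == 0 drops old C
  return relu ? std::max(out, 0.0f) : out;
}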
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void ConvAddReluInt8Compute(const FusionConvAddReluInt8Param<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
Tensor scale = *param.InputScale();
int32_t axis = param.Axis();
Tensor *output = param.Output();
output->mutable_data<P>();
int32_t *bias_data = bias.data<int32_t>();
float scale_v = scale.data<float>()[0];
int32_t groups = param.Groups();
std::vector<int32_t> strides = param.Strides();
std::vector<int32_t> paddings = param.Paddings();
std::vector<int32_t> dilations = param.Dilations();
const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<P>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int32_t>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, P> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col;
for (int32_t i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int32_t g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, false, col_matrix, false, scale_v, &out_slice,
static_cast<float>(0), true, bias_data);
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDRELU_INT8_OP
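Both the float and int8 paths lower convolution to im2col (or vol2col) followed by gemm, as the comments above note. A toy single-channel im2col, assuming stride 1 and no padding or dilation, just to show the unfolding; this is an illustration, not paddle-mobile's Im2ColFunctor:

#include <cstdio>

int main() {
  const int H = 3, W = 3, K = 2;             // 3x3 input, 2x2 kernel
  const int OH = H - K + 1, OW = W - K + 1;  // 2x2 output
  float in[H][W] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  float col[K * K][OH * OW];                 // (k*k) x (out_h*out_w) matrix
  for (int kh = 0; kh < K; ++kh)
    for (int kw = 0; kw < K; ++kw)
      for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow)
          col[kh * K + kw][oh * OW + ow] = in[oh + kh][ow + kw];
  for (int r = 0; r < K * K; ++r) {          // each row holds one kernel tap
    for (int c = 0; c < OH * OW; ++c) printf("%.0f ", col[r][c]);
    printf("\n");                            // conv is now one matrix product
  }
}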
......@@ -106,16 +106,9 @@ inline void GemmConv(const ConvParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
if (param.Input()->type() == typeid(int8_t)) {
math::matmul(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(0),
false, static_cast<int32_t *>(nullptr));
} else {
math::matmul(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(0),
false, static_cast<float *>(nullptr));
}
math::matmul(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(0),
false, static_cast<Otype *>(nullptr));
}
}
}
......
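Templating GemmConv on Otype (visible in the hunk above) turns the former runtime typeid() branch into a compile-time choice; assuming the same type pairing as the kernels earlier in this commit, the instantiations resolve to:

// float path: GemmConv<..., float>   -> math::matmul(..., static_cast<float *>(nullptr))
// int8 path:  GemmConv<..., int32_t> -> math::matmul(..., static_cast<int32_t *>(nullptr))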
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#pragma once
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvAddReluInt8Kernel
: public OpKernelBase<DeviceType, FusionConvAddReluInt8Param<DeviceType>> {
public:
void Compute(const FusionConvAddReluInt8Param<DeviceType> &param);
bool Init(FusionConvAddReluInt8Param<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDRELU_INT8_OP
......@@ -2924,6 +2924,7 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#endif // __ARM_NEON
// 32-bit float matrix multiplication
template <>
void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias) {
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <string>
#include "common/log.h"
#include "memory/t_malloc.h"
// Matrix element access macros, assuming row-major storage
#define A(i, j) A[(i)*lda + (j)]
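For example, with lda = 3 the macro maps a logical index into the flat row-major buffer:

// A 2x3 row-major matrix stored as {1, 2, 3, 4, 5, 6} with lda = 3:
//   A(0, 2) reads A[0 * 3 + 2] == 3  (last element of row 0)
//   A(1, 0) reads A[1 * 3 + 0] == 4  (first element of row 1)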
......@@ -163,11 +164,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *new_bias);
*/
// 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu,
float *bias);
// 32-bit float matrix multiplication, applying batchnorm to the result
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
......@@ -201,11 +197,13 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
int32_t ldc);
// 8-bit int inner product
template <typename Otype>
void InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
const int8_t *b, float beta, int32_t *c, int32_t *C,
const int8_t *b, float beta, int32_t *c, Otype *C,
int32_t ldc, bool relu);
template <typename Otype>
void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a,
const int8_t *b, float beta, int32_t *c, int8_t *C,
const int8_t *b, float beta, int32_t *c, Otype *C,
int32_t ldc, bool relu, int32_t *bias);
// 8-bit int pack function
......@@ -229,12 +227,15 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
const int8_t *B, int32_t ldb, int8_t *buffer);
// 8-bit int matrix product
template <typename Itype, typename Btype, typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, Btype *bias);
template <typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta, int32_t *C,
int32_t ldc, bool relu, int32_t *bias);
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta, int8_t *C,
int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, int32_t *bias);
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
int32_t *C, int32_t ldc, bool relu, int32_t *bias);
......@@ -266,6 +267,71 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
int8_t *zero_int8;
};
// 8-bit int matrix product (m*k x k*n)
template <typename Otype>
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
Otype *C, int32_t ldc, bool relu, int32_t *bias) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024;
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
NC = NR_INT8;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
}
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) {
InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu);
} else {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + i);
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32);
paddle_mobile::memory::Free(zero_int8);
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
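As a sanity check on the blocking arithmetic above, here is a standalone sketch of just the size computation. MR_INT8 = 4 and NR_INT8 = 2 are assumptions inferred from the PackMatrixA_4r_16 / PackMatrixB_2c_16 routines, not confirmed constants:

#include <cstdint>
#include <cstdio>

int32_t round_up(int32_t x, int32_t step) { return (x + step - 1) / step * step; }

int main() {
  const int32_t L1 = 32 * 1024, L2 = 512 * 1024;
  const int32_t MR_INT8 = 4, NR_INT8 = 2;  // assumed register-block sizes
  int32_t m = 100, n = 200, k = 100;
  int32_t KC = round_up(k, 16);  // same value as (k + 15) - ((k + 15) & 15)
  int32_t MC = L1 / KC;          // A-panel rows that fit in L1
  int32_t NC = L2 / KC;          // B-panel columns that fit in L2
  int32_t mblocks = (m + MC - 1) / MC;  // rebalance, then round to MR/NR
  MC = round_up((m + mblocks - 1) / mblocks, MR_INT8);
  int32_t nblocks = (n + NC - 1) / NC;
  NC = round_up((n + nblocks - 1) / nblocks, NR_INT8);
  printf("KC=%d MC=%d NC=%d\n", KC, MC, NC);  // prints: KC=112 MC=100 NC=200
}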
......@@ -14,7 +14,6 @@ limitations under the License. */
#include <string.h>
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"
#if __ARM_NEON
#include <arm_neon.h>
......@@ -670,6 +669,11 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
}
// 8-bit int inner product
template <>
void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
const int8_t *b, float beta, int32_t *c, int8_t *C,
int32_t ldc, bool relu) {}
template <>
void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
const int8_t *b, float beta, int32_t *c, int32_t *C,
int32_t ldc, bool relu) {
......@@ -691,6 +695,7 @@ void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
}
}
template <>
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
const int8_t *a, const int8_t *b, float beta,
int32_t *c, int8_t *C, int32_t ldc, bool relu,
......@@ -715,6 +720,12 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
}
}
template <>
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
const int8_t *a, const int8_t *b, float beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int32_t *bias) {}
// 8-bit int PackMatrixA_4r
void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) {
......@@ -1083,128 +1094,6 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
}
}
// 8-bit int matrix product (m*k x k*n)
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
int32_t *C, int32_t ldc, bool relu, int32_t *bias) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024;
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
NC = NR_INT8;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
}
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) {
InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu);
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32);
paddle_mobile::memory::Free(zero_int8);
}
// 8-bit int matrix product (m*k x k*n)
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
int8_t *C, int32_t ldc, bool relu, int32_t *bias) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024;
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
NC = NR_INT8;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
}
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias != nullptr) {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + i);
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32);
paddle_mobile::memory::Free(zero_int8);
}
// 8-bit int write back
// C = A * B
void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
......
......@@ -1705,36 +1705,19 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
FusionConvAddReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {}
};
#endif
: FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {
#ifdef FUSION_CONVADDRELU_INT8_OP
template <typename Dtype>
class FusionConvAddReluInt8Param : public ConvParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionConvAddReluInt8Param(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: ConvParam<Dtype>(inputs, outputs, attrs, scope) {
scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
bias_ = OpParam::InputYFrom<GType>(inputs, scope);
axis_ = OpParam::GetAttr<int>("axis", attrs);
#endif
}
#ifdef FUSION_CONVADDRELU_INT8_OP
typedef typename DtypeTensorTrait<DeviceType>::gtype GType;
typedef typename DtypeTensorTrait<DeviceType>::rtype RType;
const RType *InputScale() const { return scale_; }
RType *Bias() const { return bias_; }
const int &Axis() const { return axis_; }
protected:
RType *scale_;
RType *bias_;
int axis_;
#endif
};
#endif
......
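The net effect of this hunk is that FusionConvAddReluInt8Param disappears and FusionConvAddReluParam gains the scale input behind the FUSION_CONVADDRELU_INT8_OP guard. A hedged, abridged reconstruction of the merged class shape, shown as comments since the interleaved diff is hard to read; see op_param.h for the authoritative declaration:

// template <typename DeviceType>
// class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
//  public:
//   FusionConvAddReluParam(const VariableNameMap &inputs, ...)
//       : FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {
// #ifdef FUSION_CONVADDRELU_INT8_OP
//     scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
// #endif
//   }
// #ifdef FUSION_CONVADDRELU_INT8_OP
//   const RType *InputScale() const { return scale_; }
//  protected:
//   RType *scale_;
// #endif
// };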
......@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <climits>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <limits>
#include <random>
#include "../test_helper.h"
#include "common/log.h"
......@@ -57,10 +57,10 @@ void print_matirx(int m, int n, int ldc, int8_t *c) {
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > INT_MAX)
return INT_MAX;
else if (res < INT_MIN)
return INT_MIN;
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
......
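For reference, the saturating behavior of qadd_int32 on boundary inputs:

// qadd_int32(std::numeric_limits<int32_t>::max(), 1)  -> 2147483647 (clamped)
// qadd_int32(std::numeric_limits<int32_t>::min(), -1) -> -2147483648 (clamped)
// qadd_int32(-5, 3)                                    -> -2 (in-range addition)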
......@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <limits>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/fusion_conv_add_relu_int8_op.h"
......@@ -19,10 +23,10 @@ limitations under the License. */
namespace paddle_mobile {
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > INT_MAX)
return INT_MAX;
else if (res < INT_MIN)
return INT_MIN;
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
......@@ -217,8 +221,8 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
inputs["Input"] = std::vector<std::string>({"input"});
inputs["Filter"] = std::vector<std::string>({"filter"});
inputs["Scale"] = std::vector<std::string>({"scale"});
inputs["Y"] = std::vector<std::string>({"y"});
outputs["Output"] = std::vector<std::string>({"output"});
inputs["Y"] = std::vector<std::string>({"bias"});
outputs["Out"] = std::vector<std::string>({"output"});
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
......@@ -234,7 +238,7 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
float scale_v = 0.000828f;
scale->mutable_data<float>()[0] = scale_v;
auto bias_var = scope.get()->Var("y");
auto bias_var = scope.get()->Var("bias");
auto bias = bias_var->template GetMutable<framework::LoDTensor>();
SetupTensor<int32_t>(bias, bias_shape, -127, 127);
......@@ -352,3 +356,5 @@ int main(int argc, char *argv[]) {
paddle_mobile::TestConvOp<int8_t, 5, 2, 1>(in_channels, in_height, in_width,
out_channels);
}
#endif
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/mul_op.h"
......