Commit 74a309cb authored by StarryRain, committed by Yanzhan Yang

add CPU_ARCH info, improve the performance of GEMM1*1s1 (#1751)

Parent 497bf326
......@@ -145,6 +145,18 @@ struct PaddleMobileConfigInternal {
std::string model_obfuscate_key = "";
};
enum ARMArch {
APPLE = 0,
A53 = 53,
A55 = 55,
A57 = 57,
A72 = 72,
A73 = 73,
A75 = 75,
A76 = 76,
ARM_UNKOWN = -1
};
extern const char *G_OP_TYPE_CONV;
extern const char *G_OP_TYPE_BATCHNORM;
extern const char *G_OP_TYPE_BOX_CODER;
......
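The new ARMArch enum records which Cortex-A core family the big cluster uses, so the GEMM micro-kernels added later in this patch can tune their block sizes per core. A minimal, hypothetical sketch of consuming the enum (the describe_arch helper is not part of this commit):

// Hypothetical helper: map the detected ARMArch to a printable name, e.g. for logging.
inline const char *describe_arch(ARMArch arch) {
  switch (arch) {
    case APPLE: return "Apple";
    case A53: return "Cortex-A53";
    case A55: return "Cortex-A55";
    case A57: return "Cortex-A57";
    case A72: return "Cortex-A72";
    case A73: return "Cortex-A73";
    case A75: return "Cortex-A75";
    case A76: return "Cortex-A76";
    default:  return "unknown";  // ARM_UNKOWN (spelling as declared above)
  }
}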
......@@ -261,7 +261,8 @@ int set_sched_affinity(const std::vector<int> &cpu_ids) {
return 0;
}
int get_cpu_info_by_name(int *cpu_num, std::vector<int> *big_core_ids,
int get_cpu_info_by_name(int *cpu_num, ARMArch *arch,
std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids,
std::vector<int> *l1_cache_sizes,
std::vector<int> *l2_cache_sizes,
......@@ -270,6 +271,7 @@ int get_cpu_info_by_name(int *cpu_num, std::vector<int> *big_core_ids,
/* Snapdragon */
if (hardware_name.find("SDM845") != std::string::npos) { // 845
*cpu_num = 8;
*arch = A75;
*big_core_ids = {4, 5, 6, 7};
*little_core_ids = {0, 1, 2, 3};
l1_cache_sizes->resize(*cpu_num);
......@@ -282,6 +284,7 @@ int get_cpu_info_by_name(int *cpu_num, std::vector<int> *big_core_ids,
return 0;
} else if (hardware_name.find("SDM710") != std::string::npos) { // 710
*cpu_num = 8;
*arch = A75;
*big_core_ids = {6, 7};
*little_core_ids = {0, 1, 2, 3, 4, 5};
l1_cache_sizes->resize(*cpu_num);
......@@ -295,6 +298,7 @@ int get_cpu_info_by_name(int *cpu_num, std::vector<int> *big_core_ids,
return 0;
} else if (hardware_name.find("MSM8998") != std::string::npos) { // 835
*cpu_num = 8;
*arch = A73;
*big_core_ids = {4, 5, 6, 7};
*little_core_ids = {0, 1, 2, 3};
l1_cache_sizes->resize(*cpu_num);
......@@ -313,8 +317,9 @@ int get_cpu_info_by_name(int *cpu_num, std::vector<int> *big_core_ids,
return 0;
} else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653
*cpu_num = 8;
*big_core_ids = {0, 1, 2, 3, 4, 5, 6, 7};
*little_core_ids = {};
*arch = A72;
*big_core_ids = {4, 5, 6, 7};
*little_core_ids = {0, 1, 2, 3};
l1_cache_sizes->resize(*cpu_num);
l2_cache_sizes->resize(*cpu_num);
l3_cache_sizes->resize(*cpu_num);
......@@ -322,6 +327,42 @@ int get_cpu_info_by_name(int *cpu_num, std::vector<int> *big_core_ids,
fill_cpu_cache_size(l2_cache_sizes, 1024 * 1024);
fill_cpu_cache_size(l3_cache_sizes, 0);
return 0;
} else if (hardware_name.find("SDM660") != std::string::npos ||
hardware_name.find("SDM636") != std::string::npos) { // 660, 636
*cpu_num = 8;
*arch = A73;
*big_core_ids = {4, 5, 6, 7};
*little_core_ids = {0, 1, 2, 3};
l1_cache_sizes->resize(*cpu_num);
l2_cache_sizes->resize(*cpu_num);
l3_cache_sizes->resize(*cpu_num);
fill_cpu_cache_size(l1_cache_sizes, 64 * 1024);
fill_cpu_cache_size(l2_cache_sizes, 1024 * 1024);
fill_cpu_cache_size(l3_cache_sizes, 0);
return 0;
/* MediaTek */
} else if (hardware_name.find("MT6799") != std::string::npos) { // X30
*cpu_num = 10;
*arch = A73;
*big_core_ids = {8, 9};
*little_core_ids = {0, 1, 2, 3, 4, 5, 6, 7};
return 0;
} else if (hardware_name.find("MT6771") != std::string::npos) { // P60
*cpu_num = 8;
*arch = A73;
*big_core_ids = {4, 5, 6, 7};
*little_core_ids = {0, 1, 2, 3};
return 0;
/* Kirin */
} else if (hardware_name.find("KIRIN970") !=
std::string::npos) { // Kirin 970
*cpu_num = 8;
*arch = A73;
*big_core_ids = {4, 5, 6, 7};
*little_core_ids = {0, 1, 2, 3};
return 0;
}
return -1;
}
......@@ -410,7 +451,7 @@ CPUContext::CPUContext() {
// probe cpu info, and set big&litte clusters, L1, L2 and L3 cache sizes
std::string cpu_name = get_cpu_name();
bool failed =
get_cpu_info_by_name(&_cpu_num, &_big_core_ids, &_little_core_ids,
get_cpu_info_by_name(&_cpu_num, &_arch, &_big_core_ids, &_little_core_ids,
&_l1_cache_sizes, &_l2_cache_sizes, &_l3_cache_sizes,
cpu_name) != 0;
if (failed) {
......
......@@ -43,12 +43,14 @@ struct CPUContext {
int get_thread_num();
PowerMode get_power_mode() const { return _power_mode; }
int get_cache_size(int level);
ARMArch get_arch() const { return _arch; }
int get_l1_cache_size() { return get_cache_size(1); }
int get_l2_cache_size() { return get_cache_size(2); }
int get_l3_cache_size() { return get_cache_size(3); }
void* get_work_space(int size_in_byte);
int _cpu_num;
ARMArch _arch;
PowerMode _power_mode;
std::vector<int> _big_core_ids;
std::vector<int> _little_core_ids;
......
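CPUContext now caches the detected architecture in _arch and exposes it through get_arch(). The conv kernels below read it from the singleton context and hand it to the GEMM helpers; a condensed sketch of that pattern (current_hblock is an illustrative wrapper, not part of the commit):

#include "framework/context.h"
#include "operators/math/gemm/gemm1x1s1.h"

namespace paddle_mobile {
namespace operators {
// Read the detected core type once and let get_hblock() pick the GEMM M-block,
// mirroring what conv_common.cpp and conv_arm_func.cpp do further down.
inline int current_hblock() {
  ARMArch arch = framework::CPUContext::Context()->get_arch();
  return math::get_hblock(arch);
}
}  // namespace operators
}  // namespace paddle_mobile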
......@@ -126,6 +126,9 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
case ConvParam<CPU>::EXEC_GEMM1x1s1_FLOAT:
GemmConv1x1s1<float, float>(param);
break;
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
SlidingwindowConv3x3<float, float>(param);
......
......@@ -44,6 +44,9 @@ void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
case ConvParam<CPU>::EXEC_GEMM1x1s1_FLOAT:
GemmConv1x1s1<float, float>(param);
break;
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
SlidingwindowConv3x3<float, float>(param);
......
......@@ -45,6 +45,9 @@ void ConvAddReluKernel<CPU, float>::Compute(
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
case ConvParam<CPU>::EXEC_GEMM1x1s1_FLOAT:
GemmConv1x1s1<float, float>(param);
break;
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
SlidingwindowConv3x3<float, float>(param);
......
......@@ -64,6 +64,9 @@ void ConvBNAddReluKernel<CPU, float>::Compute(
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
case ConvParam<CPU>::EXEC_GEMM1x1s1_FLOAT:
GemmConv1x1s1<float, float>(param);
break;
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
SlidingwindowConv3x3<float, float>(param);
......
......@@ -77,6 +77,9 @@ void ConvBNReluKernel<CPU, float>::Compute(
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
case ConvParam<CPU>::EXEC_GEMM1x1s1_FLOAT:
GemmConv1x1s1<float, float>(param);
break;
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
SlidingwindowConv3x3<float, float>(param);
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/arm/convolution/conv_common.h"
#include "framework/context.h"
#include "operators/math/gemm/gemm1x1s1.h"
#include "operators/math/slidingwindow_utils.h"
#include "operators/math/winograd/winograd_transform.h"
......@@ -20,6 +22,8 @@ namespace paddle_mobile {
namespace operators {
void InitBaseConvKernel(ConvParam<CPU> *param) {
bool conv1x1 = param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Filter()->dims()[2] == 1;
bool conv3x3 = param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Filter()->dims()[2] == 3;
bool conv5x5 = param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
......@@ -83,6 +87,22 @@ void InitBaseConvKernel(ConvParam<CPU> *param) {
math::slidingwindow_transform_weight<float>(*param->Filter(),
param->transformed_filter_);
param->ExecMode() = ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT;
} else if (conv1x1 && param->Groups() == 1 &&
param->Paddings()[0] == param->Paddings()[1] &&
param->Paddings()[0] == 0 && param->Input()->dims()[1] > 1 &&
param->Strides()[0] == param->Strides()[1] &&
param->Dilations()[0] == param->Dilations()[1] &&
param->Strides()[0] == 1 && param->Dilations()[0] == 1 &&
param->Output()->dims()[2] * param->Output()->dims()[3] > 1) {
// transform weight
Variable *transformed_var = param->GetScope()->Var();
ARMArch arch = framework::CPUContext::Context()->get_arch();
param->transformed_filter_ =
transformed_var->GetMutable<framework::LoDTensor>();
math::gemm1x1s1_transform_weight(*param->Filter(), *param->Output(),
param->transformed_filter_,
param->groups, arch);
param->ExecMode() = ConvParam<CPU>::EXEC_GEMM1x1s1_FLOAT;
} else {
param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_FLOAT;
}
......
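The new branch is only taken for a genuine pointwise convolution: 1x1 kernel, a single group, zero padding, unit stride and dilation, more than one input channel, and an output map larger than one pixel. A standalone restatement of that predicate, with an illustrative helper name:

// Mirrors the condition guarding EXEC_GEMM1x1s1_FLOAT in InitBaseConvKernel above.
static bool UseGemm1x1s1(ConvParam<CPU> *param, bool conv1x1) {  // hypothetical helper
  return conv1x1 && param->Groups() == 1 &&
         param->Paddings()[0] == param->Paddings()[1] && param->Paddings()[0] == 0 &&
         param->Strides()[0] == param->Strides()[1] && param->Strides()[0] == 1 &&
         param->Dilations()[0] == param->Dilations()[1] && param->Dilations()[0] == 1 &&
         param->Input()->dims()[1] > 1 &&
         param->Output()->dims()[2] * param->Output()->dims()[3] > 1;
}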
......@@ -54,6 +54,9 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
case ConvParam<CPU>::EXEC_GEMM1x1s1_FLOAT:
GemmConv1x1s1<float, float>(param);
break;
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
SlidingwindowConv3x3<float, float>(param);
......
......@@ -45,6 +45,9 @@ void ConvReluKernel<CPU, float>::Compute(
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
case ConvParam<CPU>::EXEC_GEMM1x1s1_FLOAT:
GemmConv1x1s1<float, float>(param);
break;
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
case ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT:
SlidingwindowConv3x3<float, float>(param);
......
......@@ -76,6 +76,9 @@ void DWConvBNReluKernel<CPU, float>::Compute(
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
case ConvParam<CPU>::EXEC_GEMM1x1s1_FLOAT:
GemmConv1x1s1<float, float>(param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
......
......@@ -14,9 +14,11 @@ limitations under the License. */
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include <vector>
#include "framework/context.h"
#include "operators/math/depthwise/faster_depthwise_conv3x3.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/depthwise_conv5x5.h"
#include "operators/math/gemm/gemm1x1s1.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/pad.h"
......@@ -137,6 +139,61 @@ void GemmConv(const ConvParam<CPU> &param) {
}
}
template <typename Itype, typename Otype>
void GemmConv1x1s1(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.transformed_filter_;
Tensor *output = param.Output();
output->mutable_data<Otype>();
const float *din = input->data<Itype>();
float *dout = output->mutable_data<Otype>();
const int num = input->dims()[0];
const int chin = input->dims()[1];
const int hin = input->dims()[2];
const int win = input->dims()[3];
const int chout = output->dims()[1];
const int hout = output->dims()[2];
const int wout = output->dims()[3];
const float *weights = filter.mutable_data<float>();
const float *bias = nullptr;
int channel_size_out = wout * hout;
int channel_size_in = win * hin;
const int group = param.Groups();
const int m = chout / group;
const int n = hout * wout;
const int k = chin / group;
bool flag_relu = false;
bool flag_bias = false;
ARMArch arch = framework::CPUContext::Context()->get_arch();
int hblock = math::get_hblock(arch);
int m_roundup = hblock * ((m + hblock - 1) / hblock);
int weights_size_per_group = m * k;
if (n > 1) {
weights_size_per_group = ((m_roundup * k + 15) / 16) * 16;
}
for (int b = 0; b < num; ++b) {
// dC
for (int g = 0; g < group; ++g) {
float *dout_group =
static_cast<float *>(dout) + (b * chout + g * m) * channel_size_out;
const float *din_group = static_cast<const float *>(din) +
(b * chin + g * k) * channel_size_in;
const float *weights_group =
static_cast<const float *>(weights) + g * weights_size_per_group;
const float *bias_group = static_cast<const float *>(bias) + g * m;
if (n > 1) {
math::sgemm_prepack(weights_group, din_group, bias_group, dout_group, m,
n, k, flag_bias, flag_relu, false, arch);
}
}
}
}
template <int tile, int kernel>
void WinogradConv3x3(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
......@@ -293,6 +350,7 @@ void SlidingwindowConv3x3(const ConvParam<CPU> &param) {
}
template void GemmConv<float, float>(const ConvParam<CPU> &param);
template void GemmConv1x1s1<float, float>(const ConvParam<CPU> &param);
template void WinogradConv3x3<8, 3>(const ConvParam<CPU> &param);
template void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param);
template void DepthwiseConv5x5<float, float>(const ConvParam<CPU> &param);
......
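GemmConv1x1s1 treats each 1x1 convolution as an M x N x K GEMM (M = output channels per group, N = output pixels, K = input channels per group), rounds M up to the architecture-dependent hblock, and pads the per-group packed weight block to a 16-float boundary. A small worked example of that arithmetic; the concrete numbers (hblock = 6, m = 10, k = 3) are illustrative, not taken from the patch:

#include <cstdio>

int main() {
  const int hblock = 6;    // e.g. get_hblock(arch) on armv7 for a non-A73 core
  const int m = 10;        // chout / group
  const int k = 3;         // chin / group
  const int m_roundup = hblock * ((m + hblock - 1) / hblock);  // 12: m padded up to a multiple of hblock
  const int packed = ((m_roundup * k + 15) / 16) * 16;         // 48: per-group packed floats, 16-aligned (12*3 = 36 -> 48)
  std::printf("m_roundup=%d, packed floats per group=%d\n", m_roundup, packed);
  return 0;
}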
......@@ -32,6 +32,9 @@ bool IsExpand(const std::vector<int64_t> &filter_dim,
template <typename Itype, typename Otype>
void GemmConv(const ConvParam<CPU> &param);
template <typename Itype, typename Otype>
void GemmConv1x1s1(const ConvParam<CPU> &param);
template <int tile, int kernel>
void WinogradConv3x3(const ConvParam<CPU> &param);
......
This diff is collapsed.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
#ifdef __aarch64__
const int MBLOCK = 8;
const int NBLOCK = 12;
const int KBLOCK = 4;
inline int get_hblock(ARMArch arch) { return MBLOCK; }
#else
const int MBLOCK_A73 = 4;
const int MBLOCK_OTH = 6;
const int NBLOCK = 8;
const int KBLOCK = 4;
inline int get_hblock(ARMArch arch) {
if (arch == A73) {
return MBLOCK_A73;
} else {
return MBLOCK_OTH;
}
}
#endif // __aarch64__
void gemm1x1s1_transform_weight(const framework::Tensor& weight,
const framework::Tensor& output,
framework::Tensor* trans_weight,
const int group, ARMArch arch);
void sgemm_prepack(const float* A_packed, const float* B, const float* bias,
float* C, int M, int N, int K, bool is_bias, bool is_relu,
bool is_transB, ARMArch arch);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif // CONV_OP
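Taken together, the header exposes a two-step API: pack the 1x1 filter once with gemm1x1s1_transform_weight at kernel-init time, then call sgemm_prepack on every forward pass. A hedged usage sketch (the wrapper function, tensor arguments, and m/n/k values are placeholders; only the two math:: calls are from this commit):

namespace paddle_mobile {
namespace operators {

// Hypothetical wrapper showing the call order used by this commit:
// InitBaseConvKernel packs the weights, GemmConv1x1s1 runs the prepacked SGEMM.
void RunPointwiseConv(const framework::Tensor &weight,
                      const framework::Tensor &output,
                      framework::Tensor *packed_weight,
                      const float *din, float *dout,
                      int m, int n, int k, ARMArch arch) {
  math::gemm1x1s1_transform_weight(weight, output, packed_weight,
                                   /*group=*/1, arch);            // once, at init
  math::sgemm_prepack(packed_weight->mutable_data<float>(), din,
                      /*bias=*/nullptr, dout, m, n, k,
                      /*is_bias=*/false, /*is_relu=*/false,
                      /*is_transB=*/false, arch);                 // every inference
}

}  // namespace operators
}  // namespace paddle_mobile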
......@@ -467,6 +467,7 @@ class ConvParam : public OpParam {
EXEC_SLIDINGWINDOW3x3_FLOAT,
EXEC_SLIDINGWINDOW5x5_FLOAT,
EXEC_SLIDINGWINDOW7x7_FLOAT,
EXEC_GEMM1x1s1_FLOAT,
};
ExecMode &ExecMode() const { return exec_mode_; }
......