提交 25261891 编写于 作者: Z ZhenWang

add int8_t type sgemm_omp

上级 e5e92bed
develop _release/v2.6.2 gh-pages github/fork/AnBaolei1984/baolei/bitmain github/fork/AnBaolei1984/bitmain github/fork/Cambricon/develop github/fork/DannyIsFunny/Android5 github/fork/DannyIsFunny/Update_MemOpt github/fork/DannyIsFunny/fix_pow github/fork/DannyIsFunny/fix_v26_windows github/fork/GaoWei8/reduce_sum github/fork/GaoWei8/reduce_sum_test_con github/fork/LDOUBLEV/ocr github/fork/Leonardo-Ding/dwh_dev github/fork/MaxwellDing/develop github/fork/MyPandaShaoxiang/fpga_patch github/fork/MyPandaShaoxiang/int8 github/fork/MyPandaShaoxiang/nlp_correct github/fork/MyPandaShaoxiang/opencl_valid github/fork/MyPandaShaoxiang/release/v2.3 github/fork/NHZlX/more_jeston_support github/fork/PaddleLite-EB/merge1.4 github/fork/PaddleLite-EB/new_dev github/fork/Wangzheee/matrix_nms_op github/fork/Xreki/step_rnn/opt_ddim_lite github/fork/cathwong/patch-1 github/fork/cclauss/patch-1 github/fork/chenjiaoAngel/cherry_pic github/fork/chenjiaoAngel/conv_dw_5x5 github/fork/chenjiaoAngel/conv_dw_5x5s2 github/fork/edimetia3d/arm_update_elementwise_op github/fork/edimetia3d/host_deformable_conv github/fork/edimetia3d/matrix_nms_host github/fork/edimetia3d/update_pow_op github/fork/edimetia3d/update_yolo_box github/fork/haozech/develop github/fork/haozech/infershape_chz github/fork/haozech/parl-develop github/fork/jackzhang235/develop github/fork/jameswu2014/develop github/fork/jiansowa/jiansowa/img_nna github/fork/jiweibo/stream_manage github/fork/juncaipeng/add_cast github/fork/lijianshe02/lite-x86 github/fork/qili93/update_sup_model_v26 github/fork/qjing666/develop github/fork/qnqinan/develop github/fork/qnqinan/track-develop github/fork/sangoly/python_compa github/fork/smilejames/develop github/fork/sunsetlh/sunsetlh/xpu_multi_test github/fork/wangqunbaidu/develop github/fork/weihaoji/whj_27 github/fork/weihaoji/xpu_res2net_fusion github/fork/weihaoji/xpu_weihaoji_dev github/fork/xiebaiyuan/fix_leak_opencl github/fork/xiebaiyuan/opencl_depthwised1 
github/fork/xiebaiyuan/opencl_softmax github/fork/yanghongtian/yanghongtian/add_ascend310_target_place github/fork/yiicy/computelib github/fork/yongqiangma/bm_card github/fork/yongqiangma/calib github/fork/yongqiangma/copytocpu github/fork/yongqiangma/gpu github/fork/yongqiangma/pass github/fork/yongqiangma/pool github/fork/yongqiangma/priorbox github/fork/yongqiangma/shape github/fork/yongqiangma/split_c github/fork/yongqiangma/trans github/fork/yongqiangma/trans2 github/fork/yongqiangma/workspace github/fork/ysh329/add-cl-kernel-member-for-opencl github/fork/ysh329/add-get-output github/fork/ysh329/cherry-pick-precision-profiler-enhance github/fork/ysh329/fix-opencl-concat github/fork/ysh329/support-int64-copy-from-to-cpu github/fork/zhaoyang-star/enable_prifile_in_tiny_publish github/fork/zhaoyang-star/fix_openc_demo github/fork/zhaoyang-star/patch-1 github/fork/zhupengyang/opt release/v2.0.0 release/v2.0.0-beta1 release/v2.0.0-beta2 release/v2.0.0-rc release/v2.1.0 release/v2.2.0 release/v2.3 release/v2.6 release/v2.6.0 release/v2.7 revert-4368-hongming/test_v26 2.0.0-beta 1.5.0 v2.7-beta v2.6.3-beta2 v2.6.3-beta1 v2.6.2 v2.6.1 v2.6.0 v2.3.0 v2.2.0 v2.1.0 v2.0.0 v2.0.0-rc v2.0.0-beta1 v2.0.0-beta1-prerel release/1.4
5 合并请求!3489pull code,!3210[Opencl] fix opencl bug,!3154[arm]resize nnv12 bug,!3074[opencl]add grid_sampler op,!1334Add pooling int8
......@@ -3147,6 +3147,7 @@ void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
// 32位 float 矩阵乘法
template <>
void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias) {
......@@ -16,6 +16,9 @@ limitations under the License. */
#include <string>
#include "common/log.h"
#include "memory/t_malloc.h"
#ifdef _OPENMP
#include <omp.h>
// Matrix element access macro; assumes the matrix is stored in row-major
// order, so element (i, j) is found at offset i * lda + j, where lda is the
// leading dimension taken from the enclosing scope.
#define A(i, j) A[(i)*lda + (j)]
......@@ -172,11 +175,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
// 32-bit float matrix multiplication (OpenMP multi-threaded version).
// NOTE(review): presumably m, n, k are the problem sizes and lda/ldb/ldc the
// leading dimensions of A, B and C; the roles of alpha, beta, relu and bias
// are not visible from this declaration — confirm against the definition.
void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本)
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
......@@ -228,6 +226,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 8 bits int matrix product
//
// Generic OpenMP entry point: Itype is the element type of the inputs A and
// B, Btype the element type of bias, and Otype the element type of the
// output C.
template <typename Itype, typename Btype, typename Otype>
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, Btype *bias);
// OpenMP overload for int8_t inputs with an int32_t bias; only the output
// element type Otype remains a template parameter.
template <typename Otype>
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
Otype *C, int32_t ldc, bool relu, int32_t *bias);
// Generic entry point without the _omp suffix, taking the same parameter list
// as the generic Sgemm_omp above.
// NOTE(review): presumably the single-threaded counterpart — confirm.
template <typename Itype, typename Btype, typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, Btype *bias);
......@@ -235,10 +241,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, int32_t *bias);
// 8-bit integer matrix product with int8_t inputs A and B, an int32_t output
// C and an int32_t bias (non-template signature).
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
int32_t *C, int32_t ldc, bool relu, int32_t *bias);
// 8 bits int write back
// C = A * B
// NOTE(review): presumably copies the mc x nc block held in c into C, whose
// leading dimension is ldc — confirm against the definition.
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
......@@ -332,6 +334,131 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
// 8 bits int matrix product (m*k x k*n), omp version.
//
// Multiplies the int8_t matrices A and B and writes the result through C,
// whose element type is the template parameter Otype. The larger of the two
// output dimensions (m or n) is split into blocks that are processed by an
// OpenMP parallel loop; the other operand is packed once up front.
//
// Parameters (as used in the visible code):
//   m, n, k      - problem sizes; A is packed as m x k and B as k x n.
//   alpha, beta  - forwarded unchanged to InnerKernel / InnerKernelWithBias.
//   A, lda       - input matrix A and its leading dimension.
//   B, ldb       - input matrix B and its leading dimension.
//   C, ldc       - output matrix and its leading dimension.
//   relu         - forwarded unchanged to the inner kernels.
//   bias         - optional; nullptr selects InnerKernel, any other value
//                  selects InnerKernelWithBias.
//
// NOTE(review): this excerpt appears to be missing lines (for example the
// #else/#endif belonging to the preprocessor conditionals, the bodies of the
// `== 0` branches and several closing braces) — consult the full file before
// relying on the exact control flow shown here.
template <typename Otype>
void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
float beta, Otype *C, int32_t ldc, bool relu,
int32_t *bias) {
// With OpenMP, size the per-thread scratch buffers for the maximum number of
// threads; the second declaration (value 1) is presumably the non-OpenMP
// fallback whose #else/#endif is not visible here — confirm.
#ifdef _OPENMP
int32_t max_threads = omp_get_max_threads();
int32_t max_threads = 1;
// Per-thread budget in bytes: 64 KiB divided by the thread count (integer
// division happens before the multiplication by 1024).
// NOTE(review): the name suggests an L1 cache size — confirm.
int32_t L1 = 64 / max_threads * 1024;
// k rounded up to the next multiple of 16.
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
// Zero-filled buffer of k int8_t elements.
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
if (m > n) {
// Partition A into blocks
// Largest row count whose KC-wide int8_t block fits in the L1 budget.
MC = L1 / (KC * sizeof(int8_t));
if (MC == 0) {
} else {
// Spread the m rows evenly over the required number of blocks ...
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
// ... and round the block height up to a multiple of MR_INT8.
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
// Pad B: round n up to a multiple of NR_INT8
NC = (n + NR_INT8 - 1) / NR_INT8 * NR_INT8;
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__
// TODO()
// Pack the whole of B once; every row block of A reuses it.
PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8);
// One MC x KC packing buffer for A per thread.
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
} else {
// Partition B into blocks
// Largest column count whose KC-deep int8_t block fits in the L1 budget.
NC = L1 / (KC * sizeof(int8_t));
if (NC == 0) {
} else {
// Spread the n columns evenly over the required number of blocks ...
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
// ... and round the block width up to a multiple of NR_INT8.
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
// Pad A: round m up to a multiple of MR_INT8
MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__
// TODO()
// Pack the whole of A once; every column block of B reuses it.
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8);
// One KC x NC packing buffer for B per thread.
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
// One MC x NC int32_t accumulator buffer per thread.
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
if (m > n) {
// Each iteration handles one block of up to MC rows of A.
#pragma omp parallel for
for (int32_t i = 0; i < m; i += MC) {
// Thread index used to select this thread's slice of the scratch buffers;
// the second declaration (value 0) is presumably the non-OpenMP fallback.
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
int32_t local_threads = 0;
// Rows handled in this iteration (at most MC, assuming s_min returns the
// smaller of its arguments).
int32_t mc;
mc = s_min(m - i, MC);
int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
// Pack the current row block of A into this thread's buffer.
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A);
if (bias == nullptr) {
InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu);
} else {
// bias is advanced by i so that it lines up with the first row of the block.
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu, bias + i);
} else {
// Each iteration handles one block of up to NC columns of B.
#pragma omp parallel for
for (int32_t j = 0; j < n; j += NC) {
// Thread index used to select this thread's slice of the scratch buffers;
// the second declaration (value 0) is presumably the non-OpenMP fallback.
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
int32_t local_threads = 0;
// Columns handled in this iteration (at most NC, assuming s_min returns the
// smaller of its arguments).
int32_t nc;
nc = s_min(n - j, NC);
int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
// Pack the current column block of B into this thread's buffer.
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B);
if (bias == nullptr) {
InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu);
} else {
// All m rows are processed here, so bias is passed without an offset.
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu, bias);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -27,130 +27,6 @@ namespace paddle_mobile {
namespace operators {
namespace math {
// 8 bits int matrix product (m*k x k*n)
//
// Non-template variant that multiplies the int8_t matrices A and B and writes
// int32_t results through C. The larger of the two output dimensions (m or n)
// is split into blocks processed by an OpenMP parallel loop; the other
// operand is packed once up front. In the visible code only the
// bias == nullptr path calls a kernel (InnerKernel); the InnerKernelWithBias
// calls are commented out.
//
// NOTE(review): this excerpt appears to be missing lines (for example the
// #else/#endif belonging to the preprocessor conditionals, the bodies of the
// `== 0` branches and several closing braces) — consult the full file before
// relying on the exact control flow shown here.
void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
float beta, int32_t *C, int32_t ldc, bool relu,
int32_t *bias) {
// With OpenMP, size the per-thread scratch buffers for the maximum number of
// threads; the second declaration (value 1) is presumably the non-OpenMP
// fallback whose #else/#endif is not visible here — confirm.
#ifdef _OPENMP
int32_t max_threads = omp_get_max_threads();
int32_t max_threads = 1;
// Per-thread budget in bytes: 64 KiB divided by the thread count (integer
// division happens before the multiplication by 1024).
// NOTE(review): the name suggests an L1 cache size — confirm.
int32_t L1 = 64 / max_threads * 1024;
// k rounded up to the next multiple of 16.
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
// Zero-filled buffer of k int8_t elements.
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
if (m > n) {
// Partition A into blocks
// Largest row count whose KC-wide int8_t block fits in the L1 budget.
MC = L1 / (KC * sizeof(int8_t));
if (MC == 0) {
} else {
// Spread the m rows evenly over the required number of blocks ...
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
// ... and round the block height up to a multiple of MR_INT8.
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
// Pad B: round n up to a multiple of NR_INT8
NC = (n + NR_INT8 - 1) / NR_INT8 * NR_INT8;
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__
// Pack the whole of B once; every row block of A reuses it.
PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8);
// One MC x KC packing buffer for A per thread.
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
} else {
// Partition B into blocks
// Largest column count whose KC-deep int8_t block fits in the L1 budget.
NC = L1 / (KC * sizeof(int8_t));
if (NC == 0) {
} else {
// Spread the n columns evenly over the required number of blocks ...
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
// ... and round the block width up to a multiple of NR_INT8.
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
// Pad A: round m up to a multiple of MR_INT8
MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__
// Pack the whole of A once; every column block of B reuses it.
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8);
// One KC x NC packing buffer for B per thread.
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
// One MC x NC int32_t accumulator buffer per thread.
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
if (m > n) {
// Each iteration handles one block of up to MC rows of A.
#pragma omp parallel for
for (int32_t i = 0; i < m; i += MC) {
// Thread index used to select this thread's slice of the scratch buffers;
// the second declaration (value 0) is presumably the non-OpenMP fallback.
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
int32_t local_threads = 0;
// Rows handled in this iteration (at most MC, assuming s_min returns the
// smaller of its arguments).
int32_t mc;
mc = s_min(m - i, MC);
int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// Pack the current row block of A into this thread's buffer.
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A);
// InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta,
// local_C,
// &C(i, 0), ldc, relu, bias + i);
if (bias == nullptr) {
InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu);
} else {
// Each iteration handles one block of up to NC columns of B.
#pragma omp parallel for
for (int32_t j = 0; j < n; j += NC) {
// Thread index used to select this thread's slice of the scratch buffers;
// the second declaration (value 0) is presumably the non-OpenMP fallback.
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
int32_t local_threads = 0;
// Columns handled in this iteration (at most NC, assuming s_min returns the
// smaller of its arguments).
int32_t nc;
nc = s_min(n - j, NC);
int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// Pack the current column block of B into this thread's buffer.
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B);
// InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta,
// local_C,
// &C(0, j), ldc, relu, bias);
if (bias == nullptr) {
InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu);
void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
......@@ -54,9 +54,8 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
#ifdef _OPENMP
if (bias != nullptr) {
// TODO(wzzju):gemm.Sgemm_omp_with_bias, now use single thread instead.
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias);
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias);
} else {
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
......@@ -73,10 +72,9 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
} else {
#ifdef _OPENMP
if (bias != nullptr) {
// TODO(wzzju):gemm.Sgemm_omp_with_bias, now use single thread instead.
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int8_t>(),
N, relu, bias);
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias);
} else {
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
......@@ -201,9 +201,8 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
paddle_mobile::operators::math::Gemm gemm;
#ifdef _OPENMP
// TODO(wzzju):gemm.Sgemm_omp_with_bias, now use single thread instead.
gemm.Sgemm(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias);
gemm.Sgemm_omp(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias);
gemm.Sgemm(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias);
......@@ -95,7 +95,7 @@ int TestMulOP() {
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile::TestMulOP<int8_t, int32_t>();
paddle_mobile::TestMulOP<float, float>();
return 0;
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册
客服 返回