提交 24da5639 编写于 作者: Z Zhen Wang 提交者: GitHub

Merge pull request #1144 from wzzju/add-int8-gemm

Add int8 gemm
...@@ -32,7 +32,7 @@ template <typename Dtype> ...@@ -32,7 +32,7 @@ template <typename Dtype>
vector<string> OperatorBase<Dtype>::GetInputKeys() const { vector<string> OperatorBase<Dtype>::GetInputKeys() const {
auto it = op_input_output_key.find(type_); auto it = op_input_output_key.find(type_);
if (it == op_input_output_key.end()) { if (it == op_input_output_key.end()) {
DLOG << type_ << " has no outputs"; DLOG << type_ << " has no inputs";
return {}; return {};
} }
return it->second.first; return it->second.first;
......
...@@ -338,10 +338,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) { ...@@ -338,10 +338,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
for (int i = 0; i < tensor.numel(); i += stride) { for (int i = 0; i < tensor.numel(); i += stride) {
if (tensor.type() == typeid(float)) { if (tensor.type() == typeid(float)) {
printer << tensor.data<float>()[i] << " "; printer << tensor.data<float>()[i] << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} else if (tensor.type() == typeid(int64_t)) { } else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " "; printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) { } else if (tensor.type() == typeid(int8_t)) {
printer << tensor.data<int8_t>()[i] << " "; printer << static_cast<int32_t>(tensor.data<int8_t>()[i]) << " ";
} }
} }
#endif #endif
......
...@@ -31,6 +31,8 @@ void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const { ...@@ -31,6 +31,8 @@ void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const {
param.Out()->set_lod(param.InputX()->lod()); param.Out()->set_lod(param.InputX()->lod());
} }
template class MulKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -58,7 +58,7 @@ void MulCompute(const MulParam<CPU> &param) { ...@@ -58,7 +58,7 @@ void MulCompute(const MulParam<CPU> &param) {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY(); const Tensor *input_y = param.InputY();
Tensor *out = param.Out(); Tensor *out = param.Out();
out->mutable_data<float>();
const Tensor x_matrix = const Tensor x_matrix =
input_x->dims().size() > 2 input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
...@@ -71,15 +71,21 @@ void MulCompute(const MulParam<CPU> &param) { ...@@ -71,15 +71,21 @@ void MulCompute(const MulParam<CPU> &param) {
if (out_dim.size() != 2) { if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
} }
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1), if (param.InputX()->type() == typeid(int8_t)) {
out, static_cast<float>(0)); out->mutable_data<int32_t>();
math::matmul<int8_t>(x_matrix, false, y_matrix, false,
static_cast<int8_t>(1), out, static_cast<int8_t>(0));
} else {
out->mutable_data<float>();
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0));
}
if (out_dim.size() != 2) { if (out_dim.size() != 2) {
out->Resize(out_dim); out->Resize(out_dim);
} }
} }
template class MulKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -3662,7 +3662,7 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -3662,7 +3662,7 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
b_ptr = b; b_ptr = b;
int kc1 = k / 8; int kc1 = k / 8;
int kc2 = k % 8; int kc2 = k % 8;
int step = 4 * ldc; int step = sizeof(float) * ldc;
asm volatile( asm volatile(
"pld [%[a_ptr]] \n\t" "pld [%[a_ptr]] \n\t"
"pld [%[a_ptr], #64] \n\t" "pld [%[a_ptr], #64] \n\t"
...@@ -3866,11 +3866,10 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -3866,11 +3866,10 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
: :
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc2] "r"(kc2), [step] "r"(step) [kc2] "r"(kc2), [step] "r"(step)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__ #endif // __aarch64__
#else
#endif // __ARM_NEON #endif // __ARM_NEON
} }
......
...@@ -96,6 +96,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -96,6 +96,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
/* /*
// 向量矩阵乘法 (M = 1) // 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
...@@ -139,6 +140,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -139,6 +140,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *new_scale, float *new_bias); float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1); float *new_scale, float *new_bias, float *bias1);
/* /*
// 向量矩阵乘法结果回写 // 向量矩阵乘法结果回写
// C = A * B // C = A * B
...@@ -185,15 +187,63 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -185,15 +187,63 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
// ---- 8-bit integer routines ----
// Every function in this group takes int8_t operands and produces int32_t
// results, as the parameter types below show.

// Small-block inner product over int8 panels `a` and `b`, written to the
// int32 block `c` whose leading dimension is `ldc`.
// NOTE(review): the name suggests a 6x8 output tile - confirm against the
// definition.
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
// Inner product over an mc x nc block of int8 data. `c` and `C` are both
// int32 buffers.
// NOTE(review): by analogy with the write-back helpers below, `c` is
// presumably the packed scratch block and `C` the caller's output - confirm.
void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
const int8_t *a, const int8_t *b, int8_t beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int8_t *bias);
// Packing routines: copy int8 matrix A (leading dimension lda) or B (leading
// dimension ldb) into the caller-supplied int8 `buffer`.
// NOTE(review): the `_6r` / `_8c` suffixes presumably mean 6-row and
// 8-column panels - confirm against the definitions.
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
// int8 matrix product: A and B are int8 with leading dimensions lda / ldb,
// and the result C is int32 with leading dimension ldc. Overloads the float
// Sgemm by argument type.
void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
int32_t ldc, bool relu, int8_t *bias);
// Write-back helpers: each takes an mc x nc int32 block `c` and the int32
// destination `C` with leading dimension ldc. The formula above each
// declaration is the documented intent of that variant.
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
// C = A * B + C
void WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias (note that bias is int8_t here, not int32_t)
void WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias, relu(C) (bias is int8_t)
void WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
private: private:
int MC = 0; int MC = 0;
int KC = 0; int KC = 0;
int NC = 0; int NC = 0;
// 32-bit float
float *packedA; float *packedA;
float *packedB; float *packedB;
float *packedC; float *packedC;
float *zero; float *zero;
// 8 bits int
int8_t *packedA_int8;
int8_t *packedB_int8;
int32_t *packedC_int8;
int8_t *zero_int8;
}; };
} // namespace math } // namespace math
......
此差异已折叠。
...@@ -135,7 +135,7 @@ template <typename T> ...@@ -135,7 +135,7 @@ template <typename T>
struct ClearTensor<CPU, T> { struct ClearTensor<CPU, T> {
void operator()(framework::Tensor *tensor) { void operator()(framework::Tensor *tensor) {
auto size = tensor->numel(); auto size = tensor->numel();
auto *tensor_data = tensor->data<float>(); auto *tensor_data = tensor->data<T>();
memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT
} }
}; };
...@@ -151,9 +151,9 @@ struct RowwiseAdd<CPU, T> { ...@@ -151,9 +151,9 @@ struct RowwiseAdd<CPU, T> {
PADDLE_MOBILE_ENFORCE((output->dims() == in_dims), PADDLE_MOBILE_ENFORCE((output->dims() == in_dims),
"output->dims() must be equal to in_dims."); "output->dims() must be equal to in_dims.");
auto *input_data = input.data<float>(); auto *input_data = input.data<T>();
auto *out_data = output->data<float>(); auto *out_data = output->data<T>();
auto *vec_data = vector.data<float>(); auto *vec_data = vector.data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
out_data[i * size + j] = input_data[i * size + j] + vec_data[j]; out_data[i * size + j] = input_data[i * size + j] + vec_data[j];
......
...@@ -25,7 +25,7 @@ template <typename T> ...@@ -25,7 +25,7 @@ template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false, framework::Tensor *matrix_out, T beta, bool relu = false,
float *bias = nullptr); T *bias = nullptr);
template <typename T> template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstring>
#include <string>
#include "operators/math/gemm.h"
#include "operators/math/math_function.h"
namespace paddle_mobile {
namespace operators {
namespace math {
/**
 * int8 specialization of matmul: multiplies two int8 matrices through
 * Gemm::Sgemm and stores the product in an int32 output tensor.
 *
 * @param matrix_a   left operand, read as int8_t; must be 2-D.
 * @param trans_a    when true, matrix_a is transposed into a temporary buffer
 *                   before the multiplication.
 * @param matrix_b   right operand, read as int8_t; must be 2-D.
 * @param trans_b    part of the generic matmul signature; this specialization
 *                   does not consult it.
 * @param alpha      forwarded unchanged to Gemm::Sgemm.
 * @param matrix_out destination, accessed through data<int32_t>(); must be
 *                   2-D. Its dims supply M and N. The caller is expected to
 *                   have allocated it as int32_t beforehand.
 * @param beta       forwarded unchanged to Gemm::Sgemm.
 * @param relu       forwarded unchanged to Gemm::Sgemm.
 * @param bias       forwarded unchanged to Gemm::Sgemm.
 */
template <>
void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b,
                    int8_t alpha, framework::Tensor *matrix_out, int8_t beta,
                    bool relu, int8_t *bias) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
  PADDLE_MOBILE_ENFORCE(
      dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
      "The input and output of matmul be matrix");

  int32_t M = dim_out[0];
  int32_t N = dim_out[1];
  int32_t K = (!trans_a) ? dim_a[1] : dim_a[0];
  Gemm gemm;

  if (!trans_a) {
    gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
               matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
               relu, bias);
    return;
  }

  // Build a row-major copy of A^T: the stored matrix is rows x cols, the copy
  // is cols x rows, so its leading dimension is rows (== K).
  int32_t numel = matrix_a.numel();
  int32_t rows = matrix_a.dims()[0];
  int32_t cols = matrix_a.dims()[1];
  const int8_t *src = matrix_a.data<int8_t>();
  int8_t *transposed = static_cast<int8_t *>(
      paddle_mobile::memory::Alloc(sizeof(int8_t) * numel));
  int32_t index = 0;
  for (int32_t j = 0; j < cols; j++) {
    for (int32_t i = 0; i < rows; i++) {
      transposed[index++] = src[i * cols + j];
    }
  }
  gemm.Sgemm(M, N, K, alpha, transposed, K, matrix_b.data<int8_t>(), N, beta,
             matrix_out->data<int32_t>(), N, relu, bias);
  // The transposed copy is only needed for the call above; release it so the
  // trans_a path does not leak one buffer per invocation.
  paddle_mobile::memory::Free(transposed);
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
...@@ -266,6 +266,10 @@ if (NOT FOUND_MATCH) ...@@ -266,6 +266,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp) ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
target_link_libraries(test-gemm-accuracy paddle-mobile) target_link_libraries(test-gemm-accuracy paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm-int8-accuracy common/test_gemm_int8_accuracy.cpp)
target_link_libraries(test-gemm-int8-accuracy paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp) ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp)
target_link_libraries(test-gemm-perf paddle-mobile) target_link_libraries(test-gemm-perf paddle-mobile)
......
...@@ -84,7 +84,7 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { ...@@ -84,7 +84,7 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
} }
paddle_mobile::operators::math::Gemm gemm; paddle_mobile::operators::math::Gemm gemm;
gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, gemm.SgemmWithBn(m, n, k, 1, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
nullptr); nullptr);
int eq = 0; int eq = 0;
int neq = 0; int neq = 0;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
using std::default_random_engine;
using std::uniform_int_distribution;
/**
 * Prints an m x n matrix of int32 values to std::cout: one row per line,
 * columns separated by " | ", followed by one blank line.
 *
 * @param m   number of rows to print.
 * @param n   number of columns to print. When n <= 0 each row is printed as
 *            an empty line and no element is read.
 * @param ldc leading dimension: element (i, j) is read from c[i * ldc + j].
 * @param c   pointer to the matrix data.
 */
void print_matirx(int m, int n, int ldc, int32_t *c) {
  for (int i = 0; i < m; ++i) {
    const int32_t *row = c + i * ldc;
    for (int j = 0; j < n; ++j) {
      // Separator goes between columns only, never before the first one.
      if (j > 0) {
        std::cout << " | ";
      }
      std::cout << row[j];
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
/**
 * Prints an m x n matrix of int8 values to std::cout: one row per line,
 * columns separated by " | ", followed by one blank line. Each element is
 * widened to int32_t so it is printed as a number rather than a character.
 *
 * @param m   number of rows to print.
 * @param n   number of columns to print. When n <= 0 each row is printed as
 *            an empty line and no element is read.
 * @param ldc leading dimension: element (i, j) is read from c[i * ldc + j].
 * @param c   pointer to the matrix data.
 */
void print_matirx(int m, int n, int ldc, int8_t *c) {
  for (int i = 0; i < m; ++i) {
    const int8_t *row = c + i * ldc;
    for (int j = 0; j < n; ++j) {
      // Separator goes between columns only, never before the first one.
      if (j > 0) {
        std::cout << " | ";
      }
      std::cout << static_cast<int32_t>(row[j]);
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
int do_sgemm(int m, int n, int k, bool relu, int pr) {
int lda = k;
int ldb = n;
int ldc = n;
default_random_engine e;
uniform_int_distribution<int8_t> pixel(-127, 127);
int8_t *a = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * k));
int8_t *b = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * k * n));
int32_t *c = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * m * n));
int32_t *c1 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * m * n));
for (int i = 0; i < m * k; ++i) {
a[i] = pixel(e);
}
for (int i = 0; i < k * n; ++i) {
b[i] = pixel(e);
}
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
c1(i, j) = r;
}
}
paddle_mobile::operators::math::Gemm gemm;
gemm.Sgemm(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
static_cast<int8_t>(0), c, ldc, relu, nullptr);
int eq = 0;
int neq = 0;
for (int i = 0; i < m * n; ++i) {
if (c[i] == c1[i]) {
++eq;
} else {
++neq;
}
}
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
paddle_mobile::memory::Free(c1);
return 0;
}
// Entry point of the int8 GEMM accuracy test: runs do_sgemm over a fixed
// list of problem sizes, always with relu disabled. Only the first (smallest)
// case asks do_sgemm to print its matrices.
int main() {
  struct Shape {
    int rows;
    int cols;
    int depth;
    int print_level;
  };
  const Shape shapes[] = {
      {9, 9, 9, 10},        {10, 6, 12, 0},      {512, 256, 384, 0},
      {1366, 768, 256, 0},  {1255, 755, 333, 0}, {555, 777, 999, 0},
      {1024, 1024, 1024, 0},
  };
  for (const Shape &shape : shapes) {
    do_sgemm(shape.rows, shape.cols, shape.depth, false, shape.print_level);
  }
  return 0;
}
...@@ -28,13 +28,11 @@ limitations under the License. */ ...@@ -28,13 +28,11 @@ limitations under the License. */
int main() { int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4); paddle_mobile.SetThreadNum(1);
Tensor aa, bb, cc, scale, bias; Tensor aa, bb, cc;
auto aaptr = aa.mutable_data<float>({m, k}); auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n}); auto bbptr = bb.mutable_data<float>({k, n});
auto ccptr = cc.mutable_data<float>({m, n}); auto ccptr = cc.mutable_data<float>({m, n});
auto scaleptr = scale.mutable_data<float>({m});
auto biasptr = bias.mutable_data<float>({m});
for (int i = 0; i < m * k; ++i) { for (int i = 0; i < m * k; ++i) {
aaptr[i] = 2; aaptr[i] = 2;
...@@ -45,23 +43,55 @@ int main() { ...@@ -45,23 +43,55 @@ int main() {
for (int i = 0; i < m * n; ++i) { for (int i = 0; i < m * n; ++i) {
ccptr[i] = 2; ccptr[i] = 2;
} }
for (int i = 0; i < m; ++i) {
scaleptr[i] = 1; Tensor aa_int8, bb_int8, cc_int8;
biasptr[i] = 0; auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
auto ccptr_int8 = cc_int8.mutable_data<int32_t>({m, n});
for (int i = 0; i < m * k; ++i) {
aaptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < k * n; ++i) {
bbptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < m * n; ++i) {
ccptr_int8[i] = static_cast<int32_t>(2);
} }
auto time1 = time(); // float
// warm-up 10 times
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>( paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0), aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, biasptr); false, nullptr);
}
// paddle_mobile::operators::math::matmulWithBn<float>( auto time1 = time();
// aa, false, bb, false, static_cast<float>(1), &cc, for (int j = 0; j < 10; ++j) {
// static_cast<float>(0), true, &scale, &bias, 0); paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
} }
auto time2 = time(); auto time2 = time();
std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n"; std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
// int8_t
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
}
auto time3 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
}
auto time4 = time();
std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
return 0; return 0;
} }
...@@ -12,80 +12,89 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,80 +12,89 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <stdint-gcc.h>
#include "../test_helper.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/mul_op.h" #include "operators/mul_op.h"
int main() { #define a(i, j) a[(i)*lda + (j)]
paddle_mobile::Loader<paddle_mobile::CPU> loader; #define b(i, j) b[(i)*ldb + (j)]
auto program = loader.Load(g_resnet); #define c(i, j) c[(i)*ldc + (j)]
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail"); namespace paddle_mobile {
using framework::AttributeMap;
Executor4Test<paddle_mobile::CPU, using framework::DDim;
paddle_mobile::operators::MulOp<paddle_mobile::CPU, float>> using framework::Scope;
executor(program, "mul"); using framework::make_ddim;
template <typename I, typename O>
// 1. input_tensors; int TestMulOP() {
vector<Tensor> input_tensors; int32_t m = 1024;
int32_t n = 1024;
Tensor input1; int32_t k = 1024;
auto input1_data = CreateInput<float>(&input1, {3, 2, 1, 1}, 0, 1); int32_t lda = k;
input_tensors.push_back(input1); int32_t ldb = n;
Tensor input2; int32_t ldc = n;
auto input2_data = CreateInput<float>(&input2, {2, 3}, 0, 1); DDim inputA_shape = make_ddim({m, k});
input_tensors.push_back(input2); DDim inputB_shape = make_ddim({k, n});
VariableNameMap inputs;
// 2. input_names VariableNameMap outputs;
vector<string> input_names({ auto scope = std::make_shared<Scope>();
"pool2d_0.tmp_0", inputs["X"] = std::vector<std::string>({"inputA"});
"fc_0.w_0", inputs["Y"] = std::vector<std::string>({"inputB"});
}); outputs["Out"] = std::vector<std::string>({"output"});
// 3. output_names auto inputA_var = scope.get()->Var("inputA");
vector<string> output_names({"fc_0.tmp_0"}); auto inputA = inputA_var->template GetMutable<framework::LoDTensor>();
SetupTensor<I>(inputA, inputA_shape, -127, 127);
// 4. out_dims; auto inputB_var = scope.get()->Var("inputB");
vector<DDim> out_ddims; auto inputB = inputB_var->template GetMutable<framework::LoDTensor>();
auto out_ddim = paddle_mobile::framework::make_ddim({3, 3}); SetupTensor<I>(inputB, inputB_shape, -127, 127);
out_ddims.push_back(out_ddim);
auto output_var = scope.get()->Var("output");
auto output = executor.Predict<LoDTensor>(input_tensors, input_names, AttributeMap attrs;
output_names, out_ddims); attrs["x_num_col_dims"].Set<int>(1);
attrs["y_num_col_dims"].Set<int>(1);
auto output0_data = output[0]->data<float>(); auto *op =
new operators::MulOp<CPU, float>("mul", inputs, outputs, attrs, scope);
auto dim_1 = input1.numel() / input1.dims()[0]; op->InferShape();
DLOG << " input1 : "; op->Run();
for (int i = 0; i < input1.dims()[0]; ++i) { auto output = output_var->template Get<framework::LoDTensor>();
for (int j = 0; j < dim_1; ++j) { const O *output_data = output->data<O>();
DLOGF("%f ", input1_data[i * dim_1 + j]); // compare
} O *c = static_cast<O *>(memory::Alloc(sizeof(O) * m * n));
DLOGF("\n"); I *a = inputA->data<I>();
} I *b = inputB->data<I>();
for (int32_t i = 0; i < m; ++i) {
auto dim_2 = input2.numel() / input2.dims()[0]; for (int32_t j = 0; j < n; ++j) {
DLOG << " input2 : "; O r = 0;
for (int i = 0; i < input2.dims()[0]; ++i) { for (int32_t p = 0; p < k; p++) {
for (int j = 0; j < dim_2; ++j) { r += static_cast<O>(a(i, p)) * static_cast<O>(b(p, j));
DLOGF("%f ", input2_data[i * dim_2 + j]); }
c(i, j) = r;
} }
DLOGF("\n");
} }
auto dim_output0 = output[0]->numel() / output[0]->dims()[0]; int32_t eq = 0;
DLOG << " output : "; int32_t neq = 0;
for (int i = 0; i < output[0]->dims()[0]; ++i) { for (int32_t i = 0; i < m * n; ++i) {
for (int j = 0; j < dim_output0; ++j) { PADDLE_MOBILE_ENFORCE(
DLOGF("%f ", output0_data[i * dim_2 + j]); output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
if (static_cast<int>(output_data[i] == c[i])) {
++eq;
} else {
++neq;
} }
DLOGF("\n");
} }
DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq
<< " neq=" << neq;
delete op;
return 0;
}
} // namespace paddle_mobile
/// output (3,3) int main() {
DLOG << "output memory size : " << output[0]->memory_size(); paddle_mobile::TestMulOP<int8_t, int32_t>();
DLOG << "output numel : " << output[0]->numel(); paddle_mobile::TestMulOP<float, float>();
DLOG << input1_data[0] << " x " << input2_data[0] << " + " << input1_data[1]
<< " x " << input2_data[0 + 3] << " = " << output0_data[0];
return 0; return 0;
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册