提交 24da5639 编写于 作者: Z Zhen Wang 提交者: GitHub

Merge pull request #1144 from wzzju/add-int8-gemm

Add int8 gemm
......@@ -32,7 +32,7 @@ template <typename Dtype>
vector<string> OperatorBase<Dtype>::GetInputKeys() const {
auto it = op_input_output_key.find(type_);
if (it == op_input_output_key.end()) {
DLOG << type_ << " has no outputs";
DLOG << type_ << " has no inputs";
return {};
}
return it->second.first;
......
......@@ -338,10 +338,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
for (int i = 0; i < tensor.numel(); i += stride) {
if (tensor.type() == typeid(float)) {
printer << tensor.data<float>()[i] << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) {
printer << tensor.data<int8_t>()[i] << " ";
printer << static_cast<int32_t>(tensor.data<int8_t>()[i]) << " ";
}
}
#endif
......
......@@ -31,6 +31,8 @@ void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const {
param.Out()->set_lod(param.InputX()->lod());
}
template class MulKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -58,7 +58,7 @@ void MulCompute(const MulParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *out = param.Out();
out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
......@@ -71,15 +71,21 @@ void MulCompute(const MulParam<CPU> &param) {
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0));
if (param.InputX()->type() == typeid(int8_t)) {
out->mutable_data<int32_t>();
math::matmul<int8_t>(x_matrix, false, y_matrix, false,
static_cast<int8_t>(1), out, static_cast<int8_t>(0));
} else {
out->mutable_data<float>();
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0));
}
if (out_dim.size() != 2) {
out->Resize(out_dim);
}
}
template class MulKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -3662,7 +3662,7 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
b_ptr = b;
int kc1 = k / 8;
int kc2 = k % 8;
int step = 4 * ldc;
int step = sizeof(float) * ldc;
asm volatile(
"pld [%[a_ptr]] \n\t"
"pld [%[a_ptr], #64] \n\t"
......@@ -3866,11 +3866,10 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc2] "r"(kc2), [step] "r"(step)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else
#endif // __ARM_NEON
}
......
......@@ -96,6 +96,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
/*
// 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
......@@ -139,6 +140,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1);
/*
// 向量矩阵乘法结果回写
// C = A * B
......@@ -185,15 +187,63 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
// 8 bits function cluster begins
// 8 bits int small block inner product
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
// 8 bits int inner product
void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
const int8_t *a, const int8_t *b, int8_t beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int8_t *bias);
// 8 bits int pack function
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
// 8 bits int matrix product
void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
int32_t ldc, bool relu, int8_t *bias);
// 8 bits int write back
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
// C = A * B + C
void WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias
void WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias, relu(C)
void WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
private:
int MC = 0;
int KC = 0;
int NC = 0;
// 32位 float
float *packedA;
float *packedB;
float *packedC;
float *zero;
// 8 bits int
int8_t *packedA_int8;
int8_t *packedB_int8;
int32_t *packedC_int8;
int8_t *zero_int8;
};
} // namespace math
......
此差异已折叠。
......@@ -135,7 +135,7 @@ template <typename T>
struct ClearTensor<CPU, T> {
void operator()(framework::Tensor *tensor) {
auto size = tensor->numel();
auto *tensor_data = tensor->data<float>();
auto *tensor_data = tensor->data<T>();
memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT
}
};
......@@ -151,9 +151,9 @@ struct RowwiseAdd<CPU, T> {
PADDLE_MOBILE_ENFORCE((output->dims() == in_dims),
"output->dims() must be equal to in_dims.");
auto *input_data = input.data<float>();
auto *out_data = output->data<float>();
auto *vec_data = vector.data<float>();
auto *input_data = input.data<T>();
auto *out_data = output->data<T>();
auto *vec_data = vector.data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) {
out_data[i * size + j] = input_data[i * size + j] + vec_data[j];
......
......@@ -25,7 +25,7 @@ template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
float *bias = nullptr);
T *bias = nullptr);
template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstring>
#include <string>
#include "operators/math/gemm.h"
#include "operators/math/math_function.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <>
void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
int8_t alpha, framework::Tensor *matrix_out, int8_t beta,
bool relu, int8_t *bias) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_MOBILE_ENFORCE(
dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
int32_t M = dim_out[0];
int32_t N = dim_out[1];
int32_t K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemm gemm;
if (trans_a) {
int32_t numel = matrix_a.numel();
int32_t m = matrix_a.dims()[0];
int32_t n = matrix_a.dims()[1];
int8_t *tmp = (int8_t *)(matrix_a.data<int8_t>()); // NOLINT
int8_t *a = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * numel));
int32_t index = 0;
for (int32_t j = 0; j < n; j++) {
for (int32_t i = 0; i < m; i++) {
a[index++] = tmp[i * n + j];
}
}
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
} else {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
relu, bias);
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -266,6 +266,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
target_link_libraries(test-gemm-accuracy paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm-int8-accuracy common/test_gemm_int8_accuracy.cpp)
target_link_libraries(test-gemm-int8-accuracy paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp)
target_link_libraries(test-gemm-perf paddle-mobile)
......
......@@ -84,7 +84,7 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
}
paddle_mobile::operators::math::Gemm gemm;
gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
gemm.SgemmWithBn(m, n, k, 1, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
nullptr);
int eq = 0;
int neq = 0;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
using std::default_random_engine;
using std::uniform_int_distribution;
void print_matirx(int m, int n, int ldc, int32_t *c) {
for (int i = 0; i < m; ++i) {
std::cout << c(i, 0);
for (int j = 1; j < n; ++j) {
std::cout << " | " << c(i, j);
}
std::cout << std::endl;
}
std::cout << std::endl;
}
void print_matirx(int m, int n, int ldc, int8_t *c) {
for (int i = 0; i < m; ++i) {
std::cout << static_cast<int32_t>(c(i, 0));
for (int j = 1; j < n; ++j) {
std::cout << " | " << static_cast<int32_t>(c(i, j));
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int do_sgemm(int m, int n, int k, bool relu, int pr) {
int lda = k;
int ldb = n;
int ldc = n;
default_random_engine e;
uniform_int_distribution<int8_t> pixel(-127, 127);
int8_t *a = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * k));
int8_t *b = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * k * n));
int32_t *c = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * m * n));
int32_t *c1 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * m * n));
for (int i = 0; i < m * k; ++i) {
a[i] = pixel(e);
}
for (int i = 0; i < k * n; ++i) {
b[i] = pixel(e);
}
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
c1(i, j) = r;
}
}
paddle_mobile::operators::math::Gemm gemm;
gemm.Sgemm(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
static_cast<int8_t>(0), c, ldc, relu, nullptr);
int eq = 0;
int neq = 0;
for (int i = 0; i < m * n; ++i) {
if (c[i] == c1[i]) {
++eq;
} else {
++neq;
}
}
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
paddle_mobile::memory::Free(c1);
return 0;
}
int main() {
do_sgemm(9, 9, 9, false, 10);
do_sgemm(10, 6, 12, false, 0);
do_sgemm(512, 256, 384, false, 0);
do_sgemm(1366, 768, 256, false, 0);
do_sgemm(1255, 755, 333, false, 0);
do_sgemm(555, 777, 999, false, 0);
do_sgemm(1024, 1024, 1024, false, 0);
return 0;
}
......@@ -28,13 +28,11 @@ limitations under the License. */
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
Tensor aa, bb, cc, scale, bias;
paddle_mobile.SetThreadNum(1);
Tensor aa, bb, cc;
auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n});
auto ccptr = cc.mutable_data<float>({m, n});
auto scaleptr = scale.mutable_data<float>({m});
auto biasptr = bias.mutable_data<float>({m});
for (int i = 0; i < m * k; ++i) {
aaptr[i] = 2;
......@@ -45,23 +43,55 @@ int main() {
for (int i = 0; i < m * n; ++i) {
ccptr[i] = 2;
}
for (int i = 0; i < m; ++i) {
scaleptr[i] = 1;
biasptr[i] = 0;
Tensor aa_int8, bb_int8, cc_int8;
auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
auto ccptr_int8 = cc_int8.mutable_data<int32_t>({m, n});
for (int i = 0; i < m * k; ++i) {
aaptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < k * n; ++i) {
bbptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < m * n; ++i) {
ccptr_int8[i] = static_cast<int32_t>(2);
}
auto time1 = time();
// float
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, biasptr);
false, nullptr);
}
// paddle_mobile::operators::math::matmulWithBn<float>(
// aa, false, bb, false, static_cast<float>(1), &cc,
// static_cast<float>(0), true, &scale, &bias, 0);
auto time1 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
}
auto time2 = time();
std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
// int8_t
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
}
auto time3 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
}
auto time4 = time();
std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
return 0;
}
......@@ -12,80 +12,89 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint-gcc.h>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/mul_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::MulOp<paddle_mobile::CPU, float>>
executor(program, "mul");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {3, 2, 1, 1}, 0, 1);
input_tensors.push_back(input1);
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {2, 3}, 0, 1);
input_tensors.push_back(input2);
// 2. input_names
vector<string> input_names({
"pool2d_0.tmp_0",
"fc_0.w_0",
});
// 3. output_names
vector<string> output_names({"fc_0.tmp_0"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({3, 3});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
auto dim_1 = input1.numel() / input1.dims()[0];
DLOG << " input1 : ";
for (int i = 0; i < input1.dims()[0]; ++i) {
for (int j = 0; j < dim_1; ++j) {
DLOGF("%f ", input1_data[i * dim_1 + j]);
}
DLOGF("\n");
}
auto dim_2 = input2.numel() / input2.dims()[0];
DLOG << " input2 : ";
for (int i = 0; i < input2.dims()[0]; ++i) {
for (int j = 0; j < dim_2; ++j) {
DLOGF("%f ", input2_data[i * dim_2 + j]);
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
namespace paddle_mobile {
using framework::AttributeMap;
using framework::DDim;
using framework::Scope;
using framework::make_ddim;
template <typename I, typename O>
int TestMulOP() {
int32_t m = 1024;
int32_t n = 1024;
int32_t k = 1024;
int32_t lda = k;
int32_t ldb = n;
int32_t ldc = n;
DDim inputA_shape = make_ddim({m, k});
DDim inputB_shape = make_ddim({k, n});
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<Scope>();
inputs["X"] = std::vector<std::string>({"inputA"});
inputs["Y"] = std::vector<std::string>({"inputB"});
outputs["Out"] = std::vector<std::string>({"output"});
auto inputA_var = scope.get()->Var("inputA");
auto inputA = inputA_var->template GetMutable<framework::LoDTensor>();
SetupTensor<I>(inputA, inputA_shape, -127, 127);
auto inputB_var = scope.get()->Var("inputB");
auto inputB = inputB_var->template GetMutable<framework::LoDTensor>();
SetupTensor<I>(inputB, inputB_shape, -127, 127);
auto output_var = scope.get()->Var("output");
AttributeMap attrs;
attrs["x_num_col_dims"].Set<int>(1);
attrs["y_num_col_dims"].Set<int>(1);
auto *op =
new operators::MulOp<CPU, float>("mul", inputs, outputs, attrs, scope);
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
const O *output_data = output->data<O>();
// compare
O *c = static_cast<O *>(memory::Alloc(sizeof(O) * m * n));
I *a = inputA->data<I>();
I *b = inputB->data<I>();
for (int32_t i = 0; i < m; ++i) {
for (int32_t j = 0; j < n; ++j) {
O r = 0;
for (int32_t p = 0; p < k; p++) {
r += static_cast<O>(a(i, p)) * static_cast<O>(b(p, j));
}
c(i, j) = r;
}
DLOGF("\n");
}
auto dim_output0 = output[0]->numel() / output[0]->dims()[0];
DLOG << " output : ";
for (int i = 0; i < output[0]->dims()[0]; ++i) {
for (int j = 0; j < dim_output0; ++j) {
DLOGF("%f ", output0_data[i * dim_2 + j]);
int32_t eq = 0;
int32_t neq = 0;
for (int32_t i = 0; i < m * n; ++i) {
PADDLE_MOBILE_ENFORCE(
output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
if (static_cast<int>(output_data[i] == c[i])) {
++eq;
} else {
++neq;
}
DLOGF("\n");
}
DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq
<< " neq=" << neq;
delete op;
return 0;
}
} // namespace paddle_mobile
/// output (3,3)
DLOG << "output memory size : " << output[0]->memory_size();
DLOG << "output numel : " << output[0]->numel();
DLOG << input1_data[0] << " x " << input2_data[0] << " + " << input1_data[1]
<< " x " << input2_data[0 + 3] << " = " << output0_data[0];
int main() {
paddle_mobile::TestMulOP<int8_t, int32_t>();
paddle_mobile::TestMulOP<float, float>();
return 0;
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册