diff --git a/mace/core/BUILD b/mace/core/BUILD
index f084ef6248b2daf4768ba30b07e404620b87b785..218fd1bd2354a9936d243764c9cdb113389ba817 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -23,7 +23,7 @@ cc_library(
 
 # Main program for tests
 cc_library(
-    name = "test_main",
+    name = "test_benchmark_main",
     testonly = 1,
     srcs = glob([
         "testing/*.cc",
diff --git a/mace/core/common.h b/mace/core/common.h
index ae295d3d0aacdabaf60503d9686c7b1a0344b6cc..eda7e5c4f9933ff56a107e8087e141ae13933253 100644
--- a/mace/core/common.h
+++ b/mace/core/common.h
@@ -33,4 +33,6 @@ private: \
 
 #define MACE_NOT_IMPLEMENTED REQUIRE(false, "not implemented")
 
+#define kCostPerGroup 8192
+
 #endif  // MACE_CORE_COMMON_H_
diff --git a/mace/core/testing/test_main.cc b/mace/core/testing/test_benchmark_main.cc
similarity index 100%
rename from mace/core/testing/test_main.cc
rename to mace/core/testing/test_benchmark_main.cc
diff --git a/mace/examples/BUILD b/mace/examples/BUILD
index bd591c29e39aa93e11ccc50feffdea61efa6c469..4f4a7794e0cb00f7d8312299dd7572afd74e68d6 100644
--- a/mace/examples/BUILD
+++ b/mace/examples/BUILD
@@ -28,6 +28,6 @@ cc_test(
     linkstatic = 1,
     deps = [
         "//mace/core",
-        "//mace/core:test_main",
+        "//mace/core:test_benchmark_main",
    ],
 )
diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index ca215d5adf03941c1475b1c1a931c0bbfd58ff8d..de8293e35421ac29031db5281d162de3999efb78 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -17,9 +17,41 @@ cc_library(
     deps = [
         "//mace/core:core",
     ],
-    copts = ['-std=c++11'] + if_android([
-        "-mfpu=neon",  # TODO recheck the flags
-        "-mfloat-abi=hard",
-    ]),
+    copts = ['-std=c++11'],
 )
 
+cc_test(
+    name = "kernel_test",
+    srcs = glob(["test/*.cc"]),
+    deps = [
+        "@gtest//:gtest_main",
+        ":kernels",
+        "//mace/core:core",
+    ],
+    copts = ['-std=c++11'],
+    linkopts = ["-fopenmp"] + if_android([
+        "-pie",
+        "-llog",
+        "-lm",
+    ]),
+    linkstatic = 1,
+    testonly = 1,
+)
+
+cc_test(
+    name = "benchmark",
+    srcs = glob(["benchmark/*.cc"]),
+    deps = [
+        ":kernels",
+        "//mace/core:core",
+        "//mace/core:test_benchmark_main",
+    ],
+    copts = ['-std=c++11'],
+    linkopts = ["-fopenmp"] + if_android([
+        "-pie",
+        "-llog",
+        "-lm",
+    ]),
+    linkstatic = 1,
+    testonly = 1,
+)
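The kCostPerGroup constant added to common.h is the target number of element-operations per parallel group in the NEON kernels added below. A minimal sketch of the partitioning arithmetic those kernels use, with hypothetical size and input-count values:

    // Sketch of the kCostPerGroup partitioning (hypothetical values, not MACE API).
    int64 size = 100000;                 // elements per input tensor
    int64 n = 3;                         // number of input tensors
    int64 cost = size * n;               // total element-operations
    int64 groups = cost > kCostPerGroup ? cost / kCostPerGroup : 1;
    int64 element_per_group = size / groups;  // contiguous span handled per group

An object-like macro works here, but a typed constant (e.g. const int64 kCostPerGroup = 8192;) would match the k-prefixed naming convention more idiomatically.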
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
new file mode 100644
index 0000000000000000000000000000000000000000..70a0d584e080c63ce5b8cce3623cae844496e111
--- /dev/null
+++ b/mace/kernels/addn.h
@@ -0,0 +1,36 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_ADDN_H_
+#define MACE_KERNELS_ADDN_H_
+
+#include "mace/core/tensor.h"
+
+namespace mace {
+namespace kernels {
+
+template <typename T>
+void AddNFuntion(const vector<const Tensor *> &input_tensor, Tensor *output_tensor) {
+  int n = input_tensor.size();
+  CHECK(n > 1);
+  CHECK_NOTNULL(input_tensor[0]);
+  int64 size = input_tensor[0]->size();
+  vector<const float *> inputs(n);
+  for (int i = 0; i < n; ++i) {
+    inputs[i] = input_tensor[i]->data<float>();
+  }
+  output_tensor->ResizeLike(input_tensor[0]);
+  float *output = output_tensor->mutable_data<float>();
+
+  for (int i = 0; i < n; ++i) {
+    for (int64 j = 0; j < size; ++j) {
+      output[j] += inputs[i][j];
+    }
+  }
+}
+
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_ADDN_H_
\ No newline at end of file
diff --git a/mace/kernels/benchmark/addn_benchmark.cc b/mace/kernels/benchmark/addn_benchmark.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f63fed77b11847f3aacca8291c333699b0bd840a
--- /dev/null
+++ b/mace/kernels/benchmark/addn_benchmark.cc
@@ -0,0 +1,55 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/core/tensor.h"
+#include "mace/kernels/addn.h"
+#include "mace/kernels/neon/addn_neon.h"
+
+using namespace mace;
+using namespace mace::kernels;
+
+static void AddNBenchmark(int iters, int n, int type) {
+  const int64 tot = static_cast<int64>(iters) * n * 3;
+  mace::testing::ItemsProcessed(tot);
+  mace::testing::BytesProcessed(tot * (sizeof(float)));
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::normal_distribution<float> nd(0, 1);
+
+  Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
+  input_tensor1.Resize({n});
+  Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
+  input_tensor2.Resize({n});
+  Tensor input_tensor3(cpu_allocator(), DataType::DT_FLOAT);
+  input_tensor3.Resize({n});
+  vector<const Tensor *> input_tensors {&input_tensor1,
+                                        &input_tensor2,
+                                        &input_tensor3};
+  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
+  output_tensor.ResizeLike(input_tensor1);
+  float *input1 = input_tensor1.mutable_data<float>();
+  float *input2 = input_tensor2.mutable_data<float>();
+  float *input3 = input_tensor3.mutable_data<float>();
+  float *output = output_tensor.mutable_data<float>();
+
+  for (int64 i = 0; i < n; ++i) {
+    input1[i] = nd(gen);
+    input2[i] = nd(gen);
+    input3[i] = nd(gen);
+  }
+
+  if (type == DeviceType::CPU) {
+    AddNFuntion<float>(input_tensors, &output_tensor);
+  } else if (type == DeviceType::NEON) {
+    NeonAddNFuntion_float(input_tensors, &output_tensor);
+  }
+}
+
+static const int kBenchmarkSize = 10000000;
+
+BENCHMARK(AddNBenchmark)
+    ->ArgPair(kBenchmarkSize, DeviceType::CPU)
+    ->ArgPair(kBenchmarkSize, DeviceType::NEON);
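One caveat worth noting in AddNFuntion above: it accumulates with output[j] += ... but never clears the output buffer, so correctness rests on the allocator returning zero-filled memory. A zero-initializing variant of the accumulation loop could look like this (a sketch against the same Tensor interface, not part of the diff):

    // Sketch: AddN accumulation with an explicit clearing pass first.
    float *output = output_tensor->mutable_data<float>();
    std::fill(output, output + size, 0.0f);  // needs <algorithm>
    for (int i = 0; i < n; ++i) {
      for (int64 j = 0; j < size; ++j) {
        output[j] += inputs[i][j];  // now starts from a known zero baseline
      }
    }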
diff --git a/mace/kernels/benchmark/relu_benchmark.cc b/mace/kernels/benchmark/relu_benchmark.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9276cadc737bba60a0fac81893dd5aa797d3f6a9
--- /dev/null
+++ b/mace/kernels/benchmark/relu_benchmark.cc
@@ -0,0 +1,43 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/core/tensor.h"
+#include "mace/kernels/relu.h"
+#include "mace/kernels/neon/relu_neon.h"
+
+using namespace mace;
+using namespace mace::kernels;
+
+static void ReluBenchmark(int iters, int n, int type) {
+  const int64 tot = static_cast<int64>(iters) * n;
+  mace::testing::ItemsProcessed(tot);
+  mace::testing::BytesProcessed(tot * (sizeof(float)));
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::normal_distribution<float> nd(0, 1);
+
+  Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
+  input_tensor.Resize({n});
+  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
+  output_tensor.ResizeLike(input_tensor);
+  float *input = input_tensor.mutable_data<float>();
+  float *output = output_tensor.mutable_data<float>();
+  for (int64 i = 0; i < n; ++i) {
+    input[i] = nd(gen);
+  }
+
+  if (type == DeviceType::CPU) {
+    ReluFuntion<float>(&input_tensor, &output_tensor);
+  } else if (type == DeviceType::NEON) {
+    NeonReluFuntion_float(&input_tensor, &output_tensor);
+  }
+}
+
+static const int kBenchmarkSize = 10000000;
+
+BENCHMARK(ReluBenchmark)
+    ->ArgPair(kBenchmarkSize, DeviceType::CPU)
+    ->ArgPair(kBenchmarkSize, DeviceType::NEON);
diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d04464f4f0d659c9376f8ecd72d1f991e5dd27b7
--- /dev/null
+++ b/mace/kernels/neon/addn_neon.cc
@@ -0,0 +1,59 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include <arm_neon.h>
+#include "mace/kernels/neon/addn_neon.h"
+#include "mace/core/common.h"
+
+namespace mace {
+namespace kernels {
+
+void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
+                           Tensor *output_tensor) {
+  int n = input_tensor.size();
+  CHECK(n > 1);
+  CHECK_NOTNULL(input_tensor[0]);
+  int64 size = input_tensor[0]->size();
+  output_tensor->ResizeLike(input_tensor[0]);
+  float *output = output_tensor->mutable_data<float>();
+  vector<const float *> inputs(n);
+  for (int i = 0; i < n; ++i) {
+    inputs[i] = input_tensor[i]->data<float>();
+  }
+
+  int64 cost = size * n;
+  int64 groups = 1;
+  if (cost > kCostPerGroup) {
+    groups = cost / kCostPerGroup;
+  }
+  int64 element_per_group = size / groups;
+
+#pragma omp parallel for num_threads(1)  // no significant performance improvement
+  for (int64 i = 0; i < size; i += element_per_group) {
+    int64 count = std::min(element_per_group, size - i);
+    int nn = count >> 2;
+    int remain = count - (nn << 2);
+    for (int64 j = 0; j < n; ++j) {
+      const float *inptr = inputs[j] + i;
+      float *outptr = output + i;
+      for (int k = 0; k < nn; ++k) {
+        float32x4_t _inptr = vld1q_f32(inptr);
+        float32x4_t _outptr = vld1q_f32(outptr);
+        _outptr = vaddq_f32(_outptr, _inptr);
+        vst1q_f32(outptr, _outptr);
+
+        inptr += 4;
+        outptr += 4;
+      }
+      for (int k = 0; k < remain; ++k) {
+        *outptr += *inptr;
+        ++inptr;
+        ++outptr;
+      }
+    }
+  }
+}
+
+}  // namespace kernels
+}  // namespace mace
\ No newline at end of file
diff --git a/mace/kernels/neon/addn_neon.h b/mace/kernels/neon/addn_neon.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa987d635ad5f7ed47c98d0a2dee9ec223e6b9bd
--- /dev/null
+++ b/mace/kernels/neon/addn_neon.h
@@ -0,0 +1,19 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_ADDN_NEON_H_
+#define MACE_KERNELS_ADDN_NEON_H_
+
+#include "mace/core/tensor.h"
+
+namespace mace {
+namespace kernels {
+
+void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
+                           Tensor *output_tensor);
+
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_ADDN_NEON_H_
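The NEON kernel above splits each group of count elements into nn = count >> 2 full four-lane vectors handled by vld1q_f32/vaddq_f32/vst1q_f32, plus remain = count - (nn << 2) scalar tail elements. For reference, one pass of the inner group loop is equivalent to this scalar code (a sketch reusing the kernel's own variables):

    // Scalar equivalent of one group pass in NeonAddNFuntion_float.
    for (int64 k = 0; k < count; ++k) {
      output[i + k] += inputs[j][i + k];
    }

The #pragma omp parallel for is pinned to num_threads(1) per the author's comment; the kCostPerGroup blocking exists so the groups can later be spread across threads.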
diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc
index 845422179e9866f8ba14df1e111f2f497fb747da..29c4e354a783f945f25e25ce9de75b879776f737 100644
--- a/mace/kernels/neon/relu_neon.cc
+++ b/mace/kernels/neon/relu_neon.cc
@@ -6,23 +6,36 @@
 #include "mace/kernels/neon/relu_neon.h"
 
 namespace mace {
-namespace kernels{
+namespace kernels {
 
 void NeonReluFuntion_float(const Tensor *input_tensor,
                            Tensor *output_tensor) {
   int64 size = input_tensor->size();
   output_tensor->ResizeLike(input_tensor);
-  const float* input = input_tensor->data<float>();
-  float* output = output_tensor->mutable_data<float>();
+  const float *input = input_tensor->data<float>();
+  float *output = output_tensor->mutable_data<float>();
 
-  float32x4_t _zero = vdupq_n_f32(0.f);
-  for (; size > 0; size--) {
-    float32x4_t _inp = vld1q_f32(input);
-    float32x4_t _outp = vmaxq_f32(_inp, _zero);
-    vst1q_f32(output, _outp);
+#pragma omp parallel for num_threads(1)  // no significant performance improvement
+  for (int64 i = 0; i < size; i += kCostPerGroup) {
+    int64 count = std::min(static_cast<int64>(kCostPerGroup), size - i);
+    int nn = count >> 2;
+    int remain = count - (nn << 2);
+    const float *inptr = input + i;
+    float *outptr = output + i;
+    float32x4_t _zero = vdupq_n_f32(0.f);
+    for (; nn > 0; --nn) {
+      float32x4_t _inptr = vld1q_f32(inptr);
+      float32x4_t _outptr = vmaxq_f32(_inptr, _zero);
+      vst1q_f32(outptr, _outptr);
 
-    input += 4;
-    output += 4;
+      inptr += 4;
+      outptr += 4;
+    }
+    for (; remain > 0; --remain) {
+      *outptr = std::max(*inptr, 0.f);
+      ++inptr;
+      ++outptr;
+    }
   }
 }
diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h
index cc613f1dc867b5cf14d9d51d830506c87b39a93e..e2400e97fd2602378313a20d2b037252ec98ccb7 100644
--- a/mace/kernels/relu.h
+++ b/mace/kernels/relu.h
@@ -14,8 +14,8 @@ template <typename T>
 void ReluFuntion(const Tensor *input_tensor, Tensor *output_tensor) {
   int64 size = input_tensor->size();
   output_tensor->ResizeLike(input_tensor);
-  const float* input = input_tensor->data<float>();
-  float* output = output_tensor->mutable_data<float>();
+  const float *input = input_tensor->data<float>();
+  float *output = output_tensor->mutable_data<float>();
 
   for (int64 i = 0; i < size; ++i) {
     output[i] = std::max(input[i], static_cast<T>(0));
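The reworked NeonReluFuntion_float walks the input in kCostPerGroup-sized blocks (so the OpenMP pragma can distribute blocks once num_threads(1) is lifted) and applies vmaxq_f32 four lanes at a time with a scalar tail; this also fixes the old loop, which decremented size once per iteration while consuming four floats. Each block computes, in scalar terms (a sketch reusing the function's own variables):

    // Scalar meaning of one kCostPerGroup block in NeonReluFuntion_float.
    for (int64 k = i; k < i + count; ++k) {
      output[k] = std::max(input[k], 0.f);
    }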
diff --git a/mace/kernels/test/addn_neon_test.cc b/mace/kernels/test/addn_neon_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d1ca924b9b3ef8fecf96301007afa593cd54600
--- /dev/null
+++ b/mace/kernels/test/addn_neon_test.cc
@@ -0,0 +1,55 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+#include <random>
+#include <cmath>
+#include "gtest/gtest.h"
+#include "mace/kernels/neon/addn_neon.h"
+#include "mace/kernels/addn.h"
+
+using namespace mace;
+using namespace mace::kernels;
+
+TEST(NeonTest, AddN) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::normal_distribution<float> nd(0, 1);
+
+  int64 count = 100000;
+  Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
+  input_tensor1.Resize({100, 1000});
+  Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
+  input_tensor2.ResizeLike(input_tensor1);
+  Tensor input_tensor3(cpu_allocator(), DataType::DT_FLOAT);
+  input_tensor3.ResizeLike(input_tensor1);
+  vector<const Tensor *> input_tensors {&input_tensor1,
+                                        &input_tensor2,
+                                        &input_tensor3};
+
+  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
+  output_tensor.ResizeLike(input_tensors[0]);
+  Tensor output_tensor_neon(cpu_allocator(), DataType::DT_FLOAT);
+  output_tensor_neon.ResizeLike(input_tensors[0]);
+
+  float *input1 = input_tensor1.mutable_data<float>();
+  float *input2 = input_tensor2.mutable_data<float>();
+  float *input3 = input_tensor3.mutable_data<float>();
+  float *output = output_tensor.mutable_data<float>();
+  float *output_neon = output_tensor_neon.mutable_data<float>();
+
+  for (int64 i = 0; i < count; ++i) {
+    input1[i] = nd(gen);
+    input2[i] = nd(gen);
+    input3[i] = nd(gen);
+  }
+
+  AddNFuntion<float>(input_tensors, &output_tensor);
+  NeonAddNFuntion_float(input_tensors, &output_tensor_neon);
+
+  ASSERT_EQ(count, output_tensor.size());
+  ASSERT_EQ(count, output_tensor_neon.size());
+  for (int64 i = 0; i < count; ++i) {
+    ASSERT_FLOAT_EQ(output[i], output_neon[i]);
+  }
+}
+
diff --git a/mace/kernels/test/relu_neon_test.cc b/mace/kernels/test/relu_neon_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..40c1bc62d68a94820ac99d1140203c24dd412235
--- /dev/null
+++ b/mace/kernels/test/relu_neon_test.cc
@@ -0,0 +1,43 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+#include <random>
+#include <cmath>
+#include "gtest/gtest.h"
+#include "mace/kernels/neon/relu_neon.h"
+#include "mace/kernels/relu.h"
+
+using namespace mace;
+using namespace mace::kernels;
+
+TEST(NeonTest, Relu) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::normal_distribution<float> nd(0, 1);
+
+  int64 count = 100000;
+  Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
+  input_tensor.Resize({100, 1000});
+  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
+  output_tensor.ResizeLike(input_tensor);
+  Tensor output_tensor_neon(cpu_allocator(), DataType::DT_FLOAT);
+  output_tensor_neon.ResizeLike(input_tensor);
+
+  float *input = input_tensor.mutable_data<float>();
+  float *output = output_tensor.mutable_data<float>();
+  float *output_neon = output_tensor_neon.mutable_data<float>();
+
+  for (int64 i = 0; i < count; ++i) {
+    input[i] = nd(gen);
+  }
+
+  ReluFuntion<float>(&input_tensor, &output_tensor);
+  NeonReluFuntion_float(&input_tensor, &output_tensor_neon);
+
+  ASSERT_EQ(count, output_tensor.size());
+  ASSERT_EQ(count, output_tensor_neon.size());
+  for (int64 i = 0; i < count; ++i) {
+    ASSERT_FLOAT_EQ(output[i], output_neon[i]);
+  }
+}
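With the cc_test targets added to mace/kernels/BUILD, the correctness tests and benchmarks can be driven roughly as follows (a sketch; the exact crosstool/config flags for an Android NEON build depend on the local toolchain setup and are not shown in this diff):

    bazel test //mace/kernels:kernel_test
    bazel run //mace/kernels:benchmark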