Commit 6a4f0c12 authored by: Liangliang He

Merge branch 'master' into 'master'

Implement AddN Op

See merge request !9
@@ -23,7 +23,7 @@ cc_library(
 # Main program for tests
 cc_library(
-    name = "test_main",
+    name = "test_benchmark_main",
     testonly = 1,
     srcs = glob([
         "testing/*.cc",
......
@@ -33,4 +33,6 @@ private: \
 #define MACE_NOT_IMPLEMENTED REQUIRE(false, "not implemented")
+
+#define kCostPerGroup 8192
 #endif  // MACE_CORE_COMMON_H_
@@ -28,6 +28,6 @@ cc_test(
     linkstatic = 1,
     deps = [
         "//mace/core",
-        "//mace/core:test_main",
+        "//mace/core:test_benchmark_main",
     ],
 )
@@ -17,9 +17,41 @@ cc_library(
     deps = [
         "//mace/core:core",
     ],
-    copts = ['-std=c++11'] + if_android([
-        "-mfpu=neon",  # TODO recheck the flags
-        "-mfloat-abi=hard",
-    ]),
+    copts = ['-std=c++11'],
 )
cc_test(
    name = "kernel_test",
    srcs = glob(["test/*.cc"]),
    deps = [
        "@gtest//:gtest_main",
        ":kernels",
        "//mace/core:core",
    ],
    copts = ['-std=c++11'],
    linkopts = ["-fopenmp"] + if_android([
        "-pie",
        "-llog",
        "-lm",
    ]),
    linkstatic = 1,
    testonly = 1,
)

cc_test(
    name = "benchmark",
    srcs = glob(["benchmark/*.cc"]),
    deps = [
        ":kernels",
        "//mace/core:core",
        "//mace/core:test_benchmark_main",
    ],
    copts = ['-std=c++11'],
    linkopts = ["-fopenmp"] + if_android([
        "-pie",
        "-llog",
        "-lm",
    ]),
    linkstatic = 1,
    testonly = 1,
)
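With these targets in place, the kernels tests and benchmarks should be runnable directly through Bazel, e.g. bazel test //mace/kernels:kernel_test for the unit tests and bazel run //mace/kernels:benchmark for the micro-benchmarks (assuming the workspace's usual host or Android cross-compilation toolchain is configured).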
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#ifndef MACE_KERNELS_ADDN_H_
#define MACE_KERNELS_ADDN_H_

#include "mace/core/tensor.h"

namespace mace {
namespace kernels {

template <typename T>
void AddNFuntion(const vector<const Tensor *> &input_tensor,
                 Tensor *output_tensor) {
  int n = input_tensor.size();
  CHECK(n > 1);
  CHECK_NOTNULL(input_tensor[0]);
  int64 size = input_tensor[0]->size();
  vector<const T *> inputs(n);
  for (int i = 0; i < n; ++i) {
    inputs[i] = input_tensor[i]->data<T>();
  }
  output_tensor->ResizeLike(input_tensor[0]);
  T *output = output_tensor->mutable_data<T>();
  // The freshly allocated output buffer is not guaranteed to be zeroed, so
  // initialize it from the first input and accumulate the remaining inputs.
  for (int64 j = 0; j < size; ++j) {
    output[j] = inputs[0][j];
  }
  for (int i = 1; i < n; ++i) {
    for (int64 j = 0; j < size; ++j) {
      output[j] += inputs[i][j];
    }
  }
}

}  // namespace kernels
}  // namespace mace

#endif  // MACE_KERNELS_ADDN_H_
\ No newline at end of file
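For reference, a minimal sketch of how the new kernel is called (the same pattern used by the benchmark and unit test below; the function name AddNExample and the shapes are purely illustrative):

#include "mace/core/tensor.h"
#include "mace/kernels/addn.h"

using namespace mace;
using namespace mace::kernels;

void AddNExample() {
  // Two 8-element float inputs; AddNFuntion resizes the output to match.
  Tensor a(cpu_allocator(), DataType::DT_FLOAT);
  a.Resize({8});
  Tensor b(cpu_allocator(), DataType::DT_FLOAT);
  b.Resize({8});
  float *pa = a.mutable_data<float>();
  float *pb = b.mutable_data<float>();
  for (int i = 0; i < 8; ++i) {
    pa[i] = 1.0f;
    pb[i] = static_cast<float>(i);
  }

  Tensor out(cpu_allocator(), DataType::DT_FLOAT);
  vector<const Tensor *> inputs {&a, &b};
  AddNFuntion<float>(inputs, &out);  // out[i] == a[i] + b[i]
}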
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include <random>

#include "mace/core/testing/test_benchmark.h"
#include "mace/core/tensor.h"
#include "mace/kernels/addn.h"
#include "mace/kernels/neon/addn_neon.h"

using namespace mace;
using namespace mace::kernels;

static void AddNBenchmark(int iters, int n, int type) {
  const int64 tot = static_cast<int64>(iters) * n * 3;
  mace::testing::ItemsProcessed(tot);
  mace::testing::BytesProcessed(tot * (sizeof(float)));

  std::random_device rd;
  std::mt19937 gen(rd());
  std::normal_distribution<float> nd(0, 1);

  Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
  input_tensor1.Resize({n});
  Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
  input_tensor2.Resize({n});
  Tensor input_tensor3(cpu_allocator(), DataType::DT_FLOAT);
  input_tensor3.Resize({n});
  vector<const Tensor *> input_tensors {&input_tensor1,
                                        &input_tensor2,
                                        &input_tensor3};
  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
  output_tensor.ResizeLike(input_tensor1);

  float *input1 = input_tensor1.mutable_data<float>();
  float *input2 = input_tensor2.mutable_data<float>();
  float *input3 = input_tensor3.mutable_data<float>();
  float *output = output_tensor.mutable_data<float>();
  for (int64 i = 0; i < n; ++i) {
    input1[i] = nd(gen);
    input2[i] = nd(gen);
    input3[i] = nd(gen);
  }

  // Run the kernel once per requested iteration so the measured work matches
  // the items/bytes reported above.
  for (int iter = 0; iter < iters; ++iter) {
    if (type == DeviceType::CPU) {
      AddNFuntion<float>(input_tensors, &output_tensor);
    } else if (type == DeviceType::NEON) {
      NeonAddNFuntion_float(input_tensors, &output_tensor);
    }
  }
}

static const int kBenchmarkSize = 10000000;

BENCHMARK(AddNBenchmark)
    ->ArgPair(kBenchmarkSize, DeviceType::CPU)
    ->ArgPair(kBenchmarkSize, DeviceType::NEON);
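For context on the registration above: each ArgPair call appears to register one benchmark case per (size, device type) pair with the test_benchmark harness, which then supplies the iteration count iters; ItemsProcessed and BytesProcessed feed the throughput figures reported alongside the timing.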
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include <random>

#include "mace/core/testing/test_benchmark.h"
#include "mace/core/tensor.h"
#include "mace/kernels/relu.h"
#include "mace/kernels/neon/relu_neon.h"

using namespace mace;
using namespace mace::kernels;

static void ReluBenchmark(int iters, int n, int type) {
  const int64 tot = static_cast<int64>(iters) * n;
  mace::testing::ItemsProcessed(tot);
  mace::testing::BytesProcessed(tot * (sizeof(float)));

  std::random_device rd;
  std::mt19937 gen(rd());
  std::normal_distribution<float> nd(0, 1);

  Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
  input_tensor.Resize({n});
  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
  output_tensor.ResizeLike(input_tensor);

  float *input = input_tensor.mutable_data<float>();
  float *output = output_tensor.mutable_data<float>();
  for (int64 i = 0; i < n; ++i) {
    input[i] = nd(gen);
  }

  // Run the kernel once per requested iteration so the measured work matches
  // the items/bytes reported above.
  for (int iter = 0; iter < iters; ++iter) {
    if (type == DeviceType::CPU) {
      ReluFuntion<float>(&input_tensor, &output_tensor);
    } else if (type == DeviceType::NEON) {
      NeonReluFuntion_float(&input_tensor, &output_tensor);
    }
  }
}

static const int kBenchmarkSize = 10000000;

BENCHMARK(ReluBenchmark)
    ->ArgPair(kBenchmarkSize, DeviceType::CPU)
    ->ArgPair(kBenchmarkSize, DeviceType::NEON);
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include <arm_neon.h>
#include <cstring>

#include "mace/kernels/neon/addn_neon.h"
#include "mace/core/common.h"

namespace mace {
namespace kernels {

void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
                           Tensor *output_tensor) {
  int n = input_tensor.size();
  CHECK(n > 1);
  CHECK_NOTNULL(input_tensor[0]);
  int64 size = input_tensor[0]->size();
  output_tensor->ResizeLike(input_tensor[0]);
  float *output = output_tensor->mutable_data<float>();
  vector<const float *> inputs(n);
  for (int i = 0; i < n; ++i) {
    inputs[i] = input_tensor[i]->data<float>();
  }
  // The output buffer is accumulated into below, so make sure it starts from
  // zero rather than whatever the allocator returned.
  memset(output, 0, size * sizeof(float));

  int64 cost = size * n;
  int64 groups = 1;
  if (cost > kCostPerGroup) {
    groups = cost / kCostPerGroup;
  }
  int64 element_per_group = size / groups;

#pragma omp parallel for num_threads(1)  // no significant performance improvement
  for (int64 i = 0; i < size; i += element_per_group) {
    int64 count = std::min(element_per_group, size - i);
    int nn = count >> 2;
    int remain = count - (nn << 2);
    for (int64 j = 0; j < n; ++j) {
      const float *inptr = inputs[j] + i;
      float *outptr = output + i;
      for (int k = 0; k < nn; ++k) {
        float32x4_t _inptr = vld1q_f32(inptr);
        float32x4_t _outptr = vld1q_f32(outptr);
        _outptr = vaddq_f32(_outptr, _inptr);
        vst1q_f32(outptr, _outptr);
        inptr += 4;
        outptr += 4;
      }
      for (int k = 0; k < remain; ++k) {
        *outptr += *inptr;
        ++inptr;
        ++outptr;
      }
    }
  }
}

}  // namespace kernels
}  // namespace mace
\ No newline at end of file
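A note on the grouping logic in NeonAddNFuntion_float: the work is split into groups of roughly kCostPerGroup (8192) element-additions each, so that every chunk handed to the OpenMP loop has a reasonable size. For example, with three inputs of 100000 floats each, cost = 3 * 100000 = 300000, groups = 300000 / 8192 = 36, and element_per_group = 100000 / 36 = 2777, so each pass of the outer loop adds about 2777 elements from all three inputs into the output.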
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#ifndef MACE_KERNELS_ADDN_NEON_H_
#define MACE_KERNELS_ADDN_NEON_H_

#include "mace/core/tensor.h"

namespace mace {
namespace kernels {

void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
                           Tensor *output_tensor);

}  // namespace kernels
}  // namespace mace

#endif  // MACE_KERNELS_ADDN_NEON_H_
@@ -6,23 +6,36 @@
 #include "mace/kernels/neon/relu_neon.h"
 namespace mace {
-namespace kernels{
+namespace kernels {
 void NeonReluFuntion_float(const Tensor *input_tensor,
                            Tensor *output_tensor) {
   int64 size = input_tensor->size();
   output_tensor->ResizeLike(input_tensor);
-  const float* input = input_tensor->data<float>();
-  float* output = output_tensor->mutable_data<float>();
-  float32x4_t _zero = vdupq_n_f32(0.f);
-  for (; size > 0; size--) {
-    float32x4_t _inp = vld1q_f32(input);
-    float32x4_t _outp = vmaxq_f32(_inp, _zero);
-    vst1q_f32(output, _outp);
-    input += 4;
-    output += 4;
+  const float *input = input_tensor->data<float>();
+  float *output = output_tensor->mutable_data<float>();
+#pragma omp parallel for num_threads(1)  // no significant performance improvement
+  for (int64 i = 0; i < size; i += kCostPerGroup) {
+    int64 count = std::min(static_cast<int64>(kCostPerGroup), size - i);
+    int nn = count >> 2;
+    int remain = count - (nn << 2);
+    const float *inptr = input + i;
+    float *outptr = output + i;
+    float32x4_t _zero = vdupq_n_f32(0.f);
+    for (; nn > 0; --nn) {
+      float32x4_t _inptr = vld1q_f32(inptr);
+      float32x4_t _outptr = vmaxq_f32(_inptr, _zero);
+      vst1q_f32(outptr, _outptr);
+      inptr += 4;
+      outptr += 4;
+    }
+    for (; remain > 0; --remain) {
+      *outptr = std::max(*inptr, 0.f);
+      ++inptr;
+      ++outptr;
+    }
   }
 }
......
@@ -14,8 +14,8 @@ template<typename T>
 void ReluFuntion(const Tensor *input_tensor, Tensor *output_tensor) {
   int64 size = input_tensor->size();
   output_tensor->ResizeLike(input_tensor);
-  const float* input = input_tensor->data<float>();
-  float* output = output_tensor->mutable_data<float>();
+  const float *input = input_tensor->data<float>();
+  float *output = output_tensor->mutable_data<float>();
   for (int64 i = 0; i < size; ++i) {
     output[i] = std::max(input[i], static_cast<T>(0));
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include <random>
#include <cmath>

#include "gtest/gtest.h"
#include "mace/kernels/neon/addn_neon.h"
#include "mace/kernels/addn.h"

using namespace mace;
using namespace mace::kernels;

TEST(NeonTest, AddN) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::normal_distribution<float> nd(0, 1);

  int64 count = 100000;
  Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
  input_tensor1.Resize({100, 1000});
  Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
  input_tensor2.ResizeLike(input_tensor1);
  Tensor input_tensor3(cpu_allocator(), DataType::DT_FLOAT);
  input_tensor3.ResizeLike(input_tensor1);
  vector<const Tensor *> input_tensors {&input_tensor1,
                                        &input_tensor2,
                                        &input_tensor3};
  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
  output_tensor.ResizeLike(input_tensors[0]);
  Tensor output_tensor_neon(cpu_allocator(), DataType::DT_FLOAT);
  output_tensor_neon.ResizeLike(input_tensors[0]);

  float *input1 = input_tensor1.mutable_data<float>();
  float *input2 = input_tensor2.mutable_data<float>();
  float *input3 = input_tensor3.mutable_data<float>();
  float *output = output_tensor.mutable_data<float>();
  float *output_neon = output_tensor_neon.mutable_data<float>();

  for (int64 i = 0; i < count; ++i) {
    input1[i] = nd(gen);
    input2[i] = nd(gen);
    input3[i] = nd(gen);
  }

  AddNFuntion<float>(input_tensors, &output_tensor);
  NeonAddNFuntion_float(input_tensors, &output_tensor_neon);

  ASSERT_EQ(count, output_tensor.size());
  ASSERT_EQ(count, output_tensor_neon.size());
  for (int64 i = 0; i < count; ++i) {
    ASSERT_FLOAT_EQ(output[i], output_neon[i]);
  }
}
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include <random>
#include <cmath>

#include "gtest/gtest.h"
#include "mace/kernels/neon/relu_neon.h"
#include "mace/kernels/relu.h"

using namespace mace;
using namespace mace::kernels;

TEST(NeonTest, Relu) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::normal_distribution<float> nd(0, 1);

  int64 count = 100000;
  Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
  input_tensor.Resize({100, 1000});
  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
  output_tensor.ResizeLike(input_tensor);
  Tensor output_tensor_neon(cpu_allocator(), DataType::DT_FLOAT);
  output_tensor_neon.ResizeLike(input_tensor);

  float *input = input_tensor.mutable_data<float>();
  float *output = output_tensor.mutable_data<float>();
  float *output_neon = output_tensor_neon.mutable_data<float>();

  for (int64 i = 0; i < count; ++i) {
    input[i] = nd(gen);
  }

  ReluFuntion<float>(&input_tensor, &output_tensor);
  NeonReluFuntion_float(&input_tensor, &output_tensor_neon);

  ASSERT_EQ(count, output_tensor.size());
  ASSERT_EQ(count, output_tensor_neon.size());
  for (int64 i = 0; i < count; ++i) {
    ASSERT_FLOAT_EQ(output[i], output_neon[i]);
  }
}