Commit d5c39c6d authored by 李寅

Add conv5x5s1; clone tensor

Parent e14852b3
......@@ -32,6 +32,7 @@ private: \
#define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented")
-#define kCostPerGroup 8192
+// TODO: needs fine-tuning
+#define kCostPerGroup 1024000000
#endif // MACE_CORE_COMMON_H_
......@@ -159,6 +159,20 @@ class Tensor {
LOG(INFO) << os.str();
}
inline size_t SizeOfType() {
size_t type_size = 0;
CASES(dtype_, type_size = sizeof(T));
return type_size;
}
inline void Copy(const Tensor& other) {
alloc_ = other.alloc_;
dtype_ = other.dtype_;
ResizeLike(other);
const void* other_data = other.raw_data();
memcpy(raw_mutable_data(), other_data, size_ * SizeOfType());
}
private:
inline int64_t NumElements() const {
return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<int64_t>());
......@@ -169,6 +183,8 @@ class Tensor {
DataType dtype_;
std::shared_ptr<void> data_;
vector<index_t> shape_;
DISABLE_COPY_AND_ASSIGN(Tensor);
};
} // namespace tensor
......
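For context, a minimal sketch of how the new Tensor::Copy is meant to be used (illustrative only; the constructor arguments follow the CreateTensor helper changed later in this commit):

```cpp
Tensor src(cpu_allocator(), DataTypeToEnum<float>::v());
src.Resize({1, 2, 3, 4});
// ... fill via src.mutable_data<float>() ...

Tensor dst(cpu_allocator(), DataTypeToEnum<float>::v());
dst.Copy(src);  // deep copy: adopts src's allocator and dtype, resizes, then memcpys the bytes
```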
......@@ -14,7 +14,8 @@ template<DeviceType D, typename T>
struct AddNFunctor {
void operator()(const vector<const T*>& inputs,
T *output, index_t size) {
-int n = inputs.size();
+memset(output, 0, size * sizeof(T));
+int n = inputs.size();
for (int i = 0; i < n; ++i) {
for (index_t j = 0; j < size; ++j) {
output[j] += inputs[i][j];
......
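For reference, a minimal call of the CPU functor as the unit tests in this commit exercise it (a sketch; `vector` here is mace's alias for std::vector):

```cpp
std::vector<float> a = {1.f, 2.f}, b = {3.f, 4.f}, out(2);
std::vector<const float*> inputs = {a.data(), b.data()};
AddNFunctor<DeviceType::CPU, float> addn;
addn(inputs, out.data(), 2);  // out becomes {4.f, 6.f}
```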
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/testing/test_benchmark.h"
#include "mace/core/tensor.h"
#include "mace/kernels/addn.h"
using namespace mace;
using namespace mace::kernels;
static void AddNBenchmark(int iters, int n, int type) {
const int64_t tot = static_cast<int64_t>(iters) * n * 3;
mace::testing::ItemsProcessed(tot);
mace::testing::BytesProcessed(tot * (sizeof(float)));
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
vector<float> input1(n);
vector<float> input2(n);
vector<float> input3(n);
vector<float> output(n);
for (int64_t i = 0; i < n; ++i) {
input1[i] = nd(gen);
input2[i] = nd(gen);
input3[i] = nd(gen);
}
vector<const float*> inputs { input1.data(), input2.data(), input3.data() };
if (type == DeviceType::CPU) {
AddNFunctor<DeviceType::CPU, float> addn_functor;
while (iters--) {
addn_functor(inputs, &output[0], n);
}
} else if (type == DeviceType::NEON) {
AddNFunctor<DeviceType::NEON, float> neon_addn_functor;
while (iters--) {
neon_addn_functor(inputs, &output[0], n);
}
}
}
static const int kBenchmarkSize = 10000000;
BENCHMARK(AddNBenchmark)
->ArgPair(kBenchmarkSize, DeviceType::CPU)
->ArgPair(kBenchmarkSize, DeviceType::NEON);
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/testing/test_benchmark.h"
#include "mace/core/tensor.h"
#include "mace/kernels/relu.h"
using namespace mace;
using namespace mace::kernels;
static void ReluBenchmark(int iters, int n, int type) {
const int64_t tot = static_cast<int64_t>(iters) * n;
mace::testing::ItemsProcessed(tot);
mace::testing::BytesProcessed(tot * (sizeof(float)));
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
vector<float> input(n);
vector<float> output(n);
for (int64_t i = 0; i < n; ++i) {
input[i] = nd(gen);
}
if (type == DeviceType::CPU) {
ReluFunctor<DeviceType::CPU, float> relu_functor;
while (iters--) {
relu_functor(&input[0], &output[0], n);
}
} else if (type == DeviceType::NEON) {
ReluFunctor<DeviceType::NEON, float> neon_relu_functor;
while (iters--) {
neon_relu_functor(&input[0], &output[0], n);
}
}
}
static const int kBenchmarkSize = 10000000;
BENCHMARK(ReluBenchmark)
->ArgPair(kBenchmarkSize, DeviceType::CPU)
->ArgPair(kBenchmarkSize, DeviceType::NEON);
......@@ -12,6 +12,8 @@ template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(const vector<const float*>& inputs,
float *output,
index_t size) {
// TODO: replace with a NEON memset
memset(output, 0, size * sizeof(float));
int n = inputs.size();
int64_t cost = size * n;
int64_t groups = 1;
......
......@@ -44,6 +44,7 @@ static inline void ConstructInputWithPadding(const float* input,
}
}
extern void Conv2dNeonK1x1S1(const float* input, const index_t* input_shape,
const float* filter, const float* bias,
float* output, const index_t* output_shape);
......@@ -52,6 +53,10 @@ extern void Conv2dNeonK3x3S1(const float* input, const index_t* input_shape,
const float* filter, const float* bias,
float* output, const index_t* output_shape);
extern void Conv2dNeonK5x5S1(const float* input, const index_t* input_shape,
const float* filter, const float* bias,
float* output, const index_t* output_shape);
template<>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const float* input, // NCHW
const index_t* input_shape,
......@@ -86,7 +91,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const float* input, // N
nullptr
},
{
-nullptr,
+Conv2dNeonK5x5S1,
nullptr
}
};
......
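The elided lines around this hunk hold the NEON kernel selector. Reconstructed as a sketch from the visible rows (the exact table bounds are an assumption), it is a function-pointer table indexed by kernel size and stride, and this commit fills the 5x5/stride-1 slot:

```cpp
typedef void (*Conv2dNeonFunction)(const float* input, const index_t* input_shape,
                                   const float* filter, const float* bias,
                                   float* output, const index_t* output_shape);
// selector[kernel_size - 1][stride - 1]
static const Conv2dNeonFunction selector[5][2] = {
    {Conv2dNeonK1x1S1, nullptr},
    {nullptr, nullptr},
    {Conv2dNeonK3x3S1, nullptr},
    {nullptr, nullptr},
    {Conv2dNeonK5x5S1, nullptr},
};
```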
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_
#define MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_
#include <arm_neon.h>
#include "mace/core/common.h"
namespace mace {
namespace kernels {
void Conv2dNeonK5x5S1(const float* input, // NCHW
const index_t* input_shape,
const float* filter, // c_out, c_in, kernel_h, kernel_w
const float* bias, // c_out
float* output, // NCHW
const index_t* output_shape) {
const index_t batch = output_shape[0];
const index_t channels = output_shape[1];
const index_t height = output_shape[2];
const index_t width = output_shape[3];
const index_t input_batch = input_shape[0];
const index_t input_channels = input_shape[1];
const index_t input_height = input_shape[2];
const index_t input_width = input_shape[3];
MACE_ASSERT(input_batch == batch);
const index_t input_total_pixels_per_channel = input_height * input_width;
const index_t output_total_pixels_per_channel = height * width;
const index_t input_total_pixels_per_batch = input_total_pixels_per_channel
* input_channels;
const index_t output_total_pixels_per_batch = output_total_pixels_per_channel
* channels;
const index_t patch_size = input_channels * 25;
#pragma omp parallel for collapse(2)
for (index_t n = 0; n < batch; ++n) {
for (index_t c = 0; c < channels; ++c) {
float* output_ptr = output + n * output_total_pixels_per_batch
+ c * output_total_pixels_per_channel;
const float* input_ptr = input + n * input_total_pixels_per_batch;
// Fill with bias
for (index_t i = 0; i < output_total_pixels_per_channel; ++i) {
output_ptr[i] = bias[c];
}
for (index_t inc = 0; inc < input_channels; ++inc) {
float* outptr = output_ptr;
float* outptr2 = outptr + width;
const float* inptr = input_ptr + inc * input_total_pixels_per_channel;
const float* filter_ptr = filter + c * patch_size + inc * 25;
const float* r0 = inptr;
const float* r1 = inptr + input_width;
const float* r2 = inptr + input_width * 2;
const float* r3 = inptr + input_width * 3;
const float* r4 = inptr + input_width * 4;
const float* r5 = inptr + input_width * 5;
const float* k0 = filter_ptr;
const float* k1 = filter_ptr + 5;
const float* k2 = filter_ptr + 10;
const float* k3 = filter_ptr + 15;
const float* k4 = filter_ptr + 20;
float32x4_t _k0123 = vld1q_f32(filter_ptr);
float32x4_t _k4567 = vld1q_f32(filter_ptr + 4);
float32x4_t _k891011 = vld1q_f32(filter_ptr + 8);
float32x4_t _k12131415 = vld1q_f32(filter_ptr + 12);
float32x4_t _k16171819 = vld1q_f32(filter_ptr + 16);
float32x4_t _k20212223 = vld1q_f32(filter_ptr + 20);
float32x4_t _k24242424 = vdupq_n_f32(filter_ptr[24]);
// height_block_size = 2, width_block_size = 4
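// Register blocking: each pass below produces a 2x4 output tile. _sum
// accumulates the row at h, _sum2 the row at h + 1; the five overlapping
// input windows per row are built from two vector loads plus vextq_f32,
// so every input row is loaded only once per tile.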
int h = 0;
for (; h + 1 < height; h += 2) {
int width_blocks = width >> 2;
int remain = width - (width_blocks << 2);
for (; width_blocks > 0; --width_blocks) {
float32x4_t _sum = vld1q_f32(outptr);
float32x4_t _sum2 = vld1q_f32(outptr2);
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r04 = vld1q_f32(r0 + 4);
float32x4_t _r01 = vextq_f32(_r00, _r04, 1);
float32x4_t _r02 = vextq_f32(_r00, _r04, 2);
float32x4_t _r03 = vextq_f32(_r00, _r04, 3);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r14 = vld1q_f32(r1 + 4);
float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r24 = vld1q_f32(r2 + 4);
float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
float32x4_t _r30 = vld1q_f32(r3);
float32x4_t _r34 = vld1q_f32(r3 + 4);
float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
float32x4_t _r40 = vld1q_f32(r4);
float32x4_t _r44 = vld1q_f32(r4 + 4);
float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
float32x4_t _r50 = vld1q_f32(r5);
float32x4_t _r54 = vld1q_f32(r5 + 4);
float32x4_t _r51 = vextq_f32(_r50, _r54, 1);
float32x4_t _r52 = vextq_f32(_r50, _r54, 2);
float32x4_t _r53 = vextq_f32(_r50, _r54, 3);
_sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
_sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
_sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
_sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
_sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
_sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
_sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
_sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
_sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
_sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);
_sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
_sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
_sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
_sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
_sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);
_sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
_sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
_sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
_sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
_sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);
_sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
_sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
_sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
_sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
_sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r10, _k0123, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r11, _k0123, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r12, _k0123, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r13, _k0123, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r14, _k4567, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r20, _k4567, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r21, _k4567, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r22, _k4567, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r23, _k891011, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r24, _k891011, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r30, _k891011, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r31, _k891011, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r32, _k12131415, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r33, _k12131415, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r34, _k12131415, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r40, _k12131415, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r41, _k16171819, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r42, _k16171819, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r43, _k16171819, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r44, _k16171819, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r50, _k20212223, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r51, _k20212223, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r52, _k20212223, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r53, _k20212223, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r54, _k24242424, 0);
vst1q_f32(outptr, _sum);
vst1q_f32(outptr2, _sum2);
r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
r4 += 4;
r5 += 4;
outptr += 4;
outptr2 += 4;
}
for (; remain > 0; --remain) {
float sum = 0;
float sum2 = 0;
float32x4_t _r1 = vld1q_f32(r1);
float32x4_t _k1 = vld1q_f32(k1);
float32x4_t _sum = vmulq_f32(_r1, _k1);
float32x4_t _sum2 = vmulq_f32(_r1, _k0123);
float32x4_t _r2 = vld1q_f32(r2);
float32x4_t _k2 = vld1q_f32(k2);
_sum = vmlaq_f32(_sum, _r2, _k2);
_sum2 = vmlaq_f32(_sum2, _r2, _k1);
float32x4_t _r3 = vld1q_f32(r3);
float32x4_t _k3 = vld1q_f32(k3);
_sum = vmlaq_f32(_sum, _r3, _k3);
_sum2 = vmlaq_f32(_sum2, _r3, _k2);
float32x4_t _r4 = vld1q_f32(r4);
_sum = vmlaq_f32(_sum, _r4, _k20212223);
_sum2 = vmlaq_f32(_sum2, _r4, _k3);
float32x4_t _r0 = vld1q_f32(r0);
_sum = vmlaq_f32(_sum, _r0, _k0123);
float32x4_t _r5 = vld1q_f32(r5);
_sum2 = vmlaq_f32(_sum2, _r5, _k20212223);
float32x4_t _k_t4 = vdupq_n_f32(0.f);  // zero-initialize before lane inserts
_k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
_k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
_k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
_k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);
float32x4_t _r_t4 = vdupq_n_f32(0.f);
_r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
_r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
_r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
_r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
_sum = vmlaq_f32(_sum, _r_t4, _k_t4);
sum = r4[4] * k4[4];
_r_t4 = vextq_f32(_r_t4, _r_t4, 1);
_r_t4 = vsetq_lane_f32(r4[4], _r_t4, 3);
_sum2 = vmlaq_f32(_sum2, _r_t4, _k_t4);
sum2 = r5[4] * k4[4];
float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
float32x2_t _ss_ss2 = vpadd_f32(_ss, _ss2);
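// _ss/_ss2 fold each accumulator to two lanes; after vpadd, lane 0 holds
// the horizontal sum of _sum and lane 1 the horizontal sum of _sum2.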
sum += vget_lane_f32(_ss_ss2, 0);
sum2 += vget_lane_f32(_ss_ss2, 1);
*outptr += sum;
*outptr2 += sum2;
++r0;
++r1;
++r2;
++r3;
++r4;
++r5;
++outptr;
++outptr2;
}
r0 += 4 + input_width; // move to row h + 2: skip kernel_w - 1 = 4 trailing columns, then one full row
r1 += 4 + input_width;
r2 += 4 + input_width;
r3 += 4 + input_width;
r4 += 4 + input_width;
r5 += 4 + input_width;
outptr += width;
outptr2 += width;
}
for (; h < height; ++h) {
// an odd output height leaves one final row, handled one row at a time here
int width_blocks = width >> 2;
int remain = width - (width_blocks << 2);
for (; width_blocks > 0; --width_blocks) {
float32x4_t _sum = vld1q_f32(outptr);
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r04 = vld1q_f32(r0 + 4);
float32x4_t _r01 = vextq_f32(_r00, _r04, 1);
float32x4_t _r02 = vextq_f32(_r00, _r04, 2);
float32x4_t _r03 = vextq_f32(_r00, _r04, 3);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r14 = vld1q_f32(r1 + 4);
float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r24 = vld1q_f32(r2 + 4);
float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
float32x4_t _r30 = vld1q_f32(r3);
float32x4_t _r34 = vld1q_f32(r3 + 4);
float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
float32x4_t _r40 = vld1q_f32(r4);
float32x4_t _r44 = vld1q_f32(r4 + 4);
float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
_sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
_sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
_sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
_sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
_sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
_sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
_sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
_sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
_sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
_sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);
_sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
_sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
_sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
_sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
_sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);
_sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
_sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
_sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
_sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
_sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);
_sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
_sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
_sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
_sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
_sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);
vst1q_f32(outptr, _sum);
r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
r4 += 4;
r5 += 4;
outptr += 4;
}
for (; remain > 0; --remain) {
float sum = 0;
float32x4_t _r0 = vld1q_f32(r0);
float32x4_t _sum = vmulq_f32(_r0, _k0123);
float32x4_t _r1 = vld1q_f32(r1);
_sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1));
float32x4_t _r2 = vld1q_f32(r2);
_sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2));
float32x4_t _r3 = vld1q_f32(r3);
_sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3));
float32x4_t _r4 = vld1q_f32(r4);
_sum = vmlaq_f32(_sum, _r4, _k20212223);
float32x4_t _k_t4 = vdupq_n_f32(0.f);  // zero-initialize before lane inserts
_k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
_k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
_k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
_k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);
float32x4_t _r_t4 = vdupq_n_f32(0.f);
_r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
_r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
_r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
_r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
_sum = vmlaq_f32(_sum, _r_t4, _k_t4);
sum = r4[4] * k4[4];
float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
_ss = vpadd_f32(_ss, _ss);
sum += vget_lane_f32(_ss, 0);
*outptr += sum;
++r0;
++r1;
++r2;
++r3;
++r4;
++outptr;
}
r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
r4 += 4;
}
}
}
}
}
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_
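For readers checking the intrinsics above against a reference, the following scalar version has the same semantics as Conv2dNeonK5x5S1 (NCHW layout, stride 1, valid padding). It is an illustrative sketch, not part of the commit; index_t is assumed to alias int64_t as in mace/core/common.h:

```cpp
#include <cstdint>

typedef int64_t index_t;  // assumption, matching mace/core/common.h

void Conv2dRefK5x5S1(const float* input, const index_t* input_shape,
                     const float* filter, const float* bias,
                     float* output, const index_t* output_shape) {
  const index_t channels = output_shape[1], height = output_shape[2], width = output_shape[3];
  const index_t in_ch = input_shape[1], in_h = input_shape[2], in_w = input_shape[3];
  for (index_t n = 0; n < output_shape[0]; ++n) {
    for (index_t c = 0; c < channels; ++c) {
      for (index_t h = 0; h < height; ++h) {
        for (index_t w = 0; w < width; ++w) {
          float acc = bias[c];  // the NEON kernel likewise pre-fills the output with bias
          for (index_t ic = 0; ic < in_ch; ++ic) {
            for (index_t kh = 0; kh < 5; ++kh) {
              for (index_t kw = 0; kw < 5; ++kw) {
                acc += input[((n * in_ch + ic) * in_h + h + kh) * in_w + w + kw]
                     * filter[((c * in_ch + ic) * 5 + kh) * 5 + kw];
              }
            }
          }
          output[((n * channels + c) * height + h) * width + w] = acc;
        }
      }
    }
  }
}
```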
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <random>
#include <cmath>
#include "gtest/gtest.h"
#include "mace/kernels/addn.h"
using namespace mace;
using namespace mace::kernels;
TEST(NeonTest, AddN) {
testing::internal::LogToStderr();
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
int64_t count = 100000;
vector<float> input1(count);
vector<float> input2(count);
vector<float> input3(count);
vector<float> output(count);
vector<float> output_neon(count);
for (int64_t i = 0; i < count; ++i) {
input1[i] = nd(gen);
input2[i] = nd(gen);
input3[i] = nd(gen);
}
vector<const float*> inputs { input1.data(), input2.data(), input3.data() };
AddNFunctor<DeviceType::CPU, float> addn_functor;
AddNFunctor<DeviceType::NEON, float> neon_addn_functor;
addn_functor(inputs, &output[0], count);
neon_addn_functor(inputs, &output_neon[0], count);
for (int64_t i = 0; i < count; ++i) {
ASSERT_FLOAT_EQ(output[i], output_neon[i]);
}
}
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <random>
#include <cmath>
#include "gtest/gtest.h"
#include "mace/kernels/relu.h"
using namespace mace;
using namespace mace::kernels;
TEST(NeonTest, Relu) {
testing::internal::LogToStderr();
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
int64_t count = 100000;
vector<float> input(count);
vector<float> output(count);
vector<float> output_neon(count);
for (int64_t i = 0; i < count; ++i) {
input[i] = nd(gen);
}
ReluFunctor<DeviceType::CPU, float> relu_functor;
ReluFunctor<DeviceType::NEON, float> neon_relu_functor;
relu_functor(&input[0], &output[0], count);
neon_relu_functor(&input[0], &output_neon[0], count);
for (int64_t i = 0; i < count; ++i) {
ASSERT_FLOAT_EQ(output[i], output_neon[i]);
}
}
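The functor under test clamps negatives to zero. A scalar reference matching the call signature used above (assumed from the test; the actual CPU and NEON implementations live in mace/kernels/relu.h):

```cpp
#include <algorithm>
#include <cstdint>

typedef int64_t index_t;  // assumption, matching mace/core/common.h

void ReluRef(const float* input, float* output, index_t size) {
  for (index_t i = 0; i < size; ++i) {
    output[i] = std::max(input[i], 0.0f);  // negatives become zero
  }
}
```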
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
template <DeviceType D, typename T>
static void AddNBenchmark(int iters, int n, int size) {
mace::testing::StopTiming();
OpsTestNet net;
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < n; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.Finalize(net.operator_def());
// Add input data
for (int i = 0; i < n; ++i) {
net.AddRandomInput<float>(internal::MakeString("Input", i).c_str(), {size});
}
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
}
mace::testing::StartTiming();
while(iters--) {
net.RunOp(D);
}
}
#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE) \
static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * SIZE; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, N, SIZE); \
} \
BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE)
#define BM_ADDN(N, SIZE, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \
BM_ADDN_MACRO(N, SIZE, TYPE, NEON);
BM_ADDN(10, 1000, float);
BM_ADDN(10, 10000, float);
BM_ADDN(100, 1000, float);
BM_ADDN(100, 10000, float);
} // namespace mace
\ No newline at end of file
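For reference, each BM_ADDN line registers one benchmark per device; BM_ADDN(10, 1000, float) expands through the macros above to, for the CPU case:

```cpp
static void BM_ADDN_10_1000_float_CPU(int iters) {
  const int64_t tot = static_cast<int64_t>(iters) * 10 * 1000;
  mace::testing::ItemsProcessed(tot);
  mace::testing::BytesProcessed(tot * (sizeof(float)));
  AddNBenchmark<CPU, float>(iters, 10, 1000);
}
BENCHMARK(BM_ADDN_10_1000_float_CPU);
// ...and likewise for NEON.
```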
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
class AddnOpTest : public OpsTestBase {};
TEST_F(AddnOpTest, AddnOp) {
// Construct graph
auto& net = test_net();
OpDefBuilder("AddN", "AddNTest")
.Input("Input1")
.Input("Input2")
.Input("Input3")
.Output("Output")
.Finalize(net.operator_def());
// Add input data
net.AddRandomInput<float>("Input1", {1, 2, 3, 4});
net.AddRandomInput<float>("Input2", {1, 2, 3, 4});
net.AddRandomInput<float>("Input3", {1, 2, 3, 4});
// Run
net.RunOp();
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Check
net.RunOp(DeviceType::NEON);
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
}
} // namespace mace
......@@ -33,11 +33,11 @@ TEST_F(BatchNormOpTest, SimpleCPU) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 1, 6, 2},
+auto expected = CreateTensor<float>({1, 1, 6, 2},
{-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
}
TEST_F(BatchNormOpTest, SimpleNeon) {
......@@ -70,12 +70,12 @@ TEST_F(BatchNormOpTest, SimpleNeon) {
net.RunOp();
// Check
-Tensor expected = *net.GetOutput("Output");
+Tensor* expected = net.GetOutput("Output");
// Run NEON
net.RunOp(DeviceType::NEON);
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-5);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
}
......@@ -66,5 +66,9 @@ BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
} // namespace mace
......@@ -43,9 +43,9 @@ TEST_F(Conv2dOpTest, Simple_VALID) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
+auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, Simple_SAME) {
......@@ -81,12 +81,12 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 1, 3, 3},
+auto expected = CreateTensor<float>({1, 1, 3, 3},
{ 8.1f, 12.1f, 8.1f,
12.1f, 18.1f, 12.1f,
8.1f, 12.1f, 8.1f});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, Combined) {
......@@ -127,7 +127,7 @@ TEST_F(Conv2dOpTest, Combined) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 2, 3, 3},
+auto expected = CreateTensor<float>({1, 2, 3, 3},
{ 8.1f, 12.1f, 8.1f,
12.1f, 18.1f, 12.1f,
8.1f, 12.1f, 8.1f,
......@@ -136,7 +136,7 @@ TEST_F(Conv2dOpTest, Combined) {
4.2f, 6.2f, 4.2f});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, Conv1x1) {
......@@ -180,7 +180,7 @@ TEST_F(Conv2dOpTest, Conv1x1) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 2, 3, 10},
+auto expected = CreateTensor<float>({1, 2, 3, 10},
{5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f,
5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f,
5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f,
......@@ -188,11 +188,12 @@ TEST_F(Conv2dOpTest, Conv1x1) {
10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f,
10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
// TODO we need more tests
TEST_F(Conv2dOpTest, ConvNxNS12) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w,
int stride_h, int stride_w,
Padding type) {
......@@ -205,13 +206,13 @@ TEST_F(Conv2dOpTest, ConvNxNS12) {
index_t width = 7 + rand() % 100;
index_t output_channels = 1 + rand() % 50;
// Construct graph
-auto& net = test_net();
+auto &net = test_net();
OpDefBuilder("Conv2d", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.Finalize(net.operator_def());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.Finalize(net.operator_def());
// Add args
net.AddIntsArg("strides", {stride_h, stride_w});
......@@ -227,20 +228,15 @@ TEST_F(Conv2dOpTest, ConvNxNS12) {
net.RunOp();
// Check
-// TODO(liyin) Copy the tensor
-Tensor tmp = *net.GetOutput("Output");
Tensor expected;
-expected.ResizeLike(tmp);
-expected.Copy(tmp.data<float>(), tmp.size());
+expected.Copy(*net.GetOutput("Output"));
// Run NEON
net.RunOp(DeviceType::NEON);
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-3);
+ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
};
-for (int kernel_size : {1, 3}) {
+for (int kernel_size : {1, 3, 5}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
......
......@@ -79,6 +79,15 @@ class OpsTestNet {
[&gen, &nd, positive] { return positive ? std::abs(nd(gen)) : nd(gen); });
}
template<typename T>
void AddFixedInput(const char *name, const std::vector<index_t> &shape, T value) {
Tensor *input = ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
input->Resize(shape);
T *input_data = input->mutable_data<T>();
std::fill(input_data, input_data + input->size(), value);
}
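// Usage sketch (hypothetical values): net.AddFixedInput<float>("Bias", {64}, 0.1f);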
void AddIntArg(const char *name, const int value) {
auto arg = op_def_.add_arg();
arg->set_name(name);
......@@ -169,10 +178,10 @@ class OpsTestBase : public ::testing::Test {
};
template<typename T>
-Tensor CreateTensor(const std::vector<index_t> &shape, const std::vector<T> &data) {
-Tensor res(cpu_allocator(), DataTypeToEnum<T>::v());
-res.Resize(shape);
-float *input_data = res.mutable_data<float>();
+unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape, const std::vector<T> &data) {
+unique_ptr<Tensor> res(new Tensor(cpu_allocator(), DataTypeToEnum<T>::v()));
+res->Resize(shape);
+T *input_data = res->mutable_data<T>();
memcpy(input_data, data.data(), data.size() * sizeof(T));
return res;
}
......
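Because Tensor is now non-copyable (DISABLE_COPY_AND_ASSIGN in tensor.h above), CreateTensor returns a unique_ptr and call sites dereference it; that is what the `*expected` changes in the tests below reflect:

```cpp
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
```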
......@@ -43,10 +43,10 @@ TEST_F(PoolingOpTest, MAX_VALID) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 2, 2, 2},
+auto expected = CreateTensor<float>({1, 2, 2, 2},
{5, 7, 13, 15, 21, 23, 29, 31});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
......@@ -80,10 +80,10 @@ TEST_F(PoolingOpTest, AVG_VALID) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 2, 2, 2},
+auto expected = CreateTensor<float>({1, 2, 2, 2},
{2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, MAX_SAME) {
......@@ -111,10 +111,10 @@ TEST_F(PoolingOpTest, MAX_SAME) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 1, 2, 2},
+auto expected = CreateTensor<float>({1, 1, 2, 2},
{4, 5, 7, 8});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
......@@ -143,8 +143,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 1, 2, 2},
+auto expected = CreateTensor<float>({1, 1, 2, 2},
{10, 11, 14, 15});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
template <DeviceType D, typename T>
static void ReluBenchmark(int iters, int size) {
mace::testing::StopTiming();
OpsTestNet net;
OpDefBuilder("Relu", "ReluBM")
.Input("Input")
.Output("Output")
.Finalize(net.operator_def());
// Add input data
net.AddRandomInput<float>("Input", {size});
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
}
mace::testing::StartTiming();
while(iters--) {
net.RunOp(D);
}
}
#define BM_RELU_MACRO(SIZE, TYPE, DEVICE) \
static void BM_RELU_##SIZE##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * SIZE; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
ReluBenchmark<DEVICE, TYPE>(iters, SIZE); \
} \
BENCHMARK(BM_RELU_##SIZE##_##TYPE##_##DEVICE)
#define BM_RELU(SIZE, TYPE) \
BM_RELU_MACRO(SIZE, TYPE, CPU); \
BM_RELU_MACRO(SIZE, TYPE, NEON);
BM_RELU(1000, float);
BM_RELU(100000, float);
BM_RELU(10000000, float);
} // namespace mace
\ No newline at end of file
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
class ReluOpTest : public OpsTestBase {};
TEST_F(ReluOpTest, ReluOp) {
// Construct graph
auto& net = test_net();
OpDefBuilder("Relu", "ReluTest")
.Input("Input")
.Output("Output")
.Finalize(net.operator_def());
// Add input data
net.AddRandomInput<float>("Input", {1, 2, 3, 4});
// Run
net.RunOp();
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Check
net.RunOp(DeviceType::NEON);
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
}
} // namespace mace
......@@ -30,9 +30,9 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWOAlignCorners) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 3, 1, 2}, {0, 2, 8, 10, 16, 18});
+auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 2, 8, 10, 16, 18});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
......@@ -57,7 +57,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
net.RunOp();
// Check
-Tensor expected = CreateTensor<float>({1, 3, 1, 2}, {0, 3, 8, 11, 16, 19});
+auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 3, 8, 11, 16, 19});
-ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}