From d5c39c6d67e1c82831d70ba2b6bf57a48f421b82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= Date: Thu, 14 Sep 2017 17:56:35 +0800 Subject: [PATCH] Add conv5x5s1; clone tensor --- mace/core/common.h | 3 +- mace/core/tensor.h | 16 + mace/kernels/addn.h | 3 +- mace/kernels/benchmark/addn_benchmark.cc | 50 --- mace/kernels/benchmark/relu_benchmark.cc | 45 --- mace/kernels/neon/addn_neon.cc | 2 + mace/kernels/neon/conv_2d_neon.cc | 7 +- mace/kernels/neon/conv_2d_neon_5x5.cc | 420 +++++++++++++++++++++++ mace/kernels/test/addn_neon_test.cc | 42 --- mace/kernels/test/relu_neon_test.cc | 37 -- mace/ops/addn_benchmark.cc | 58 ++++ mace/ops/addn_test.cc | 39 +++ mace/ops/batch_norm_test.cc | 8 +- mace/ops/conv_2d_benchmark.cc | 4 + mace/ops/conv_2d_test.cc | 40 +-- mace/ops/ops_test_util.h | 17 +- mace/ops/pooling_test.cc | 16 +- mace/ops/relu_benchmark.cc | 53 +++ mace/ops/relu_test.cc | 35 ++ mace/ops/resize_bilinear_test.cc | 8 +- 20 files changed, 684 insertions(+), 219 deletions(-) delete mode 100644 mace/kernels/benchmark/addn_benchmark.cc delete mode 100644 mace/kernels/benchmark/relu_benchmark.cc create mode 100644 mace/kernels/neon/conv_2d_neon_5x5.cc delete mode 100644 mace/kernels/test/addn_neon_test.cc delete mode 100644 mace/kernels/test/relu_neon_test.cc create mode 100644 mace/ops/addn_benchmark.cc create mode 100644 mace/ops/addn_test.cc create mode 100644 mace/ops/relu_benchmark.cc create mode 100644 mace/ops/relu_test.cc diff --git a/mace/core/common.h b/mace/core/common.h index e5e07225..df22eacd 100644 --- a/mace/core/common.h +++ b/mace/core/common.h @@ -32,6 +32,7 @@ private: \ #define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented") -#define kCostPerGroup 8192 +// TODO: need to fine tune this +#define kCostPerGroup 1024000000 #endif // MACE_CORE_COMMON_H_ diff --git a/mace/core/tensor.h b/mace/core/tensor.h index d40d39d7..3dc3f1ed 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -159,6 +159,20 @@ class Tensor { LOG(INFO) << os.str(); } + inline size_t SizeOfType() { + size_t type_size = 0; + CASES(dtype_, type_size = sizeof(T)); + return type_size; + } + + inline void Copy(const Tensor& other) { + alloc_ = other.alloc_; + dtype_ = other.dtype_; + ResizeLike(other); + const void* other_data = other.raw_data(); + memcpy(raw_mutable_data(), other_data, size_ * SizeOfType()); + } + private: inline int64_t NumElements() const { return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies()); @@ -169,6 +183,8 @@ class Tensor { DataType dtype_; std::shared_ptr data_; vector shape_; + + DISABLE_COPY_AND_ASSIGN(Tensor); }; } // namespace tensor diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 3d978d19..f1803ce3 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -14,7 +14,8 @@ template struct AddNFunctor { void operator()(const vector& inputs, T *output, index_t size) { - int n = inputs.size(); + memset(output, 0, size * sizeof(T)); + int n = inputs.size(); for (int i = 0; i < n; ++i) { for (index_t j = 0; j < size; ++j) { output[j] += inputs[i][j]; diff --git a/mace/kernels/benchmark/addn_benchmark.cc b/mace/kernels/benchmark/addn_benchmark.cc deleted file mode 100644 index d717c44e..00000000 --- a/mace/kernels/benchmark/addn_benchmark.cc +++ /dev/null @@ -1,50 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include "mace/core/testing/test_benchmark.h" -#include "mace/core/tensor.h" -#include "mace/kernels/addn.h" - -using namespace mace; -using namespace mace::kernels; - -static void AddNBenchmark(int iters, int n, int type) { - const int64_t tot = static_cast(iters) * n * 3; - mace::testing::ItemsProcessed(tot); - mace::testing::BytesProcessed(tot * (sizeof(float))); - - std::random_device rd; - std::mt19937 gen(rd()); - std::normal_distribution nd(0, 1); - - vector input1(n); - vector input2(n); - vector input3(n); - vector output(n); - - for (int64_t i = 0; i < n; ++i) { - input1[i] = nd(gen); - input2[i] = nd(gen); - input3[i] = nd(gen); - } - vector inputs { input1.data(), input2.data(), input3.data() }; - - if (type == DeviceType::CPU) { - AddNFunctor addn_functor; - while (--iters) { - addn_functor(inputs, &output[0], n); - } - } else if (type == DeviceType::NEON) { - AddNFunctor neon_addn_functor; - while (--iters) { - neon_addn_functor(inputs, &output[0], n); - } - } -} - -static const int kBenchmarkSize = 10000000; - -BENCHMARK(AddNBenchmark) - ->ArgPair(kBenchmarkSize, DeviceType::CPU) - ->ArgPair(kBenchmarkSize, DeviceType::NEON); diff --git a/mace/kernels/benchmark/relu_benchmark.cc b/mace/kernels/benchmark/relu_benchmark.cc deleted file mode 100644 index 2d5067b9..00000000 --- a/mace/kernels/benchmark/relu_benchmark.cc +++ /dev/null @@ -1,45 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include "mace/core/testing/test_benchmark.h" -#include "mace/core/tensor.h" -#include "mace/kernels/relu.h" - -using namespace mace; -using namespace mace::kernels; - -static void ReluBenchmark(int iters, int n, int type) { - const int64_t tot = static_cast(iters) * n; - mace::testing::ItemsProcessed(tot); - mace::testing::BytesProcessed(tot * (sizeof(float))); - - std::random_device rd; - std::mt19937 gen(rd()); - std::normal_distribution nd(0, 1); - - vector input(n); - vector output(n); - - for (int64_t i = 0; i < n; ++i) { - input[i] = nd(gen); - } - - if (type == DeviceType::CPU) { - ReluFunctor relu_functor; - while (--iters) { - relu_functor(&input[0], &output[0], n); - } - } else if (type == DeviceType::NEON) { - ReluFunctor neon_relu_functor; - while (--iters) { - neon_relu_functor(&input[0], &output[0], n); - } - } -} - -static const int kBenchmarkSize = 10000000; - -BENCHMARK(ReluBenchmark) - ->ArgPair(kBenchmarkSize, DeviceType::CPU) - ->ArgPair(kBenchmarkSize, DeviceType::NEON); diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc index 77a7c5c0..86e53bcb 100644 --- a/mace/kernels/neon/addn_neon.cc +++ b/mace/kernels/neon/addn_neon.cc @@ -12,6 +12,8 @@ template <> void AddNFunctor::operator()(const vector& inputs, float *output, index_t size) { + // TODO: neon mem copy + memset(output, 0, size * sizeof(float)); int n = inputs.size(); int64_t cost = size * n; int64_t groups = 1; diff --git a/mace/kernels/neon/conv_2d_neon.cc b/mace/kernels/neon/conv_2d_neon.cc index 06d3b3e7..15550308 100644 --- a/mace/kernels/neon/conv_2d_neon.cc +++ b/mace/kernels/neon/conv_2d_neon.cc @@ -44,6 +44,7 @@ static inline void ConstructInputWithPadding(const float* input, } } + extern void Conv2dNeonK1x1S1(const float* input, const index_t* input_shape, const float* filter, const float* bias, float* output, const index_t* output_shape); @@ -52,6 +53,10 @@ extern void Conv2dNeonK3x3S1(const float* input, const index_t* input_shape, const float* filter, const float* bias, float* output, const index_t* output_shape); +extern void Conv2dNeonK5x5S1(const float* input, const index_t* input_shape, + const float* filter, const float* bias, + float* output, const index_t* output_shape); + template<> void Conv2dFunctor::operator()(const float* input, // NCHW const index_t* input_shape, @@ -86,7 +91,7 @@ void Conv2dFunctor::operator()(const float* input, // N nullptr }, { - nullptr, + Conv2dNeonK5x5S1, nullptr } }; diff --git a/mace/kernels/neon/conv_2d_neon_5x5.cc b/mace/kernels/neon/conv_2d_neon_5x5.cc new file mode 100644 index 00000000..693f1241 --- /dev/null +++ b/mace/kernels/neon/conv_2d_neon_5x5.cc @@ -0,0 +1,420 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// +#ifndef MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_ +#define MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_ + +#include +#include "mace/core/common.h" + +namespace mace { +namespace kernels { + +void Conv2dNeonK5x5S1(const float* input, // NCHW + const index_t* input_shape, + const float* filter, // c_out, c_in, kernel_h, kernel_w + const float* bias, // c_out + float* output, // NCHW + const index_t* output_shape) { + const index_t batch = output_shape[0]; + const index_t channels = output_shape[1]; + const index_t height = output_shape[2]; + const index_t width = output_shape[3]; + + const index_t input_batch = input_shape[0]; + const index_t input_channels = input_shape[1]; + const index_t input_height = input_shape[2]; + const index_t input_width = input_shape[3]; + + MACE_ASSERT(input_batch == batch); + + const index_t input_total_pixels_per_channel = input_height * input_width; + const index_t output_total_pixels_per_channel = height * width; + const index_t input_total_pixels_per_batch = input_total_pixels_per_channel + * input_channels; + const index_t output_total_pixels_per_batch = output_total_pixels_per_channel + * channels; + const index_t patch_size = input_channels * 25; + +#pragma omp parallel for collapse(2) + for (index_t n = 0; n < batch; ++n) { + for (index_t c = 0; c < channels; ++c) { + float* output_ptr = output + n * output_total_pixels_per_batch + + c * output_total_pixels_per_channel; + const float* input_ptr = input + n * input_total_pixels_per_batch; + + // Fill with bias + for (index_t i = 0; i < output_total_pixels_per_channel; ++i) { + output_ptr[i] = bias[c]; + } + + for (index_t inc = 0; inc < input_channels; ++inc) { + float* outptr = output_ptr; + float* outptr2 = outptr + width; + + const float* inptr = input_ptr + inc * input_total_pixels_per_channel; + const float* filter_ptr = filter + c * patch_size + inc * 25; + + const float* r0 = inptr; + const float* r1 = inptr + input_width; + const float* r2 = inptr + input_width * 2; + const float* r3 = inptr + input_width * 3; + const float* r4 = inptr + input_width * 4; + const float* r5 = inptr + input_width * 5; + + const float* k0 = filter_ptr; + const float* k1 = filter_ptr + 5; + const float* k2 = filter_ptr + 10; + const float* k3 = filter_ptr + 15; + const float* k4 = filter_ptr + 20; + + float32x4_t _k0123 = vld1q_f32(filter_ptr); + float32x4_t _k4567 = vld1q_f32(filter_ptr + 4); + float32x4_t _k891011 = vld1q_f32(filter_ptr + 8); + float32x4_t _k12131415 = vld1q_f32(filter_ptr + 12); + float32x4_t _k16171819 = vld1q_f32(filter_ptr + 16); + float32x4_t _k20212223 = vld1q_f32(filter_ptr + 20); + float32x4_t _k24242424 = vdupq_n_f32(filter_ptr[24]); + + // height_block_size = 2, width_block_size = 4 + int h = 0; + for (; h + 1 < height; h += 2) { + int width_blocks = width >> 2; + int remain = width - (width_blocks << 2); + + for (; width_blocks > 0; --width_blocks) { + float32x4_t _sum = vld1q_f32(outptr); + float32x4_t _sum2 = vld1q_f32(outptr2); + + float32x4_t _r00 = vld1q_f32(r0); + float32x4_t _r04 = vld1q_f32(r0 + 4); + float32x4_t _r01 = vextq_f32(_r00, _r04, 1); + float32x4_t _r02 = vextq_f32(_r00, _r04, 2); + float32x4_t _r03 = vextq_f32(_r00, _r04, 3); + + float32x4_t _r10 = vld1q_f32(r1); + float32x4_t _r14 = vld1q_f32(r1 + 4); + float32x4_t _r11 = vextq_f32(_r10, _r14, 1); + float32x4_t _r12 = vextq_f32(_r10, _r14, 2); + float32x4_t _r13 = vextq_f32(_r10, _r14, 3); + + float32x4_t _r20 = vld1q_f32(r2); + float32x4_t _r24 = vld1q_f32(r2 + 4); + float32x4_t _r21 = vextq_f32(_r20, _r24, 1); + float32x4_t _r22 = vextq_f32(_r20, _r24, 2); + float32x4_t _r23 = vextq_f32(_r20, _r24, 3); + + float32x4_t _r30 = vld1q_f32(r3); + float32x4_t _r34 = vld1q_f32(r3 + 4); + float32x4_t _r31 = vextq_f32(_r30, _r34, 1); + float32x4_t _r32 = vextq_f32(_r30, _r34, 2); + float32x4_t _r33 = vextq_f32(_r30, _r34, 3); + + float32x4_t _r40 = vld1q_f32(r4); + float32x4_t _r44 = vld1q_f32(r4 + 4); + float32x4_t _r41 = vextq_f32(_r40, _r44, 1); + float32x4_t _r42 = vextq_f32(_r40, _r44, 2); + float32x4_t _r43 = vextq_f32(_r40, _r44, 3); + + float32x4_t _r50 = vld1q_f32(r5); + float32x4_t _r54 = vld1q_f32(r5 + 4); + float32x4_t _r51 = vextq_f32(_r50, _r54, 1); + float32x4_t _r52 = vextq_f32(_r50, _r54, 2); + float32x4_t _r53 = vextq_f32(_r50, _r54, 3); + + _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0); + _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1); + _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2); + _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3); + _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0); + + _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1); + _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2); + _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3); + _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0); + _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1); + + _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2); + _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3); + _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0); + _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1); + _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2); + + _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3); + _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0); + _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1); + _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2); + _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3); + + _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0); + _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1); + _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2); + _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3); + _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0); + + _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k0123, 0); + _sum2 = vfmaq_laneq_f32(_sum2, _r11, _k0123, 1); + _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k0123, 2); + _sum2 = vfmaq_laneq_f32(_sum2, _r13, _k0123, 3); + _sum2 = vfmaq_laneq_f32(_sum2, _r14, _k4567, 0); + + _sum2 = vfmaq_laneq_f32(_sum2, _r20, _k4567, 1); + _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k4567, 2); + _sum2 = vfmaq_laneq_f32(_sum2, _r22, _k4567, 3); + _sum2 = vfmaq_laneq_f32(_sum2, _r23, _k891011, 0); + _sum2 = vfmaq_laneq_f32(_sum2, _r24, _k891011, 1); + + _sum2 = vfmaq_laneq_f32(_sum2, _r30, _k891011, 2); + _sum2 = vfmaq_laneq_f32(_sum2, _r31, _k891011, 3); + _sum2 = vfmaq_laneq_f32(_sum2, _r32, _k12131415, 0); + _sum2 = vfmaq_laneq_f32(_sum2, _r33, _k12131415, 1); + _sum2 = vfmaq_laneq_f32(_sum2, _r34, _k12131415, 2); + + _sum2 = vfmaq_laneq_f32(_sum2, _r40, _k12131415, 3); + _sum2 = vfmaq_laneq_f32(_sum2, _r41, _k16171819, 0); + _sum2 = vfmaq_laneq_f32(_sum2, _r42, _k16171819, 1); + _sum2 = vfmaq_laneq_f32(_sum2, _r43, _k16171819, 2); + _sum2 = vfmaq_laneq_f32(_sum2, _r44, _k16171819, 3); + + _sum2 = vfmaq_laneq_f32(_sum2, _r50, _k20212223, 0); + _sum2 = vfmaq_laneq_f32(_sum2, _r51, _k20212223, 1); + _sum2 = vfmaq_laneq_f32(_sum2, _r52, _k20212223, 2); + _sum2 = vfmaq_laneq_f32(_sum2, _r53, _k20212223, 3); + _sum2 = vfmaq_laneq_f32(_sum2, _r54, _k24242424, 0); + + vst1q_f32(outptr, _sum); + vst1q_f32(outptr2, _sum2); + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + r5 += 4; + outptr += 4; + outptr2 += 4; + } + + for (; remain > 0; --remain) { + float sum = 0; + float sum2 = 0; + + float32x4_t _r1 = vld1q_f32(r1); + float32x4_t _k1 = vld1q_f32(k1); + float32x4_t _sum = vmulq_f32(_r1, _k1); + float32x4_t _sum2 = vmulq_f32(_r1, _k0123); + + float32x4_t _r2 = vld1q_f32(r2); + float32x4_t _k2 = vld1q_f32(k2); + _sum = vmlaq_f32(_sum, _r2, _k2); + _sum2 = vmlaq_f32(_sum2, _r2, _k1); + + float32x4_t _r3 = vld1q_f32(r3); + float32x4_t _k3 = vld1q_f32(k3); + _sum = vmlaq_f32(_sum, _r3, _k3); + _sum2 = vmlaq_f32(_sum2, _r3, _k2); + + float32x4_t _r4 = vld1q_f32(r4); + _sum = vmlaq_f32(_sum, _r4, _k20212223); + _sum2 = vmlaq_f32(_sum2, _r4, _k3); + + float32x4_t _r0 = vld1q_f32(r0); + _sum = vmlaq_f32(_sum, _r0, _k0123); + float32x4_t _r5 = vld1q_f32(r5); + _sum2 = vmlaq_f32(_sum2, _r5, _k20212223); + + float32x4_t _k_t4; + _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0); + _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1); + _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2); + _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3); + + float32x4_t _r_t4; + + _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0); + _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1); + _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2); + _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3); + _sum = vmlaq_f32(_sum, _r_t4, _k_t4); + + sum = r4[4] * k4[4]; + + _r_t4 = vextq_f32(_r_t4, _r_t4, 1); + _r_t4 = vsetq_lane_f32(r4[4], _r_t4, 3); + _sum2 = vmlaq_f32(_sum2, _r_t4, _k_t4); + + sum2 = r5[4] * k4[4]; + + float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum)); + float32x2_t + _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); + float32x2_t _ss_ss2 = vpadd_f32(_ss, _ss2); + + sum += vget_lane_f32(_ss_ss2, 0); + sum2 += vget_lane_f32(_ss_ss2, 1); + + *outptr += sum; + *outptr2 += sum2; + + ++r0; + ++r1; + ++r2; + ++r3; + ++r4; + ++r5; + ++outptr; + ++outptr2; + } + + r0 += 4 + input_width; // 4 = 5 - 1 + r1 += 4 + input_width; + r2 += 4 + input_width; + r3 += 4 + input_width; + r4 += 4 + input_width; + r5 += 4 + input_width; + outptr += width; + outptr2 += width; + } + + for (; h < height; ++h) { + // may left one row if odd rows + int width_blocks = width >> 2; + int remain = width - (width_blocks << 2); + for (; width_blocks > 0; --width_blocks) { + float32x4_t _sum = vld1q_f32(outptr); + + float32x4_t _r00 = vld1q_f32(r0); + float32x4_t _r04 = vld1q_f32(r0 + 4); + float32x4_t _r01 = vextq_f32(_r00, _r04, 1); + float32x4_t _r02 = vextq_f32(_r00, _r04, 2); + float32x4_t _r03 = vextq_f32(_r00, _r04, 3); + + float32x4_t _r10 = vld1q_f32(r1); + float32x4_t _r14 = vld1q_f32(r1 + 4); + float32x4_t _r11 = vextq_f32(_r10, _r14, 1); + float32x4_t _r12 = vextq_f32(_r10, _r14, 2); + float32x4_t _r13 = vextq_f32(_r10, _r14, 3); + + float32x4_t _r20 = vld1q_f32(r2); + float32x4_t _r24 = vld1q_f32(r2 + 4); + float32x4_t _r21 = vextq_f32(_r20, _r24, 1); + float32x4_t _r22 = vextq_f32(_r20, _r24, 2); + float32x4_t _r23 = vextq_f32(_r20, _r24, 3); + + float32x4_t _r30 = vld1q_f32(r3); + float32x4_t _r34 = vld1q_f32(r3 + 4); + float32x4_t _r31 = vextq_f32(_r30, _r34, 1); + float32x4_t _r32 = vextq_f32(_r30, _r34, 2); + float32x4_t _r33 = vextq_f32(_r30, _r34, 3); + + float32x4_t _r40 = vld1q_f32(r4); + float32x4_t _r44 = vld1q_f32(r4 + 4); + float32x4_t _r41 = vextq_f32(_r40, _r44, 1); + float32x4_t _r42 = vextq_f32(_r40, _r44, 2); + float32x4_t _r43 = vextq_f32(_r40, _r44, 3); + + _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0); + _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1); + _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2); + _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3); + _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0); + + _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1); + _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2); + _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3); + _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0); + _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1); + + _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2); + _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3); + _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0); + _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1); + _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2); + + _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3); + _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0); + _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1); + _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2); + _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3); + + _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0); + _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1); + _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2); + _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3); + _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0); + + vst1q_f32(outptr, _sum); + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + r5 += 4; + outptr += 4; + } + + for (; remain > 0; --remain) { + float sum = 0; + float32x4_t _r0 = vld1q_f32(r0); + float32x4_t _sum = vmulq_f32(_r0, _k0123); + + float debug[4]; + vst1q_f32(debug, _sum); + + float32x4_t _r1 = vld1q_f32(r1); + _sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1)); + + float32x4_t _r2 = vld1q_f32(r2); + _sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2)); + + float32x4_t _r3 = vld1q_f32(r3); + _sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3)); + + float32x4_t _r4 = vld1q_f32(r4); + _sum = vmlaq_f32(_sum, _r4, _k20212223); + + float32x4_t _k_t4; + _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0); + _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1); + _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2); + _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3); + + float32x4_t _r_t4; + + _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0); + _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1); + _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2); + _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3); + _sum = vmlaq_f32(_sum, _r_t4, _k_t4); + + sum = r4[4] * k4[4]; + + float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum)); + _ss = vpadd_f32(_ss, _ss); + + sum += vget_lane_f32(_ss, 0); + *outptr += sum; + + ++r0; + ++r1; + ++r2; + ++r3; + ++r4; + ++outptr; + } + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + } + } + } + } +} + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_ diff --git a/mace/kernels/test/addn_neon_test.cc b/mace/kernels/test/addn_neon_test.cc deleted file mode 100644 index 6aebb901..00000000 --- a/mace/kernels/test/addn_neon_test.cc +++ /dev/null @@ -1,42 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// -#include -#include -#include "gtest/gtest.h" -#include "mace/kernels/addn.h" - -using namespace mace; -using namespace mace::kernels; - -TEST(NeonTest, AddN) { - testing::internal::LogToStderr(); - std::random_device rd; - std::mt19937 gen(rd()); - std::normal_distribution nd(0, 1); - - int64_t count = 100000; - vector input1(count); - vector input2(count); - vector input3(count); - vector output(count); - vector output_neon(count); - - for (int64_t i = 0; i < count; ++i) { - input1[i] = nd(gen); - input2[i] = nd(gen); - input3[i] = nd(gen); - } - - vector inputs { input1.data(), input2.data(), input3.data() }; - - AddNFunctor addn_functor; - AddNFunctor neon_addn_functor; - addn_functor(inputs, &output[0], count); - neon_addn_functor(inputs, &output_neon[0], count); - - for (int64_t i = 0; i < count; ++i) { - ASSERT_FLOAT_EQ(output[i], output_neon[i]); - } -} - diff --git a/mace/kernels/test/relu_neon_test.cc b/mace/kernels/test/relu_neon_test.cc deleted file mode 100644 index d5200ff1..00000000 --- a/mace/kernels/test/relu_neon_test.cc +++ /dev/null @@ -1,37 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// -#include -#include -#include "gtest/gtest.h" -#include "mace/kernels/relu.h" - -using namespace mace; -using namespace mace::kernels; - -TEST(NeonTest, Relu) { - testing::internal::LogToStderr(); - std::random_device rd; - std::mt19937 gen(rd()); - std::normal_distribution nd(0, 1); - - int64_t count = 100000; - vector input(count); - vector output(count); - vector output_neon(count); - - for (int64_t i = 0; i < count; ++i) { - input[i] = nd(gen); - } - - ReluFunctor relu_functor; - ReluFunctor neon_relu_functor; - - relu_functor(&input[0], &output[0], count); - neon_relu_functor(&input[0], &output_neon[0], count); - - for (int64_t i = 0; i < count; ++i) { - ASSERT_FLOAT_EQ(output[i], output_neon[i]); - } -} - diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc new file mode 100644 index 00000000..8e3f1b29 --- /dev/null +++ b/mace/ops/addn_benchmark.cc @@ -0,0 +1,58 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +template +static void AddNBenchmark(int iters, int n, int size) { + + mace::testing::StopTiming(); + + OpsTestNet net; + OpDefBuilder op_def_builder("AddN", "AddNBM"); + for (int i = 0; i < n; ++i) { + op_def_builder.Input(internal::MakeString("Input", i).c_str()); + } + op_def_builder.Output("Output") + .Finalize(net.operator_def()); + + // Add input data + for (int i = 0; i < n; ++i) { + net.AddRandomInput(internal::MakeString("Input", i).c_str(), {size}); + } + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + + mace::testing::StartTiming(); + while(iters--) { + net.RunOp(D); + } +} + +#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE) \ + static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * SIZE; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \ + AddNBenchmark(iters, N, SIZE); \ + } \ + BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE) + +#define BM_ADDN(N, SIZE, TYPE) \ + BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \ + BM_ADDN_MACRO(N, SIZE, TYPE, NEON); + +BM_ADDN(10, 1000, float); +BM_ADDN(10, 10000, float); +BM_ADDN(100, 1000, float); +BM_ADDN(100, 10000, float); +} // namespace mace \ No newline at end of file diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc new file mode 100644 index 00000000..453458ff --- /dev/null +++ b/mace/ops/addn_test.cc @@ -0,0 +1,39 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/operator.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { + +class AddnOpTest : public OpsTestBase {}; + +TEST_F(AddnOpTest, AddnOp) { + // Construct graph + auto& net = test_net(); + OpDefBuilder("AddN", "AddNTest") + .Input("Input1") + .Input("Input2") + .Input("Input3") + .Output("Output") + .Finalize(net.operator_def()); + + // Add input data + net.AddRandomInput("Input1", {1, 2, 3, 4}); + net.AddRandomInput("Input2", {1, 2, 3, 4}); + net.AddRandomInput("Input3", {1, 2, 3, 4}); + + // Run + net.RunOp(); + + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // Check + net.RunOp(DeviceType::NEON); + + ExpectTensorNear(expected, *net.GetOutput("Output"), 0.01); +} + +} // namespace mace diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 7d74aac4..f4e07416 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -33,11 +33,11 @@ TEST_F(BatchNormOpTest, SimpleCPU) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 1, 6, 2}, + auto expected = CreateTensor({1, 1, 6, 2}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, 3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.01); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } TEST_F(BatchNormOpTest, SimpleNeon) { @@ -70,12 +70,12 @@ TEST_F(BatchNormOpTest, SimpleNeon) { net.RunOp(); // Check - Tensor expected = *net.GetOutput("Output"); + Tensor* expected = net.GetOutput("Output"); // Run NEON net.RunOp(DeviceType::NEON); - ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index cc6ec092..96843971 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -66,5 +66,9 @@ BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); +BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); +BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); +BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); +BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); } // namespace mace diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 40d6e3a6..4dbc5d34 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -43,9 +43,9 @@ TEST_F(Conv2dOpTest, Simple_VALID) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 1, 1, 1}, {18.1f}); + auto expected = CreateTensor({1, 1, 1, 1}, {18.1f}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } TEST_F(Conv2dOpTest, Simple_SAME) { @@ -81,12 +81,12 @@ TEST_F(Conv2dOpTest, Simple_SAME) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 1, 3, 3}, + auto expected = CreateTensor({1, 1, 3, 3}, { 8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } TEST_F(Conv2dOpTest, Combined) { @@ -127,7 +127,7 @@ TEST_F(Conv2dOpTest, Combined) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 2, 3, 3}, + auto expected = CreateTensor({1, 2, 3, 3}, { 8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f, @@ -136,7 +136,7 @@ TEST_F(Conv2dOpTest, Combined) { 4.2f, 6.2f, 4.2f}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } TEST_F(Conv2dOpTest, Conv1x1) { @@ -180,7 +180,7 @@ TEST_F(Conv2dOpTest, Conv1x1) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 2, 3, 10}, + auto expected = CreateTensor({1, 2, 3, 10}, {5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, @@ -188,11 +188,12 @@ TEST_F(Conv2dOpTest, Conv1x1) { 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } // TODO we need more tests TEST_F(Conv2dOpTest, ConvNxNS12) { + testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, Padding type) { @@ -205,13 +206,13 @@ TEST_F(Conv2dOpTest, ConvNxNS12) { index_t width = 7 + rand() % 100; index_t output_channels = 1 + rand() % 50; // Construct graph - auto& net = test_net(); + auto &net = test_net(); OpDefBuilder("Conv2d", "Conv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .Finalize(net.operator_def()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .Finalize(net.operator_def()); // Add args net.AddIntsArg("strides", {stride_h, stride_w}); @@ -227,20 +228,15 @@ TEST_F(Conv2dOpTest, ConvNxNS12) { net.RunOp(); // Check - // TODO(liyin) Copy the tensor - Tensor tmp = *net.GetOutput("Output"); Tensor expected; - expected.ResizeLike(tmp); - expected.Copy(tmp.data(), tmp.size()); + expected.Copy(*net.GetOutput("Output")); // Run NEON net.RunOp(DeviceType::NEON); - - ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-3); - + ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); }; - for (int kernel_size : {1, 3}) { + for (int kernel_size : {1, 3, 5}) { for (int stride : {1, 2}) { func(kernel_size, kernel_size, stride, stride, VALID); func(kernel_size, kernel_size, stride, stride, SAME); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 75e86e22..d2b9a2c1 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -79,6 +79,15 @@ class OpsTestNet { [&gen, &nd, positive] { return positive ? std::abs(nd(gen)) : nd(gen); }); } + template + void AddFixedInput(const char *name, const std::vector &shape, T value) { + Tensor *input = ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum::v()); + input->Resize(shape); + float *input_data = input->mutable_data(); + + std::fill(input_data, input_data + input->size(), value); + } + void AddIntArg(const char *name, const int value) { auto arg = op_def_.add_arg(); arg->set_name(name); @@ -169,10 +178,10 @@ class OpsTestBase : public ::testing::Test { }; template -Tensor CreateTensor(const std::vector &shape, const std::vector &data) { - Tensor res(cpu_allocator(), DataTypeToEnum::v()); - res.Resize(shape); - float *input_data = res.mutable_data(); +unique_ptr CreateTensor(const std::vector &shape, const std::vector &data) { + unique_ptr res(new Tensor(cpu_allocator(), DataTypeToEnum::v())); + res->Resize(shape); + T *input_data = res->mutable_data(); memcpy(input_data, data.data(), data.size() * sizeof(T)); return res; } diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 761c792e..9b7cc0cf 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -43,10 +43,10 @@ TEST_F(PoolingOpTest, MAX_VALID) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 2, 2, 2}, + auto expected = CreateTensor({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -80,10 +80,10 @@ TEST_F(PoolingOpTest, AVG_VALID) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 2, 2, 2}, + auto expected = CreateTensor({1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } TEST_F(PoolingOpTest, MAX_SAME) { @@ -111,10 +111,10 @@ TEST_F(PoolingOpTest, MAX_SAME) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 1, 2, 2}, + auto expected = CreateTensor({1, 1, 2, 2}, {4, 5, 7, 8}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } TEST_F(PoolingOpTest, MAX_VALID_DILATION) { @@ -143,8 +143,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 1, 2, 2}, + auto expected = CreateTensor({1, 1, 2, 2}, {10, 11, 14, 15}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } diff --git a/mace/ops/relu_benchmark.cc b/mace/ops/relu_benchmark.cc new file mode 100644 index 00000000..371c7eca --- /dev/null +++ b/mace/ops/relu_benchmark.cc @@ -0,0 +1,53 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +template +static void ReluBenchmark(int iters, int size) { + + mace::testing::StopTiming(); + + OpsTestNet net; + OpDefBuilder("Relu", "ReluBM") + .Input("Input") + .Output("Output") + .Finalize(net.operator_def()); + + // Add input data + net.AddRandomInput("Input", {size}); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + + mace::testing::StartTiming(); + while(iters--) { + net.RunOp(D); + } +} + +#define BM_RELU_MACRO(SIZE, TYPE, DEVICE) \ + static void BM_RELU_##SIZE##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * SIZE; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \ + ReluBenchmark(iters, SIZE); \ + } \ + BENCHMARK(BM_RELU_##SIZE##_##TYPE##_##DEVICE) + +#define BM_RELU(SIZE, TYPE) \ + BM_RELU_MACRO(SIZE, TYPE, CPU); \ + BM_RELU_MACRO(SIZE, TYPE, NEON); + +BM_RELU(1000, float); +BM_RELU(100000, float); +BM_RELU(10000000, float); +} // namespace mace \ No newline at end of file diff --git a/mace/ops/relu_test.cc b/mace/ops/relu_test.cc new file mode 100644 index 00000000..6ca8f6e3 --- /dev/null +++ b/mace/ops/relu_test.cc @@ -0,0 +1,35 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/operator.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { + +class ReluOpTest : public OpsTestBase {}; + +TEST_F(ReluOpTest, ReluOp) { + // Construct graph + auto& net = test_net(); + OpDefBuilder("Relu", "ReluTest") + .Input("Input") + .Output("Output") + .Finalize(net.operator_def()); + + // Add input data + net.AddRandomInput("Input", {1, 2, 3, 4}); + + // Run + net.RunOp(); + + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // Check + net.RunOp(DeviceType::NEON); + + ExpectTensorNear(expected, *net.GetOutput("Output"), 0.01); +} + +} // namespace mace diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index 32c91721..4887e136 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -30,9 +30,9 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWOAlignCorners) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 3, 1, 2}, {0, 2, 8, 10, 16, 18}); + auto expected = CreateTensor({1, 3, 1, 2}, {0, 2, 8, 10, 16, 18}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { @@ -57,7 +57,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { net.RunOp(); // Check - Tensor expected = CreateTensor({1, 3, 1, 2}, {0, 3, 8, 11, 16, 19}); + auto expected = CreateTensor({1, 3, 1, 2}, {0, 3, 8, 11, 16, 19}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -- GitLab