diff --git a/mace/examples/BUILD b/mace/examples/BUILD
index d55eb4f20360226b4b53ee479ad56e5c6dc68879..91a1969d72a0b4a050bb0a848458263cefec8f67 100644
--- a/mace/examples/BUILD
+++ b/mace/examples/BUILD
@@ -6,26 +6,26 @@ cc_binary(
     srcs = [
         "helloworld.cc",
     ],
+    copts = ["-std=c++11"],
+    linkopts = ["-fopenmp"] + if_android(["-ldl"]),
     deps = [
         "//mace/core",
         "//mace/ops",
         "@org_tensorflow//tensorflow/core:android_tensorflow_lib",
     ],
-    copts = ["-std=c++11"],
-    linkopts = ["-fopenmp",] + if_android(["-ldl"]),
 )
 
 cc_test(
     name = "benchmark_example",
+    testonly = 1,
     srcs = ["benchmark_example.cc"],
+    copts = ["-std=c++11"],
+    linkopts = ["-fopenmp"] + if_android(["-ldl"]),
+    linkstatic = 1,
     deps = [
         "//mace/core",
         "//mace/core:test_benchmark_main",
     ],
-    copts = ["-std=c++11"],
-    linkopts = ["-fopenmp",] + if_android(["-ldl"]),
-    linkstatic = 1,
-    testonly = 1,
 )
 
 cc_binary(
@@ -33,12 +33,12 @@ cc_binary(
     srcs = [
         "mace_run.cc",
     ],
+    copts = ["-std=c++11"],
+    linkopts = ["-fopenmp"] + if_android(["-ldl"]),
+    linkstatic = 1,
     deps = [
         "//mace/core",
-        "//mace/utils",
         "//mace/ops",
+        "//mace/utils:command_line_flags",
     ],
-    copts = ["-std=c++11",],
-    linkopts = ["-fopenmp",] + if_android(["-ldl"]),
-    linkstatic = 1,
 )
diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index 37d8936a33d4886204539331e15d40009a4ccc0c..92d687619d8553453d83614f38eda70e2915a48d 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -5,7 +5,6 @@ package(
     default_visibility = ["//visibility:public"],
 )
 
-
 licenses(["notice"])  # Apache 2.0
 
 load("//mace:mace.bzl", "if_android")
@@ -14,36 +13,40 @@ cc_library(
     name = "kernels",
     srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc"])),
     hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h"])),
-    deps = [
-        "//mace/core:core",
+    copts = [
+        "-std=c++11",
+        "-fopenmp",
     ],
-    copts = ['-std=c++11', "-fopenmp",],
     linkopts = if_android(["-lm"]),
+    deps = [
+        "//mace/core",
+        "//mace/utils:utils",
+    ],
 )
 
 cc_test(
     name = "kernel_test",
+    testonly = 1,
     srcs = glob(["test/*.cc"]),
+    copts = ["-std=c++11"],
+    linkopts = if_android(["-pie"]),
+    linkstatic = 1,
     deps = [
-        "@gtest//:gtest_main",
         ":kernels",
-        "//mace/core:core",
+        "//mace/core",
+        "@gtest//:gtest_main",
     ],
-    copts = ['-std=c++11'],
-    linkopts = if_android(["-pie"]),
-    linkstatic = 1,
-    testonly = 1,
 )
 
 cc_test(
     name = "benchmark",
+    testonly = 1,
     srcs = glob(["benchmark/*.cc"]),
+    copts = ["-std=c++11"],
+    linkstatic = 1,
     deps = [
         ":kernels",
-        "//mace/core:core",
+        "//mace/core",
         "//mace/core:test_benchmark_main",
     ],
-    copts = ['-std=c++11'],
-    linkstatic = 1,
-    testonly = 1,
 )
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index 84312a03d2e59d10fd76eec93f9e4cff2199696a..be50df0fc1172fb413d2954c2cee6e49efbe2d53 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -33,6 +33,8 @@ struct BatchNormFunctor {
     // new_offset = \offset - mean * common_val;
     // Y = new_scale * X + new_offset;
     T new_scale, new_offset;
+    // Per-channel temporaries: each thread must work on its own copy.
+#pragma omp parallel for private(new_scale, new_offset)
     for (index_t c = 0; c < channel; ++c) {
       new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon);
       new_offset = offset[c] - mean[c] * new_scale;
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index dcf652486c074d42cb9777b6970d857b92d75a64..0a2941669b8e17f92bc9d0929c0b67317969c06e 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -19,7 +19,7 @@ struct ConcatFunctor {
                   T *output) {
     const size_t input_count = input_list.size();
     for (int inner_idx = 0; inner_idx < inner_dim; ++inner_idx) {
-      for (int i = 0; i < input_count; ++i) {
+      for (size_t i = 0; i < input_count; ++i) {
         if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
           memcpy(output, input_list[i], outer_dims[i] * sizeof(T));
           output += outer_dims[i];
diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc
index 0b121a70dc9b3b99892e086b525830364c7040a8..cba69533648499e19abb99886e26f06110d2c187 100644
--- a/mace/kernels/neon/batch_norm_neon.cc
+++ 
b/mace/kernels/neon/batch_norm_neon.cc
@@ -31,6 +31,8 @@ void BatchNormFunctor::operator()(
   float new_scale, new_offset;
   index_t count = sample_size >> 2;
   index_t remain_count = sample_size - (count << 2);
+  // Per-channel temporaries: each thread must work on its own copy.
+#pragma omp parallel for private(new_scale, new_offset)
   for (index_t c = 0; c < channel; ++c) {
     new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon);
     new_offset = offset[c] - mean[c] * new_scale;
diff --git a/mace/kernels/neon/conv_2d_neon_1x1.cc b/mace/kernels/neon/conv_2d_neon_1x1.cc
index e3fd325b40a3b611a174569dc919a6e0ca736870..b4c2b1644a1ca2d3263308734b2f6791221a7403 100644
--- a/mace/kernels/neon/conv_2d_neon_1x1.cc
+++ b/mace/kernels/neon/conv_2d_neon_1x1.cc
@@ -4,11 +4,342 @@
 
 #include <arm_neon.h>
 #include "mace/core/common.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
+static constexpr index_t kInputChannelBlockSize = 2;
+static constexpr index_t kOutputChannelBlockSize = 4;
+// Loading four lanes at offset r (1 <= r <= 3) gives a mask whose last r lanes
+// are all ones; it selects the pixels left over after the 4-wide main loop.
+static __attribute__((__aligned__(64))) const int32_t mask_array[8] = {
+  0, 0, 0, 0, -1, -1, -1, -1
+};
+
+// Accumulates a 2-input-channel x 4-output-channel block of a 1x1 convolution
+// into output.
+//   input:  two consecutive planes of pixel_size floats.
+//   filter: four rows spaced input_channels apart, two weights read per row.
+//   output: four consecutive planes of pixel_size floats, updated in place.
+// The row remainder is handled with a masked 4-lane access starting at
+// pixel_size - 4, so pixel_size must be at least 4.
+static inline void NeonConv2x4Kernel(index_t input_channels,
+                                     index_t pixel_size,
+                                     const float *input,
+                                     const float *filter,
+                                     float *output) {
+  const float *input0 = input;
+  const float *input1 = input + pixel_size;
+
+  const float32x2_t vfilter0x = vld1_f32(filter);
+  filter += input_channels;
+  const float32x2_t vfilter1x = vld1_f32(filter);
+  filter += input_channels;
+  const float32x2_t vfilter2x = vld1_f32(filter);
+  filter += input_channels;
+  const float32x2_t vfilter3x = vld1_f32(filter);
+
+  float *output0 = output;
+  float *output1 = output0 + pixel_size;
+  float *output2 = output1 + pixel_size;
+  float *output3 = output2 + pixel_size;
+  while (pixel_size >= 4) {
+    float32x4_t voutput0 = vld1q_f32(output0);
+    float32x4_t voutput1 = vld1q_f32(output1);
+    float32x4_t voutput2 = vld1q_f32(output2);
+    float32x4_t voutput3 = vld1q_f32(output3);
+
+    const float32x4_t vinput0 = vld1q_f32(input0);
+    input0 += 4;
+    voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0);
+    voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0);
+    voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0);
+    voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0);
+
+    const float32x4_t vinput1 = vld1q_f32(input1);
+    input1 += 4;
+    voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1);
+    voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1);
+    voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1);
+    voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1);
+
+    vst1q_f32(output0, voutput0);
+    output0 += 4;
+    vst1q_f32(output1, voutput1);
+    output1 += 4;
+    vst1q_f32(output2, voutput2);
+    output2 += 4;
+    vst1q_f32(output3, voutput3);
+    output3 += 4;
+
+    pixel_size -= 4;
+  }
+  if (pixel_size != 0) {
+    const int32x4_t vmask = vld1q_s32(&mask_array[pixel_size]);
+
+    output0 = output0 + pixel_size - 4;
+    float32x4_t voutput0 = vld1q_f32(output0);
+    output1 = output1 + pixel_size - 4;
+    float32x4_t voutput1 = vld1q_f32(output1);
+    output2 = output2 + pixel_size - 4;
+    float32x4_t voutput2 = vld1q_f32(output2);
+    output3 = output3 + pixel_size - 4;
+    float32x4_t voutput3 = vld1q_f32(output3);
+
+    const float32x4_t vinput0 = vreinterpretq_f32_s32(
+        vandq_s32(vmask, vreinterpretq_s32_f32(vld1q_f32(&input0[pixel_size - 4]))));
+    voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0);
+    voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0);
+    voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0);
+    voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0);
+
+    const float32x4_t vinput1 = vreinterpretq_f32_s32(
+        vandq_s32(vmask, vreinterpretq_s32_f32(vld1q_f32(&input1[pixel_size - 4]))));
+    voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1);
+    voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1);
+    voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1);
+    voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1);
+
+    vst1q_f32(output0, voutput0);
+    vst1q_f32(output1, voutput1);
+    vst1q_f32(output2, voutput2);
+    vst1q_f32(output3, voutput3);
+  }
+}
+
+// Variant of NeonConv2x4Kernel for partial blocks: only the first
+// input_channels_subblock_size (1 or 2) input planes and the first
+// output_channels_subblock_size (1 to 4) output planes are loaded and stored.
+// pixel_size must be at least 4 here as well.
+static inline void NeonConv2x4SubBlockKernel(index_t input_channels_subblock_size,
+                                             index_t output_channels_subblock_size,
+                                             index_t input_channels,
+                                             index_t pixel_size,
+                                             const float *input,
+                                             const float *filter,
+                                             float *output) {
+  const float *input0 = input;
+  const float *input1 = input + pixel_size;
+
+  // Rows beyond the sub-block keep zero weights so the unconditional FMAs
+  // below never consume indeterminate registers.
+  float32x2_t vfilter1x = vdup_n_f32(0.0f);
+  float32x2_t vfilter2x = vfilter1x;
+  float32x2_t vfilter3x = vfilter1x;
+  float32x2_t vfilter0x = vld1_dup_f32(&filter[0]);
+  if (input_channels_subblock_size > 1) {
+    vfilter0x = vld1_lane_f32(&filter[1], vfilter0x, 1);
+  }
+  if (output_channels_subblock_size > 1) {
+    filter += input_channels;
+    vfilter1x = vld1_dup_f32(&filter[0]);
+    if (input_channels_subblock_size > 1) {
+      vfilter1x = vld1_lane_f32(&filter[1], vfilter1x, 1);
+    }
+    if (output_channels_subblock_size > 2) {
+      filter += input_channels;
+      vfilter2x = vld1_dup_f32(&filter[0]);
+      if (input_channels_subblock_size > 1) {
+        vfilter2x = vld1_lane_f32(&filter[1], vfilter2x, 1);
+      }
+      if (output_channels_subblock_size > 3) {
+        filter += input_channels;
+        vfilter3x = vld1_dup_f32(&filter[0]);
+        if (input_channels_subblock_size > 1) {
+          vfilter3x = vld1_lane_f32(&filter[1], vfilter3x, 1);
+        }
+      }
+    }
+  }
+
+  float *output0 = output;
+  float *output1 = output0 + pixel_size;
+  float *output2 = output1 + pixel_size;
+  float *output3 = output2 + pixel_size;
+  const float32x4_t vzero = vdupq_n_f32(0.0f);
+  while (pixel_size >= 4) {
+    float32x4_t voutput0, voutput1 = vzero, voutput2 = vzero, voutput3 = vzero;
+    voutput0 = vld1q_f32(output0);
+    if (output_channels_subblock_size > 1) {
+      voutput1 = vld1q_f32(output1);
+      if (output_channels_subblock_size > 2) {
+        voutput2 = vld1q_f32(output2);
+        if (output_channels_subblock_size > 3) {
+          voutput3 = vld1q_f32(output3);
+        }
+      }
+    }
+
+    const float32x4_t vinput0 = vld1q_f32(input0);
+    input0 += 4;
+    voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0);
+    voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0);
+    voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0);
+    voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0);
+
+    if (input_channels_subblock_size > 1) {
+      const float32x4_t vinput1 = vld1q_f32(input1);
+      input1 += 4;
+      voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1);
+      voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1);
+      voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1);
+      voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1);
+    }
+
+    vst1q_f32(output0, voutput0);
+    output0 += 4;
+    if (output_channels_subblock_size > 1) {
+      vst1q_f32(output1, voutput1);
+      output1 += 4;
+      if (output_channels_subblock_size > 2) {
+        vst1q_f32(output2, voutput2);
+        output2 += 4;
+        if (output_channels_subblock_size > 3) {
+          vst1q_f32(output3, voutput3);
+          output3 += 4;
+        }
+      }
+    }
+
+    pixel_size -= 4;
+  }
+  if (pixel_size != 0) {
+    const int32x4_t vmask = vld1q_s32(&mask_array[pixel_size]);
+
+    float32x4_t voutput0, voutput1 = vzero, voutput2 = vzero, voutput3 = vzero;
+    output0 += pixel_size - 4;
+    voutput0 = vld1q_f32(output0);
+    if (output_channels_subblock_size > 1) {
+      output1 += pixel_size - 4;
+      voutput1 = vld1q_f32(output1);
+      if (output_channels_subblock_size > 2) {
+        output2 += pixel_size - 4;
+        voutput2 = vld1q_f32(output2);
+        if (output_channels_subblock_size > 3) {
+          output3 += pixel_size - 4;
+          voutput3 = vld1q_f32(output3);
+        }
+      }
+    }
+
+    const float32x4_t vinput0 = vreinterpretq_f32_s32(
+        vandq_s32(vmask, vreinterpretq_s32_f32(vld1q_f32(&input0[pixel_size - 4]))));
+    voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0);
+    voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0);
+    voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0);
+    voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0);
+
+    if (input_channels_subblock_size > 1) {
+      const float32x4_t vinput1 = vreinterpretq_f32_s32(
+          vandq_s32(vmask, vreinterpretq_s32_f32(vld1q_f32(&input1[pixel_size - 4]))));
+      voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1);
+      voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1);
+      voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1);
+      voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1);
+    }
+
+    vst1q_f32(output0, voutput0);
+    if (output_channels_subblock_size > 1) {
+      vst1q_f32(output1, voutput1);
+      if (output_channels_subblock_size > 2) {
+        vst1q_f32(output2, voutput2);
+        if (output_channels_subblock_size > 3) {
+          vst1q_f32(output3, voutput3);
+        }
+      }
+    }
+  }
+}
+
+// 1x1 stride-1 convolution on NCHW float tensors. Every output plane is first
+// filled with its bias (zero when bias is null) and then accumulates the
+// weighted input planes, four output channels by two input channels at a time.
 void Conv2dNeonK1x1S1(const float *input,  // NCHW
+                      const index_t *input_shape,
+                      const float *filter,  // c_out, c_in, filter_h, filter_w
+                      const index_t *filter_shape,
+                      const float *bias,    // c_out
+                      float *output,        // NCHW
+                      const index_t *output_shape) {
+  const index_t batch = output_shape[0];
+  const index_t channels = output_shape[1];
+  const index_t height = output_shape[2];
+  const index_t width = output_shape[3];
+
+  const index_t input_batch = input_shape[0];
+  const index_t input_channels = input_shape[1];
+  const index_t input_height = input_shape[2];
+  const index_t input_width = input_shape[3];
+
+  MACE_CHECK(input_batch == batch && input_height == height &&
+             input_width == width);
+
+  const index_t total_pixels = height * width;
+  const index_t round_up_channels = RoundUp(channels, kOutputChannelBlockSize);
+
+#pragma omp parallel for collapse(2)
+  for (index_t n = 0; n < batch; ++n) {
+    for (index_t i = 0; i < channels; ++i) {
+      float *output_ptr_base = output + n * channels * total_pixels + i * total_pixels;
+      std::fill(output_ptr_base, output_ptr_base + total_pixels, bias ? bias[i] : 0);
+    }
+  }
+
+  if (total_pixels < 4) {
+    // The NEON kernels finish each row with a 4-lane access that starts at
+    // pixel_size - 4. With fewer than four pixels per plane that access would
+    // start before the plane (before the buffer for the first channel), so
+    // such tiny maps are accumulated with plain scalar code instead.
+#pragma omp parallel for collapse(2)
+    for (index_t n = 0; n < batch; ++n) {
+      for (index_t c = 0; c < channels; ++c) {
+        float *output_ptr = output + (n * channels + c) * total_pixels;
+        for (index_t ic = 0; ic < input_channels; ++ic) {
+          const float *input_ptr =
+              input + (n * input_channels + ic) * total_pixels;
+          const float weight = filter[c * input_channels + ic];
+          for (index_t p = 0; p < total_pixels; ++p) {
+            output_ptr[p] += input_ptr[p] * weight;
+          }
+        }
+      }
+    }
+    return;
+  }
+
+  // Each (image, 4-output-channel block) pair is independent, so both loops
+  // are collapsed and shared between the OpenMP threads.
+#pragma omp parallel for collapse(2)
+  for (index_t n = 0; n < batch; ++n) {
+    for (index_t c = 0; c < round_up_channels; c += kOutputChannelBlockSize) {
+      const float *input_ptr = input + n * input_channels * total_pixels;
+      const float *filter_ptr = filter + c * input_channels;
+      float *output_ptr = output + n * channels * total_pixels + c * total_pixels;
+      const index_t output_channel_block_size = std::min(channels - c, kOutputChannelBlockSize);
+      index_t remain_input_channels = input_channels;
+      if (c + kOutputChannelBlockSize <= channels) {
+        while (remain_input_channels >= kInputChannelBlockSize) {
+          NeonConv2x4Kernel(input_channels, total_pixels, input_ptr, filter_ptr, output_ptr);
+
+          input_ptr += kInputChannelBlockSize * total_pixels;
+          filter_ptr += kInputChannelBlockSize;
+          remain_input_channels -= kInputChannelBlockSize;
+        }
+      }
+      while (remain_input_channels != 0) {
+        const index_t input_channel_block_size = std::min(remain_input_channels, kInputChannelBlockSize);
+        NeonConv2x4SubBlockKernel(input_channel_block_size, output_channel_block_size,
+                                  input_channels, total_pixels, input_ptr, filter_ptr, output_ptr);
+        input_ptr += kInputChannelBlockSize * total_pixels;
+        filter_ptr += kInputChannelBlockSize;
+        remain_input_channels -= input_channel_block_size;
+      }
+    }
+  }
+}
+
+void Conv2dNeonPixelK1x1S1(const float *input,  // NCHW
                       const index_t *input_shape,
                       const float *filter,  // c_out, c_in, kernel_h, kernel_w
                       const index_t *filter_shape,
diff --git a/mace/ops/BUILD b/mace/ops/BUILD
index 30376fa01c1ccb9aeb175cd483e5599511f72855..83574b53373fd52226aef06ebd80f392131c6732 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -34,7 +34,10 @@ cc_library(
         ["*.h"],
         exclude = ["ops_test_util.h"],
     ),
-    copts = ["-std=c++11", "-fopenmp",],
+    copts = [
+        "-std=c++11",
+        "-fopenmp",
+    ],
     deps = [
         "//mace/core",
         "//mace/kernels",
@@ -50,7 +53,7 @@ cc_test(
         ["*_test.cc"],
     ),
     copts = ["-std=c++11"],
-    linkopts = ["-fopenmp",] + if_android(["-ldl"]),
+    linkopts = ["-fopenmp"] + if_android(["-ldl"]),
     linkstatic = 1,
     deps = [
         ":ops",
@@ -64,7 +67,7 @@ cc_test(
     testonly = 1,
     srcs = glob(["*_benchmark.cc"]),
     copts = ["-std=c++11"],
-    linkopts = ["-fopenmp",] + if_android(["-ldl"]),
+    linkopts = ["-fopenmp"] + if_android(["-ldl"]),
     linkstatic = 1,
     deps = [
         ":ops",
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index 079ad6f1a15c82b98487ec3850b21ee29accb19e..16763322c0418ef5cf4618ed0492402fdc08ec4b 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -57,14 +57,16 @@ static void BatchNorm(
   BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, NEON);
 
 BM_BATCH_NORM(1, 1, 512, 512, float);
-BM_BATCH_NORM(1, 1, 1024, 1024, float);
 BM_BATCH_NORM(1, 3, 128, 128, float);
 BM_BATCH_NORM(1, 3, 512, 512, float);
-BM_BATCH_NORM(1, 3, 1024, 1024, float);
+BM_BATCH_NORM(1, 32, 112, 112, float);
 BM_BATCH_NORM(1, 64, 256, 256, float);
 BM_BATCH_NORM(1, 64, 512, 512, float);
+BM_BATCH_NORM(1, 128, 56, 56, float);
 BM_BATCH_NORM(1, 128, 256, 256, float);
-BM_BATCH_NORM(1, 128, 512, 512, float);
+BM_BATCH_NORM(1, 256, 14, 14, float);
+BM_BATCH_NORM(1, 512, 14, 14, float);
+BM_BATCH_NORM(1, 1024, 7, 7, float);
 BM_BATCH_NORM(32, 1, 256, 256, float);
 BM_BATCH_NORM(32, 3, 256, 256, float);
 }  // namespace mace
\ No newline at end of file
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 7356666bb2160f6c6f617899ca22c3b95daa32d8..844fe32eb7e8147c3514de7b11b6e371503611ad 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -72,6 +72,11 @@ static void Conv2d(int iters,
 
 BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
+BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
+BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
+BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
+BM_CONV_2D(1, 256, 28, 28, 1, 1,
1, VALID, 256, float);
+BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
 BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
 BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
@@ -86,5 +91,4 @@ BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
 BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
-
 }  // namespace mace
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 4bc881398faf7c23340e0da0bd0f3c453d3de1f9..8aaf0d00872edd655835c0ca57af61c4671e86e5 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -165,18 +165,69 @@ TEST_F(Conv2dOpTest, Conv1x1) {
 }
 
 // TODO we need more tests
-TEST_F(Conv2dOpTest, ConvNxNS12) {
+TEST_F(Conv2dOpTest, IdleConvNxNS12) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
     srand(time(NULL));
 
     // generate random input
-    index_t batch = 1 + rand() % 10;
-    index_t input_channels = 1 + rand() % 10;
+    index_t batch = 3;
+    index_t input_channels = 64;
+    index_t height = 32;
+    index_t width = 32;
+    index_t output_channels = 128;
+    // Construct graph
+    auto& net = test_net();
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .Finalize(net.operator_def());
+
+    // Add args
+    net.AddIntsArg("strides", {stride_h, stride_w});
+    net.AddIntArg("padding", type);
+    net.AddIntsArg("dilations", {1, 1});
+
+    // Add input data
+    net.AddRandomInput("Input", {batch, input_channels, height, width});
+    net.AddRandomInput(
+        "Filter", {output_channels, input_channels, kernel_h, kernel_w});
+    net.AddRandomInput("Bias", {output_channels});
+    // run cpu
+    net.RunOp();
+
+    // Check
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    // Run NEON
+    net.RunOp(DeviceType::NEON);
+    ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001);
+  };
+
+  for (int kernel_size : {1}) {
+    for (int stride : {1}) {
+      func(kernel_size, kernel_size, stride, stride, VALID);
+      func(kernel_size, kernel_size, stride, stride, SAME);
+    }
+  }
+}
+
+TEST_F(Conv2dOpTest, DisgustConvNxNS12) {
+  testing::internal::LogToStderr();
+  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
+                  Padding type) {
+    srand(time(NULL));
+
+    // generate random input
+    index_t batch = 3 + rand() % 10;
+    index_t input_channels = 3 + rand() % 10;
     index_t height = 107;
     index_t width = 113;
-    index_t output_channels = 1 + rand() % 10;
+    index_t output_channels = 3 + rand() % 10;
     // Construct graph
     auto& net = test_net();
     OpDefBuilder("Conv2D", "Conv2dTest")
diff --git a/mace/utils/BUILD b/mace/utils/BUILD
index aa5f65d312ec6db4d856d66dd12da6adf682f6d1..06e2ccc490aef5f1a75920bd7cb0afeb9172f64c 100644
--- a/mace/utils/BUILD
+++ b/mace/utils/BUILD
@@ -8,15 +8,23 @@ package(
 licenses(["notice"])  # Apache 2.0
 
 cc_library(
-    name = "utils",
-    srcs = glob([
-        "*.cc",
-    ]),
-    hdrs = glob([
-        "*.h",
-    ]),
+    name = "command_line_flags",
+    srcs = [
+        "command_line_flags.cc",
+    ],
+    hdrs = [
+        "command_line_flags.h",
+    ],
     copts = ["-std=c++11"],
     deps = [
-        "//mace/core:core",
+        "//mace/core",
+    ],
+)
+
+cc_library(
+    name = "utils",
+    hdrs = [
+        "utils.h",
     ],
-)
\ No newline at end of file
+    copts = ["-std=c++11"],
+)
diff --git a/mace/utils/utils.h b/mace/utils/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8e07e101e2a0db58520b89481b775e5c4c1943e
--- /dev/null
+++ b/mace/utils/utils.h
@@ -0,0 +1,20 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_UTILS_UTILS_H_
+#define MACE_UTILS_UTILS_H_
+namespace mace {
+// Rounds i up to a multiple of factor; exact for i >= 0 and factor > 0.
+template <typename Integer>
+Integer RoundUp(Integer i, Integer factor) {
+  return (i + factor - 1) / factor * factor;
+}
+
+// Returns a / b rounded up; exact for a >= 0 and b > 0.
+template <typename Integer>
+Integer CeilQuotient(Integer a, Integer b) {
+  return (a + b - 1) / b;
+}
+}  // namespace mace
+#endif  // MACE_UTILS_UTILS_H_