From b649be5f2d4611163e0937b4448f8d17c71188b8 Mon Sep 17 00:00:00 2001
From: yejianwu
Date: Mon, 28 May 2018 16:23:11 +0800
Subject: [PATCH] add MACE_ prefix for macros

---
 mace/core/arg_helper.cc | 1 -
 mace/core/operator.h | 17 +--
 .../hexagon/hexagon_control_wrapper.cc | 20 +--
 mace/core/runtime/hexagon/hexagon_nn_ops.h | 4 +-
 mace/core/tensor.h | 12 +-
 mace/core/testing/test_benchmark.h | 4 +-
 mace/core/workspace.cc | 3 +-
 mace/kernels/arm/conv_2d_neon_5x5.cc | 8 +-
 mace/kernels/arm/conv_2d_neon_7x7.cc | 32 ++--
 mace/kernels/conv_2d.h | 2 +-
 mace/kernels/conv_pool_2d_util.cc | 10 +-
 mace/kernels/opencl/deconv_2d_opencl.cc | 5 +-
 .../kernels/opencl/out_of_range_check_test.cc | 2 +-
 mace/ops/activation.cc | 30 ++--
 mace/ops/activation_benchmark.cc | 140 +++++++++---------
 mace/ops/addn.cc | 30 ++--
 mace/ops/addn_benchmark.cc | 28 ++--
 mace/ops/batch_norm.cc | 30 ++--
 mace/ops/batch_norm.h | 4 +-
 mace/ops/batch_norm_benchmark.cc | 52 +++----
 mace/ops/batch_to_space.cc | 30 ++--
 mace/ops/batch_to_space.h | 4 +-
 mace/ops/batch_to_space_benchmark.cc | 35 ++---
 mace/ops/bias_add.cc | 30 ++--
 mace/ops/bias_add.h | 4 +-
 mace/ops/bias_add_benchmark.cc | 52 +++----
 mace/ops/buffer_to_image.cc | 20 +--
 mace/ops/buffer_to_image.h | 4 +-
 mace/ops/buffer_to_image_benchmark.cc | 44 +++---
 mace/ops/channel_shuffle.cc | 30 ++--
 mace/ops/channel_shuffle.h | 4 +-
 mace/ops/channel_shuffle_benchmark.cc | 35 ++---
 mace/ops/concat.cc | 30 ++--
 mace/ops/concat.h | 2 +-
 mace/ops/concat_benchmark.cc | 50 +++----
 mace/ops/conv_2d.cc | 30 ++--
 mace/ops/conv_2d.h | 4 +-
 mace/ops/conv_2d_benchmark.cc | 75 +++++-----
 mace/ops/deconv_2d.cc | 30 ++--
 mace/ops/deconv_2d.h | 4 +-
 mace/ops/deconv_2d_benchmark.cc | 44 +++---
 mace/ops/depth_to_space.cc | 30 ++--
 mace/ops/depth_to_space.h | 4 +-
 mace/ops/depth_to_space_benchmark.cc | 35 ++---
 mace/ops/depthwise_conv2d.cc | 30 ++--
 mace/ops/depthwise_conv2d.h | 4 +-
 mace/ops/depthwise_conv2d_benchmark.cc | 106 ++++++-------
 mace/ops/eltwise.cc | 30 ++--
 mace/ops/eltwise.h | 2 +-
 mace/ops/eltwise_benchmark.cc | 45 +++---
 mace/ops/folded_batch_norm.cc | 30 ++--
 mace/ops/folded_batch_norm.h | 4 +-
 mace/ops/fully_connected.cc | 30 ++--
 mace/ops/fully_connected.h | 4 +-
 mace/ops/fully_connected_benchmark.cc | 40 ++---
 mace/ops/image_to_buffer.cc | 20 +--
 mace/ops/image_to_buffer.h | 4 +-
 mace/ops/local_response_norm.cc | 10 +-
 mace/ops/local_response_norm.h | 4 +-
 mace/ops/local_response_norm_benchmark.cc | 40 ++---
 mace/ops/matmul.cc | 30 ++--
 mace/ops/matmul_benchmark.cc | 27 ++--
 mace/ops/pad.cc | 30 ++--
 mace/ops/pad_benchmark.cc | 34 ++---
 mace/ops/pooling.cc | 30 ++--
 mace/ops/pooling.h | 4 +-
 mace/ops/pooling_benchmark.cc | 40 ++---
 mace/ops/proposal.cc | 10 +-
 mace/ops/proposal.h | 4 +-
 mace/ops/psroi_align.cc | 10 +-
 mace/ops/psroi_align.h | 4 +-
 mace/ops/quantize.cc | 30 ++--
 mace/ops/quantize.h | 12 +-
 mace/ops/reshape.cc | 10 +-
 mace/ops/reshape.h | 4 +-
 mace/ops/resize_bilinear.cc | 30 ++--
 mace/ops/resize_bilinear_benchmark.cc | 49 +++---
 mace/ops/slice.cc | 30 ++--
 mace/ops/slice.h | 2 +-
 mace/ops/slice_benchmark.cc | 26 ++--
 mace/ops/softmax.cc | 30 ++--
 mace/ops/softmax.h | 4 +-
 mace/ops/softmax_benchmark.cc | 36 ++---
 mace/ops/space_to_batch.cc | 30 ++--
 mace/ops/space_to_batch.h | 4 +-
 mace/ops/space_to_batch_benchmark.cc | 38 ++---
 mace/ops/space_to_depth.cc | 30 ++--
 mace/ops/space_to_depth.h | 4 +-
 mace/ops/space_to_depth_benchmark.cc | 35 ++---
 mace/ops/transpose.cc | 10 +-
 mace/ops/transpose.h | 4 +-
 mace/ops/transpose_benchmark.cc | 58 ++++----
mace/ops/winograd_inverse_transform.cc | 20 +-- mace/ops/winograd_inverse_transform.h | 4 +- mace/ops/winograd_transform.cc | 20 +-- mace/ops/winograd_transform.h | 4 +- mace/ops/winograd_transform_benchmark.cc | 38 ++--- 97 files changed, 1134 insertions(+), 1118 deletions(-) diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc index 54a9efc9..60fb38f7 100644 --- a/mace/core/arg_helper.cc +++ b/mace/core/arg_helper.cc @@ -95,5 +95,4 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false) MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true) MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true) #undef MACE_GET_REPEATED_ARGUMENT_FUNC - } // namespace mace diff --git a/mace/core/operator.h b/mace/core/operator.h index 06a20d88..3a2285d9 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -124,20 +124,19 @@ class Operator : public OperatorBase { ~Operator() noexcept override {} }; -// OP_INPUT_TAGS and OP_OUTPUT_TAGS are optional features to name the indices of -// the -// operator's inputs and outputs, in order to avoid confusion. For example, for -// a fully convolution layer that has input, weight and bias, you can define its -// input tags as: -// OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); +// MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the +// indices of the operator's inputs and outputs, in order to avoid confusion. +// For example, for a fully convolution layer that has input, weight and bias, +// you can define its input tags as: +// MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); // And in the code, instead of doing // auto& weight = Input(1); // you can now do // auto& weight = Input(WEIGHT); // to make it more clear. -#define OP_INPUT_TAGS(first_input, ...) \ +#define MACE_OP_INPUT_TAGS(first_input, ...) \ enum _InputTags { first_input = 0, __VA_ARGS__ } -#define OP_OUTPUT_TAGS(first_input, ...) \ +#define MACE_OP_OUTPUT_TAGS(first_input, ...) \ enum _OutputTags { first_input = 0, __VA_ARGS__ } class OpKeyBuilder { @@ -186,7 +185,7 @@ MACE_DECLARE_REGISTRY(OpRegistry, const OperatorDef &, Workspace *); -#define REGISTER_OPERATOR(op_registry, name, ...) \ +#define MACE_REGISTER_OPERATOR(op_registry, name, ...) 
\ MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__) } // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc b/mace/core/runtime/hexagon/hexagon_control_wrapper.cc index 7293a0aa..54439695 100644 --- a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc +++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.cc @@ -32,7 +32,7 @@ inline int64_t NowMicros() { namespace mace { -#define MAX_NODE 2048 +#define MACE_MAX_NODE 2048 enum { NN_GRAPH_PERFEVENT_CYCLES = 0, @@ -229,13 +229,13 @@ bool HexagonControlWrapper::TeardownGraph() { return hexagon_nn_teardown(nn_id_) == 0; } -#define PRINT_BUFSIZE (2 * 1024 * 1024) +#define MACE_PRINT_BUFSIZE (2 * 1024 * 1024) void HexagonControlWrapper::PrintLog() { char *buf; - if ((buf = new char[PRINT_BUFSIZE]) == NULL) return; + if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return; MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast(buf), - PRINT_BUFSIZE) == 0, + MACE_PRINT_BUFSIZE) == 0, "print log error"); LOG(INFO) << std::string(buf); delete[] buf; @@ -244,9 +244,9 @@ void HexagonControlWrapper::PrintLog() { void HexagonControlWrapper::PrintGraph() { LOG(INFO) << "Print Graph"; char *buf; - if ((buf = new char[PRINT_BUFSIZE]) == NULL) return; + if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return; MACE_CHECK(hexagon_nn_snpprint(nn_id_, reinterpret_cast(buf), - PRINT_BUFSIZE) == 0, + MACE_PRINT_BUFSIZE) == 0, "print graph error"); LOG(INFO) << std::string(buf); delete[] buf; @@ -265,9 +265,9 @@ void HexagonControlWrapper::SetGraphMode(int mode) { void HexagonControlWrapper::GetPerfInfo() { LOG(INFO) << "Get perf info"; - std::vector perf_info(MAX_NODE); + std::vector perf_info(MACE_MAX_NODE); unsigned int n_items = 0; - MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE, + MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MACE_MAX_NODE, &n_items) == 0, "get perf info error"); @@ -284,8 +284,8 @@ void HexagonControlWrapper::GetPerfInfo() { perf_info[i].counter_lo) * 1.0f / perf_info[i].executions; - char node_type_buf[MAX_NODE]; - hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE); + char node_type_buf[MACE_MAX_NODE]; + hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MACE_MAX_NODE); std::string node_type(node_type_buf); LOG(INFO) << "node id: " << perf_info[i].node_id << ", node type: " << node_type diff --git a/mace/core/runtime/hexagon/hexagon_nn_ops.h b/mace/core/runtime/hexagon/hexagon_nn_ops.h index deaba665..7f800550 100644 --- a/mace/core/runtime/hexagon/hexagon_nn_ops.h +++ b/mace/core/runtime/hexagon/hexagon_nn_ops.h @@ -22,7 +22,7 @@ namespace mace { -#define OP_INVALID -1 +#define MACE_OP_INVALID -1 typedef enum op_type_enum { #define DEF_OP(NAME, ...) 
OP_##NAME, @@ -48,7 +48,7 @@ class OpMap { return op_map_[op_type]; } else { LOG(ERROR) << "DSP unsupoorted op type: " << op_type; - return OP_INVALID; + return MACE_OP_INVALID; } } diff --git a/mace/core/tensor.h b/mace/core/tensor.h index f2f9763a..0090e958 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -30,9 +30,9 @@ #ifdef MACE_ENABLE_NEON // Avoid over-bound accessing memory -#define EXTRA_BUFFER_PAD_SIZE 64 +#define MACE_EXTRA_BUFFER_PAD_SIZE 64 #else -#define EXTRA_BUFFER_PAD_SIZE 0 +#define MACE_EXTRA_BUFFER_PAD_SIZE 0 #endif namespace mace { @@ -210,16 +210,16 @@ class Tensor { image_shape_.clear(); if (buffer_ != nullptr) { MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage."); - if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) { + if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) { LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to " - << raw_size() + EXTRA_BUFFER_PAD_SIZE; - return buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE); + << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE; + return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE); } return MaceStatus::MACE_SUCCESS; } else { MACE_CHECK(is_buffer_owner_); buffer_ = new Buffer(allocator_); - return buffer_->Allocate(raw_size() + EXTRA_BUFFER_PAD_SIZE); + return buffer_->Allocate(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE); } } diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h index 54edb681..96b883a4 100644 --- a/mace/core/testing/test_benchmark.h +++ b/mace/core/testing/test_benchmark.h @@ -21,8 +21,8 @@ #include #define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c -#define BENCHMARK(n) \ - static ::mace::testing::Benchmark *MACE_BENCHMARK_CONCAT( \ +#define MACE_BENCHMARK(n) \ + static ::mace::testing::Benchmark *MACE_BENCHMARK_CONCAT( \ __benchmark_, n, __LINE__) = (new ::mace::testing::Benchmark(#n, (n))) namespace mace { diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 02354d50..3d2566e6 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -169,7 +169,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, std::unique_ptr tensor_buf( new Buffer(GetDeviceAllocator(device_type))); MaceStatus status = tensor_buf->Allocate( - mem_block.x() * GetEnumTypeSize(dtype) + EXTRA_BUFFER_PAD_SIZE); + mem_block.x() * GetEnumTypeSize(dtype) + + MACE_EXTRA_BUFFER_PAD_SIZE); if (status != MaceStatus::MACE_SUCCESS) { return status; } diff --git a/mace/kernels/arm/conv_2d_neon_5x5.cc b/mace/kernels/arm/conv_2d_neon_5x5.cc index 61672bd4..7a32a291 100644 --- a/mace/kernels/arm/conv_2d_neon_5x5.cc +++ b/mace/kernels/arm/conv_2d_neon_5x5.cc @@ -21,7 +21,7 @@ namespace mace { namespace kernels { -#define Conv2dNeonK5x5SnLoadCalc4 \ +#define MACE_Conv2dNeonK5x5SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf10, vf20, vf30; \ float32x2_t vf01, vf11, vf21, vf31; \ @@ -62,7 +62,7 @@ namespace kernels { vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \ vo3 = vmlaq_lane_f32(vo3, vi4, vf31, 1); -#define Conv2dNeonK5x5SnLoadCalc1 \ +#define MACE_Conv2dNeonK5x5SnLoadCalc1 \ /* load filter (1 outch x 1 height x 4 width) */ \ float32x4_t vf00; \ float32x2_t vf01; \ @@ -138,7 +138,7 @@ void Conv2dNeonK5x5S1(const float *input, vi2 = vextq_f32(vi0, vi4, 2); vi3 = vextq_f32(vi0, vi4, 3); - Conv2dNeonK5x5SnLoadCalc4; + MACE_Conv2dNeonK5x5SnLoadCalc4; in_offset += in_width; filter_ptr0 += 5; @@ -194,7 +194,7 @@ void Conv2dNeonK5x5S1(const float *input, vi2 = 
vextq_f32(vi0, vi4, 2); vi3 = vextq_f32(vi0, vi4, 3); - Conv2dNeonK5x5SnLoadCalc1; + MACE_Conv2dNeonK5x5SnLoadCalc1; in_offset += in_width; filter_ptr0 += 5; diff --git a/mace/kernels/arm/conv_2d_neon_7x7.cc b/mace/kernels/arm/conv_2d_neon_7x7.cc index b6c2d5fd..8488127b 100644 --- a/mace/kernels/arm/conv_2d_neon_7x7.cc +++ b/mace/kernels/arm/conv_2d_neon_7x7.cc @@ -21,7 +21,7 @@ namespace mace { namespace kernels { -#define Conv2dArmv8NeonK7x7SnLoadCalc4 \ +#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ float32x4_t vf10, vf11; \ @@ -72,7 +72,7 @@ namespace kernels { vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); \ vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); -#define Conv2dArmv8NeonK7x7SnLoadCalc1 \ +#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc1 \ /* load filter (1 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ vf00 = vld1q_f32(filter_ptr0); \ @@ -87,7 +87,7 @@ namespace kernels { vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \ vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); -#define Conv2dArmv7NeonK7x7SnLoadCalc4 \ +#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ float32x4_t vf10, vf11; \ @@ -138,7 +138,7 @@ namespace kernels { vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); \ vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#define Conv2dArmv7NeonK7x7SnLoadCalc1 \ +#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc1 \ /* load filter (1 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ vf00 = vld1q_f32(filter_ptr0); \ @@ -220,9 +220,9 @@ void Conv2dNeonK7x7S1(const float *input, vi6 = vextq_f32(vi4, vi8, 2); #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - Conv2dArmv7NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; @@ -284,9 +284,9 @@ void Conv2dNeonK7x7S1(const float *input, vi6 = vextq_f32(vi4, vi8, 2); #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; #else - Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif in_offset += in_width; @@ -381,9 +381,9 @@ void Conv2dNeonK7x7S2(const float *input, vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - Conv2dArmv7NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; @@ -450,9 +450,9 @@ void Conv2dNeonK7x7S2(const float *input, vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; #else - Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif in_offset += in_width; @@ -547,9 +547,9 @@ void Conv2dNeonK7x7S3(const float *input, vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - Conv2dArmv7NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; @@ -616,9 +616,9 @@ void Conv2dNeonK7x7S3(const float *input, vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; #else - Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif in_offset += in_width; diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h 
index 85db043c..99973a45 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -465,7 +465,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { padded_input_size = batch * input_channels * (input_height + pad_top + pad_bottom) * (input_width + pad_left + pad_right) * sizeof(float) + - EXTRA_BUFFER_PAD_SIZE; + MACE_EXTRA_BUFFER_PAD_SIZE; total_scratch_size += padded_input_size; } if (extra_output_height != height || extra_output_width != width) { diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc index b7a63a91..0d385401 100644 --- a/mace/kernels/conv_pool_2d_util.cc +++ b/mace/kernels/conv_pool_2d_util.cc @@ -314,7 +314,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor, // Skip the padded top rows if (padding_same_value) { -#define COPY_INPUT \ +#define MACE_COPY_INPUT \ std::fill(output_data, output_data + padded_left, input[0]); \ output_data += padded_left; \ memcpy(output_data, input, width * sizeof(float)); \ @@ -328,20 +328,20 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor, for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { for (int k = 0; k < padded_top; ++k) { - COPY_INPUT; + MACE_COPY_INPUT; } for (int k = 0; k < height; ++k) { - COPY_INPUT; + MACE_COPY_INPUT; input += width; } input -= width; for (int k = 0; k < padded_bottom; ++k) { - COPY_INPUT; + MACE_COPY_INPUT; } input += width; } } -#undef COPY_INPUT +#undef MACE_COPY_INPUT } else { output_data += padded_top * output_width; for (int i = 0; i < batch; ++i) { diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc index 946b77af..f45c2824 100644 --- a/mace/kernels/opencl/deconv_2d_opencl.cc +++ b/mace/kernels/opencl/deconv_2d_opencl.cc @@ -43,9 +43,10 @@ void Deconv2dOpencl(cl::Kernel *kernel, const index_t channel_blocks = RoundUpDiv4(channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels); MACE_CHECK(stride > 0, "stride should > 0."); -#define WIDTH_BLK 5 +#define MACE_WIDTH_BLK 5 const index_t n_strides = (width + stride - 1) / stride; - const index_t width_blocks = ((n_strides + WIDTH_BLK -1)/ WIDTH_BLK) * stride; + const index_t width_blocks = + ((n_strides + MACE_WIDTH_BLK -1)/ MACE_WIDTH_BLK) * stride; const float stride_r = 1.f / static_cast(stride); const int padding_h = (paddings[0]+1) >> 1; const int padding_w = (paddings[0]+1) >> 1; diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index 467a5953..1a6bbe6e 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -125,7 +125,7 @@ bool BufferToImageOpImpl(Tensor *buffer, class OutOfRangeCheckTest : public ::testing::Test { protected: virtual void SetUp() { - setenv("MACE_OUT_OF_RANGE_CHECK", "1", 1); + setenv("OUT_OF_RANGE_CHECK", "1", 1); } }; diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 0c90bc9e..37fd8117 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Activation(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ActivationOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ActivationOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - 
.Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ActivationOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ActivationOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ActivationOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ActivationOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc index e4ff005a..341b5f71 100644 --- a/mace/ops/activation_benchmark.cc +++ b/mace/ops/activation_benchmark.cc @@ -71,25 +71,26 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) { } } // namespace -#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ +#define MACE_BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ReluBenchmark(iters, N, C, H, W); \ } \ - BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_RELU(N, C, H, W) \ - BM_RELU_MACRO(N, C, H, W, float, CPU); \ - BM_RELU_MACRO(N, C, H, W, float, GPU); \ - BM_RELU_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_RELU(N, C, H, W) \ + MACE_BM_RELU_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_RELU_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_RELU_MACRO(N, C, H, W, half, GPU); -BM_RELU(1, 1, 512, 512); -BM_RELU(1, 3, 128, 128); -BM_RELU(1, 3, 512, 512); -BM_RELU(1, 32, 112, 112); -BM_RELU(1, 64, 256, 256); +MACE_BM_RELU(1, 1, 512, 512); +MACE_BM_RELU(1, 3, 128, 128); +MACE_BM_RELU(1, 3, 512, 512); +MACE_BM_RELU(1, 32, 112, 112); +MACE_BM_RELU(1, 64, 256, 256); namespace { template @@ -138,25 +139,26 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) { } } // namespace -#define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ +#define MACE_BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ReluxBenchmark(iters, N, C, H, W); \ } \ - BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_RELUX(N, C, H, W) \ - BM_RELUX_MACRO(N, C, H, W, float, CPU); \ - BM_RELUX_MACRO(N, C, H, W, float, GPU); \ - BM_RELUX_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_RELUX(N, C, H, W) \ + MACE_BM_RELUX_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_RELUX_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_RELUX_MACRO(N, C, H, W, half, GPU); -BM_RELUX(1, 1, 512, 512); -BM_RELUX(1, 3, 128, 128); -BM_RELUX(1, 3, 512, 512); -BM_RELUX(1, 32, 112, 112); -BM_RELUX(1, 64, 256, 256); +MACE_BM_RELUX(1, 1, 512, 512); +MACE_BM_RELUX(1, 3, 128, 128); +MACE_BM_RELUX(1, 3, 512, 512); +MACE_BM_RELUX(1, 32, 112, 112); +MACE_BM_RELUX(1, 64, 256, 256); namespace { template @@ -212,25 +214,26 @@ void PreluBenchmark(int iters, int batch, int channels, 
int height, int width) { } } // namespace -#define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ +#define MACE_BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ PreluBenchmark(iters, N, C, H, W); \ } \ - BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_PRELU(N, C, H, W) \ - BM_PRELU_MACRO(N, C, H, W, float, CPU); \ - BM_PRELU_MACRO(N, C, H, W, float, GPU); \ - BM_PRELU_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_PRELU(N, C, H, W) \ + MACE_BM_PRELU_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_PRELU_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_PRELU_MACRO(N, C, H, W, half, GPU); -BM_PRELU(1, 1, 512, 512); -BM_PRELU(1, 3, 128, 128); -BM_PRELU(1, 3, 512, 512); -BM_PRELU(1, 32, 112, 112); -BM_PRELU(1, 64, 256, 256); +MACE_BM_PRELU(1, 1, 512, 512); +MACE_BM_PRELU(1, 3, 128, 128); +MACE_BM_PRELU(1, 3, 512, 512); +MACE_BM_PRELU(1, 32, 112, 112); +MACE_BM_PRELU(1, 64, 256, 256); namespace { template @@ -277,25 +280,26 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) { } } // namespace -#define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ +#define MACE_BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ TanhBenchmark(iters, N, C, H, W); \ } \ - BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_TANH(N, C, H, W) \ - BM_TANH_MACRO(N, C, H, W, float, CPU); \ - BM_TANH_MACRO(N, C, H, W, float, GPU); \ - BM_TANH_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_TANH(N, C, H, W) \ + MACE_BM_TANH_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_TANH_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_TANH_MACRO(N, C, H, W, half, GPU); -BM_TANH(1, 1, 512, 512); -BM_TANH(1, 3, 128, 128); -BM_TANH(1, 3, 512, 512); -BM_TANH(1, 32, 112, 112); -BM_TANH(1, 64, 256, 256); +MACE_BM_TANH(1, 1, 512, 512); +MACE_BM_TANH(1, 3, 128, 128); +MACE_BM_TANH(1, 3, 512, 512); +MACE_BM_TANH(1, 32, 112, 112); +MACE_BM_TANH(1, 64, 256, 256); namespace { template @@ -343,26 +347,26 @@ void SigmoidBenchmark( } } // namespace -#define BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - SigmoidBenchmark(iters, N, C, H, W); \ - } \ - BENCHMARK(BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) - -#define BM_SIGMOID(N, C, H, W) \ - BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \ - BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \ - BM_SIGMOID_MACRO(N, C, H, W, half, GPU); - -BM_SIGMOID(1, 1, 512, 512); -BM_SIGMOID(1, 3, 128, 128); -BM_SIGMOID(1, 3, 512, 512); -BM_SIGMOID(1, 32, 112, 112); -BM_SIGMOID(1, 64, 256, 256); +#define MACE_BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void 
MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + SigmoidBenchmark(iters, N, C, H, W); \ + } \ + MACE_BENCHMARK(MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define MACE_BM_SIGMOID(N, C, H, W) \ + MACE_BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_SIGMOID_MACRO(N, C, H, W, half, GPU); + +MACE_BM_SIGMOID(1, 1, 512, 512); +MACE_BM_SIGMOID(1, 3, 128, 128); +MACE_BM_SIGMOID(1, 3, 512, 512); +MACE_BM_SIGMOID(1, 32, 112, 112); +MACE_BM_SIGMOID(1, 64, 256, 256); } // namespace test } // namespace ops diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 1ad27c2f..6bfc4c09 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_AddN(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - AddNOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + AddNOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - AddNOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + AddNOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - AddNOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + AddNOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index 36c9948e..1b72c791 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -70,26 +70,28 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { } } // namespace -#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \ - static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ - int iters) { \ +#define MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \ + static void \ + MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * INPUTS * N * H * W * C; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ AddNBenchmark(iters, INPUTS, N, H, W, C); \ } \ - BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) -#define BM_ADDN(INPUTS, N, H, W, C) \ - BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \ - BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \ - BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU); +#define MACE_BM_ADDN(INPUTS, N, H, W, C) \ + MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \ + MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \ + MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU); -BM_ADDN(2, 1, 256, 256, 32); -BM_ADDN(2, 1, 128, 128, 32); -BM_ADDN(4, 1, 128, 128, 3); -BM_ADDN(2, 1, 256, 256, 3); -BM_ADDN(2, 1, 512, 512, 3); +MACE_BM_ADDN(2, 1, 256, 256, 32); +MACE_BM_ADDN(2, 1, 128, 128, 32); +MACE_BM_ADDN(4, 1, 128, 128, 3); +MACE_BM_ADDN(2, 1, 256, 256, 3); +MACE_BM_ADDN(2, 1, 512, 512, 3); } // namespace test } // namespace ops diff 
--git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index c9e6db59..fe635592 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_BatchNorm(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + BatchNormOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BatchNormOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BatchNormOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h index 966b039f..5963ee4b 100644 --- a/mace/ops/batch_norm.h +++ b/mace/ops/batch_norm.h @@ -60,8 +60,8 @@ class BatchNormOp : public Operator { kernels::BatchNormFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index f0796606..648ddfca 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -95,34 +95,34 @@ void BatchNorm( } } // namespace -#define BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BatchNorm(iters, N, C, H, W); \ - } \ - BENCHMARK(BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) +#define MACE_BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BatchNorm(iters, N, C, H, W); \ + } \ + MACE_BENCHMARK(MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_BATCH_NORM(N, C, H, W) \ - BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \ - BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \ - BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_BATCH_NORM(N, C, H, W) \ + MACE_BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU); -BM_BATCH_NORM(1, 1, 512, 512); -BM_BATCH_NORM(1, 3, 128, 128); -BM_BATCH_NORM(1, 3, 512, 512); -BM_BATCH_NORM(1, 32, 112, 112); -BM_BATCH_NORM(1, 64, 256, 256); -BM_BATCH_NORM(1, 64, 512, 512); -BM_BATCH_NORM(1, 128, 56, 56); -BM_BATCH_NORM(1, 128, 256, 256); -BM_BATCH_NORM(1, 256, 14, 14); -BM_BATCH_NORM(1, 512, 14, 14); -BM_BATCH_NORM(1, 1024, 7, 7); -BM_BATCH_NORM(32, 1, 256, 256); -BM_BATCH_NORM(32, 3, 256, 256); +MACE_BM_BATCH_NORM(1, 1, 512, 512); +MACE_BM_BATCH_NORM(1, 3, 128, 128); 
+MACE_BM_BATCH_NORM(1, 3, 512, 512); +MACE_BM_BATCH_NORM(1, 32, 112, 112); +MACE_BM_BATCH_NORM(1, 64, 256, 256); +MACE_BM_BATCH_NORM(1, 64, 512, 512); +MACE_BM_BATCH_NORM(1, 128, 56, 56); +MACE_BM_BATCH_NORM(1, 128, 256, 256); +MACE_BM_BATCH_NORM(1, 256, 14, 14); +MACE_BM_BATCH_NORM(1, 512, 14, 14); +MACE_BM_BATCH_NORM(1, 1024, 7, 7); +MACE_BM_BATCH_NORM(32, 1, 256, 256); +MACE_BM_BATCH_NORM(32, 3, 256, 256); } // namespace test } // namespace ops diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index e5c137e0..50bc84ed 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -18,22 +18,22 @@ namespace mace { namespace ops { void Register_BatchToSpaceND(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + BatchToSpaceNDOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BatchToSpaceNDOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BatchToSpaceNDOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h index b95d0c33..4f688dc4 100644 --- a/mace/ops/batch_to_space.h +++ b/mace/ops/batch_to_space.h @@ -44,8 +44,8 @@ class BatchToSpaceNDOp : public Operator { kernels::SpaceToBatchFunctor functor_; protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc index fdb73317..c6b3e25a 100644 --- a/mace/ops/batch_to_space_benchmark.cc +++ b/mace/ops/batch_to_space_benchmark.cc @@ -64,25 +64,26 @@ void BMBatchToSpace( } } // namespace -#define BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE) \ - static void \ - BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BMBatchToSpace(iters, N, C, H, W, ARG); \ - } \ - BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE) +#define MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE) \ + static void \ + MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMBatchToSpace(iters, N, C, H, W, ARG); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE) -#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \ - BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \ - BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU); +#define MACE_BM_BATCH_TO_SPACE(N, H, W, C, ARG) \ + MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, 
ARG, float, GPU); \ + MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU); -BM_BATCH_TO_SPACE(128, 8, 8, 128, 2); -BM_BATCH_TO_SPACE(4, 128, 128, 32, 2); -BM_BATCH_TO_SPACE(16, 64, 64, 32, 4); -BM_BATCH_TO_SPACE(64, 32, 32, 32, 8); +MACE_BM_BATCH_TO_SPACE(128, 8, 8, 128, 2); +MACE_BM_BATCH_TO_SPACE(4, 128, 128, 32, 2); +MACE_BM_BATCH_TO_SPACE(16, 64, 64, 32, 4); +MACE_BM_BATCH_TO_SPACE(64, 32, 32, 32, 8); } // namespace test } // namespace ops diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index dd4e20f4..deb67368 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_BiasAdd(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + BiasAddOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BiasAddOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BiasAddOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/bias_add.h b/mace/ops/bias_add.h index f69f18e6..1f242253 100644 --- a/mace/ops/bias_add.h +++ b/mace/ops/bias_add.h @@ -46,8 +46,8 @@ class BiasAddOp : public Operator { kernels::BiasAddFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, BIAS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 851c8a17..c0f6ad72 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -72,34 +72,34 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { } } // namespace -#define BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BiasAdd(iters, N, C, H, W); \ - } \ - BENCHMARK(BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) +#define MACE_BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BiasAdd(iters, N, C, H, W); \ + } \ + MACE_BENCHMARK(MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_BIAS_ADD(N, C, H, W) \ - BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \ - BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \ - BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_BIAS_ADD(N, C, H, W) \ + MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU); -BM_BIAS_ADD(1, 1, 512, 512); -BM_BIAS_ADD(1, 3, 128, 128); -BM_BIAS_ADD(1, 3, 512, 512); -BM_BIAS_ADD(1, 32, 
112, 112); -BM_BIAS_ADD(1, 64, 256, 256); -BM_BIAS_ADD(1, 64, 512, 512); -BM_BIAS_ADD(1, 128, 56, 56); -BM_BIAS_ADD(1, 128, 256, 256); -BM_BIAS_ADD(1, 256, 14, 14); -BM_BIAS_ADD(1, 512, 14, 14); -BM_BIAS_ADD(1, 1024, 7, 7); -BM_BIAS_ADD(32, 1, 256, 256); -BM_BIAS_ADD(32, 3, 256, 256); +MACE_BM_BIAS_ADD(1, 1, 512, 512); +MACE_BM_BIAS_ADD(1, 3, 128, 128); +MACE_BM_BIAS_ADD(1, 3, 512, 512); +MACE_BM_BIAS_ADD(1, 32, 112, 112); +MACE_BM_BIAS_ADD(1, 64, 256, 256); +MACE_BM_BIAS_ADD(1, 64, 512, 512); +MACE_BM_BIAS_ADD(1, 128, 56, 56); +MACE_BM_BIAS_ADD(1, 128, 256, 256); +MACE_BM_BIAS_ADD(1, 256, 14, 14); +MACE_BM_BIAS_ADD(1, 512, 14, 14); +MACE_BM_BIAS_ADD(1, 1024, 7, 7); +MACE_BM_BIAS_ADD(32, 1, 256, 256); +MACE_BM_BIAS_ADD(32, 3, 256, 256); } // namespace test } // namespace ops diff --git a/mace/ops/buffer_to_image.cc b/mace/ops/buffer_to_image.cc index 5a567d74..04cb9b82 100644 --- a/mace/ops/buffer_to_image.cc +++ b/mace/ops/buffer_to_image.cc @@ -18,17 +18,17 @@ namespace mace { namespace ops { void Register_BufferToImage(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferToImageOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BufferToImageOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferToImageOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BufferToImageOp); } } // namespace ops diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_to_image.h index ae8de696..6d1d0395 100644 --- a/mace/ops/buffer_to_image.h +++ b/mace/ops/buffer_to_image.h @@ -42,8 +42,8 @@ class BufferToImageOp : public Operator { kernels::BufferToImageFunctor functor_; protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index 3ef8ce78..7d94c525 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -54,36 +54,36 @@ void FilterBufferToImage(int iters, } } // namespace -#define BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \ - static void BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \ +#define MACE_BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \ + static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * O * I * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ FilterBufferToImage(iters, O, I, H, W); \ } \ - BENCHMARK(BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_B2I(O, I, H, W) \ - BM_B2I_MACRO(O, I, H, W, float, GPU); \ - BM_B2I_MACRO(O, I, H, W, half, GPU); +#define MACE_BM_B2I(O, I, H, W) \ + MACE_BM_B2I_MACRO(O, I, H, W, float, GPU); \ + MACE_BM_B2I_MACRO(O, I, H, W, half, GPU); -BM_B2I(5, 3, 3, 3); -BM_B2I(5, 3, 7, 7); -BM_B2I(32, 16, 1, 1); -BM_B2I(32, 16, 3, 3); -BM_B2I(32, 16, 5, 5); -BM_B2I(32, 16, 7, 7); -BM_B2I(64, 32, 1, 1); -BM_B2I(64, 32, 3, 3); -BM_B2I(64, 32, 5, 5); -BM_B2I(64, 32, 7, 7); -BM_B2I(128, 64, 1, 1); -BM_B2I(128, 64, 3, 3); -BM_B2I(128, 32, 1, 1); -BM_B2I(128, 32, 3, 3); -BM_B2I(256, 32, 1, 1); 
-BM_B2I(256, 32, 3, 3); +MACE_BM_B2I(5, 3, 3, 3); +MACE_BM_B2I(5, 3, 7, 7); +MACE_BM_B2I(32, 16, 1, 1); +MACE_BM_B2I(32, 16, 3, 3); +MACE_BM_B2I(32, 16, 5, 5); +MACE_BM_B2I(32, 16, 7, 7); +MACE_BM_B2I(64, 32, 1, 1); +MACE_BM_B2I(64, 32, 3, 3); +MACE_BM_B2I(64, 32, 5, 5); +MACE_BM_B2I(64, 32, 7, 7); +MACE_BM_B2I(128, 64, 1, 1); +MACE_BM_B2I(128, 64, 3, 3); +MACE_BM_B2I(128, 32, 1, 1); +MACE_BM_B2I(128, 32, 3, 3); +MACE_BM_B2I(256, 32, 1, 1); +MACE_BM_B2I(256, 32, 3, 3); } // namespace test } // namespace ops diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index f7d23e31..f3311be6 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_ChannelShuffle(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ChannelShuffleOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ChannelShuffleOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ChannelShuffleOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h index 246b286b..bd9234c1 100644 --- a/mace/ops/channel_shuffle.h +++ b/mace/ops/channel_shuffle.h @@ -50,8 +50,8 @@ class ChannelShuffleOp : public Operator { protected: const int group_; - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); private: kernels::ChannelShuffleFunctor functor_; diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc index 205d74e5..49f494c8 100644 --- a/mace/ops/channel_shuffle_benchmark.cc +++ b/mace/ops/channel_shuffle_benchmark.cc @@ -69,25 +69,26 @@ void ChannelShuffle( } } // namespace -#define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \ - static void \ - BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - ChannelShuffle(iters, N, C, H, W, G); \ - } \ - BENCHMARK(BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE) +#define MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \ + static void \ + MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + ChannelShuffle(iters, N, C, H, W, G); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE) -#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \ - BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \ - BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \ - 
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU); +#define MACE_BM_CHANNEL_SHUFFLE(N, C, H, W, G) \ + MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \ + MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \ + MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU); -BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8); -BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8); -BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8); +MACE_BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8); +MACE_BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8); +MACE_BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8); } // namespace test } // namespace ops diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 7f6f6fd8..0275d497 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Concat(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ConcatOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ConcatOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ConcatOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ConcatOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ConcatOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ConcatOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/concat.h b/mace/ops/concat.h index 51477105..be763714 100644 --- a/mace/ops/concat.h +++ b/mace/ops/concat.h @@ -51,7 +51,7 @@ class ConcatOp : public Operator { kernels::ConcatFunctor functor_; private: - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index af6f2943..faf784c5 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -52,16 +52,16 @@ void ConcatHelper(int iters, int concat_dim, int dim1) { } } // namespace -#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \ - static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \ - ConcatHelper(iters, DIM0, DIM1); \ - } \ - BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1) +#define MACE_BM_CONCAT_CPU_MACRO(DIM0, DIM1) \ + static void MACE_BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \ + ConcatHelper(iters, DIM0, DIM1); \ + } \ + MACE_BENCHMARK(MACE_BM_CONCAT_CPU_##DIM0##_##DIM1) -BM_CONCAT_CPU_MACRO(0, 1000); -BM_CONCAT_CPU_MACRO(0, 100000); -BM_CONCAT_CPU_MACRO(1, 1000); -BM_CONCAT_CPU_MACRO(1, 100000); +MACE_BM_CONCAT_CPU_MACRO(0, 1000); +MACE_BM_CONCAT_CPU_MACRO(0, 100000); +MACE_BM_CONCAT_CPU_MACRO(1, 1000); +MACE_BM_CONCAT_CPU_MACRO(1, 100000); namespace { template @@ -106,22 +106,22 @@ void OpenclConcatHelper(int iters, } } // namespace -#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \ - static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \ - std::vector shape = {N, H, W, C}; \ - OpenclConcatHelper(iters, shape, shape, 3); \ - } \ - BENCHMARK(BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE) - -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, float); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, float); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, float); - -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, half); 
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half); +#define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \ + static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\ + std::vector shape = {N, H, W, C}; \ + OpenclConcatHelper(iters, shape, shape, 3); \ + } \ + MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE) + +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, float); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, float); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, float); + +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, half); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half); } // namespace test } // namespace ops diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index e5f01d26..29d3ac71 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Conv2D(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + Conv2dOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + Conv2dOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + Conv2dOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h index 5e691268..b15045cd 100644 --- a/mace/ops/conv_2d.h +++ b/mace/ops/conv_2d.h @@ -54,8 +54,8 @@ class Conv2dOp : public ConvPool2dOpBase { kernels::Conv2dFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, FILTER, BIAS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index c0e5e28d..63e8869a 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -105,11 +105,11 @@ void Conv2d(int iters, // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is // in-order. 
-#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, \ - DEVICE) \ +#define MACE_BM_CONV_2D_MACRO( \ + N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, DEVICE) \ static void \ - BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\ - ##_##P##_##OC##_##TYPE##_##DEVICE( \ + MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##\ + DILATION##_##P##_##OC##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ int64_t pad_h = 0, pad_w = 0; \ @@ -128,54 +128,53 @@ void Conv2d(int iters, Conv2d(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \ mace::Padding::P, OC); \ } \ - BENCHMARK( \ - BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\ - ##_##P##_##OC##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##\ + DILATION##_##P##_##OC##_##TYPE##_##DEVICE) -#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU); +#define MACE_BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \ + MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \ + MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \ + MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU); // Filter sizes and data alignments -BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128); -BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128); -BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128); +MACE_BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128); +MACE_BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128); // 3 channels input -BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3); -BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32); -BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32); +MACE_BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3); +MACE_BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32); +MACE_BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32); // Dilations -BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32); -BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32); +MACE_BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32); +MACE_BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32); // MobileNet -BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128); -BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024); +MACE_BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024); -BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32); -BM_CONV_2D(1, 
32, 34, 34, 3, 3, 1, 1, VALID, 32); +MACE_BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32); +MACE_BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32); -BM_CONV_2D(1, 192, 17, 17, 1, 7, 1, 1, SAME, 192); -BM_CONV_2D(1, 192, 17, 17, 7, 1, 1, 1, SAME, 192); -BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192); - -BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2); -BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2); -BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2); +MACE_BM_CONV_2D(1, 192, 17, 17, 1, 7, 1, 1, SAME, 192); +MACE_BM_CONV_2D(1, 192, 17, 17, 7, 1, 1, 1, SAME, 192); +MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192); +MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2); +MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2); +MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2); } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 4666296d..342e27aa 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Deconv2D(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + Deconv2dOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + Deconv2dOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + Deconv2dOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h index c73bea86..1c52e10c 100644 --- a/mace/ops/deconv_2d.h +++ b/mace/ops/deconv_2d.h @@ -49,8 +49,8 @@ class Deconv2dOp : public ConvPool2dOpBase { kernels::Deconv2dFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, FILTER, BIAS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index af8d3b87..2a414e3c 100644 --- a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -94,11 +94,11 @@ static void Deconv2d(int iters, // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is // in-order. 
-#define BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, OH, OW, P, OC, TYPE, \ - DEVICE) \ +#define MACE_BM_DECONV_2D_MACRO( \ + N, C, H, W, KH, KW, STRIDE, OH, OW, P, OC, TYPE, DEVICE) \ static void \ - BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_##OW\ - ##_##P##_##OC##_##TYPE##_##DEVICE( \ + MACE_BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_\ + ##OW##_##P##_##OC##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ int64_t oh = OH; \ @@ -110,30 +110,30 @@ static void Deconv2d(int iters, Deconv2d(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \ mace::Padding::P, OC); \ } \ - BENCHMARK( \ - BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_##OW##\ - _##P##_##OC##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_\ + ##OW##_##P##_##OC##_##TYPE##_##DEVICE) // TODO(liutuo): add cpu benchmark when optimized. -#define BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \ - BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \ - BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU); +#define MACE_BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \ + MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \ + MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU); -BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256); -BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128); +MACE_BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256); +MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128); -BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128); -BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32); -BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32); -BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32); -BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32); +MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128); +MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32); +MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32); +MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32); +MACE_BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32); -BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3); +MACE_BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3); -BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128); -BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128); -BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32); -BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32); +MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128); +MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128); +MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32); +MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32); } // namespace test } // namespace ops diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index 2f0e38c4..682a6770 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_DepthToSpace(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DepthToSpaceOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + DepthToSpaceOp); #ifdef MACE_ENABLE_OPENCL - 
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    DepthToSpaceOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         DepthToSpaceOp<DeviceType::GPU, float>);
 
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    DepthToSpaceOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         DepthToSpaceOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h
index 0df9ecbd..4be3f2a0 100644
--- a/mace/ops/depth_to_space.h
+++ b/mace/ops/depth_to_space.h
@@ -55,8 +55,8 @@ class DepthToSpaceOp : public Operator<D, T> {
  protected:
   const int block_size_;
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 
  private:
   kernels::DepthToSpaceOpFunctor<D, T> functor_;
diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc
index bf05f692..43115167 100644
--- a/mace/ops/depth_to_space_benchmark.cc
+++ b/mace/ops/depth_to_space_benchmark.cc
@@ -69,25 +69,26 @@ void DepthToSpace(
 }
 }  // namespace
 
-#define BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE)                  \
-  static void                                                                 \
-      BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE(      \
-          int iters) {                                                        \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
-    DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
-  }                                                                           \
-  BENCHMARK(BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
+#define MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE)             \
+  static void                                                                 \
+      MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
+          int iters) {                                                        \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
+    mace::testing::MaccProcessed(tot);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
+  }                                                                           \
+  MACE_BENCHMARK(                                                             \
+      MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
 
-#define BM_DEPTH_TO_SPACE(N, C, H, W, G)                 \
-  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU);    \
-  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU);    \
-  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);
+#define MACE_BM_DEPTH_TO_SPACE(N, C, H, W, G)                 \
+  MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU);    \
+  MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU);    \
+  MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);
 
-BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
-BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
-BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4);
+MACE_BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
+MACE_BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
+MACE_BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4);
 
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc
index fc0205db..cdb53595 100644
--- a/mace/ops/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 
 void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    DepthwiseConv2dOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         DepthwiseConv2dOp<DeviceType::CPU, float>);
 #ifdef
MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + DepthwiseConv2dOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + DepthwiseConv2dOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h index 2d6b1388..2762aea5 100644 --- a/mace/ops/depthwise_conv2d.h +++ b/mace/ops/depthwise_conv2d.h @@ -55,8 +55,8 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase { kernels::DepthwiseConv2dFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, FILTER, BIAS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index acee2265..ea847fd1 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ b/mace/ops/depthwise_conv2d_benchmark.cc @@ -101,61 +101,61 @@ void DepthwiseConv2d(int iters, } } // namespace -#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \ - DEVICE) \ - static void \ - BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\ - ##P##_##M##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t dilation = 1; \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - int64_t pad_h = 0, pad_w = 0; \ - if (P == SAME) { \ - pad_h = KH / 2; \ - pad_w = KW / 2; \ - } \ - int64_t oh = \ - (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \ - int64_t ow = \ - (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \ - const int64_t macc = \ - static_cast(iters) * N * C * M * oh * ow * (KH * KW + 1); \ - mace::testing::MaccProcessed(macc); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - DepthwiseConv2d(iters, N, C, H, W, KH, KW, STRIDE, \ - mace::Padding::P, M); \ - } \ - BENCHMARK( \ - BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\ - ##P##_##M##_##TYPE##_##DEVICE) +#define MACE_BM_DEPTHWISE_CONV_2D_MACRO( \ + N, C, H, W, KH, KW, STRIDE, P, M, TYPE, DEVICE) \ + static void \ + MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE\ + ##_##P##_##M##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t dilation = 1; \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + int64_t pad_h = 0, pad_w = 0; \ + if (P == SAME) { \ + pad_h = KH / 2; \ + pad_w = KW / 2; \ + } \ + int64_t oh = \ + (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \ + int64_t ow = \ + (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \ + const int64_t macc = \ + static_cast(iters) * N * C * M * oh * ow * (KH * KW + 1); \ + mace::testing::MaccProcessed(macc); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + DepthwiseConv2d(iters, N, C, H, W, KH, KW, STRIDE, \ + mace::Padding::P, M); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE\ + ##_##P##_##M##_##TYPE##_##DEVICE) -#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \ - BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ - 
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \ - BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU); +#define MACE_BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \ + MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ + MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \ + MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU); -BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1); } // namespace test diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 427716ea..bbb21435 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Eltwise(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") + 
.Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + EltwiseOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + EltwiseOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + EltwiseOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h index 95c48b78..efa87dd9 100644 --- a/mace/ops/eltwise.h +++ b/mace/ops/eltwise.h @@ -43,7 +43,7 @@ class EltwiseOp : public Operator { kernels::EltwiseFunctor functor_; private: - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 55308069..d12c97b3 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -76,30 +76,31 @@ void EltwiseBenchmark( } } // namespace -#define BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \ - static void \ - BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * H * W * C; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - EltwiseBenchmark( \ - iters, static_cast(ELT_TYPE), N, H, W, C); \ - } \ - BENCHMARK(BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) +#define MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \ + static void \ + MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W * C; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + EltwiseBenchmark( \ + iters, static_cast(ELT_TYPE), N, H, W, C); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) -#define BM_ELTWISE(ELT_TYPE, N, H, W, C) \ - BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \ - BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \ - BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU); +#define MACE_BM_ELTWISE(ELT_TYPE, N, H, W, C) \ + MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \ + MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \ + MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU); -BM_ELTWISE(2, 1, 128, 128, 32); -BM_ELTWISE(2, 1, 240, 240, 256); -BM_ELTWISE(2, 1, 256, 256, 32); -BM_ELTWISE(0, 1, 128, 128, 32); -BM_ELTWISE(0, 1, 240, 240, 256); -BM_ELTWISE(5, 1, 128, 128, 32); -BM_ELTWISE(5, 1, 240, 240, 256); +MACE_BM_ELTWISE(2, 1, 128, 128, 32); +MACE_BM_ELTWISE(2, 1, 240, 240, 256); +MACE_BM_ELTWISE(2, 1, 256, 256, 32); +MACE_BM_ELTWISE(0, 1, 128, 128, 32); +MACE_BM_ELTWISE(0, 1, 240, 240, 256); +MACE_BM_ELTWISE(5, 1, 128, 128, 32); +MACE_BM_ELTWISE(5, 1, 240, 240, 256); } // namespace test } // namespace ops diff --git a/mace/ops/folded_batch_norm.cc b/mace/ops/folded_batch_norm.cc index 6c46195d..ace0b857 100644 --- a/mace/ops/folded_batch_norm.cc +++ b/mace/ops/folded_batch_norm.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - 
.Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + FoldedBatchNormOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + FoldedBatchNormOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + FoldedBatchNormOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/folded_batch_norm.h b/mace/ops/folded_batch_norm.h index c9047ef8..03543c1c 100644 --- a/mace/ops/folded_batch_norm.h +++ b/mace/ops/folded_batch_norm.h @@ -56,8 +56,8 @@ class FoldedBatchNormOp : public Operator { kernels::BatchNormFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index d747916c..3147a598 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_FullyConnected(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + FullyConnectedOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + FullyConnectedOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + FullyConnectedOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/fully_connected.h b/mace/ops/fully_connected.h index c2539169..a4b17d4e 100644 --- a/mace/ops/fully_connected.h +++ b/mace/ops/fully_connected.h @@ -72,8 +72,8 @@ class FullyConnectedOp : public Operator { kernels::FullyConnectedFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index fd1210b6..021b6396 100644 --- a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -82,28 +82,28 @@ void FCBenchmark( } } // namespace -#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \ - static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t macc = \ - static_cast(iters) * N * C * H * W * OC + OC; \ - const int64_t tot = \ 
- static_cast(iters) * (N + OC) * C * H * W + OC; \ - mace::testing::MaccProcessed(macc); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - FCBenchmark(iters, N, H, W, C, OC); \ - } \ - BENCHMARK(BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE) +#define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \ + static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t macc = \ + static_cast(iters) * N * C * H * W * OC + OC; \ + const int64_t tot = \ + static_cast(iters) * (N + OC) * C * H * W + OC; \ + mace::testing::MaccProcessed(macc); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + FCBenchmark(iters, N, H, W, C, OC); \ + } \ + MACE_BENCHMARK(MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE) -#define BM_FC(N, H, W, C, OC) \ - BM_FC_MACRO(N, H, W, C, OC, float, CPU); \ - BM_FC_MACRO(N, H, W, C, OC, float, GPU); \ - BM_FC_MACRO(N, H, W, C, OC, half, GPU); +#define MACE_BM_FC(N, H, W, C, OC) \ + MACE_BM_FC_MACRO(N, H, W, C, OC, float, CPU); \ + MACE_BM_FC_MACRO(N, H, W, C, OC, float, GPU); \ + MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU); -BM_FC(1, 16, 16, 32, 32); -BM_FC(1, 8, 8, 32, 1000); -BM_FC(1, 2, 2, 512, 2); -BM_FC(1, 7, 7, 512, 2048); +MACE_BM_FC(1, 16, 16, 32, 32); +MACE_BM_FC(1, 8, 8, 32, 1000); +MACE_BM_FC(1, 2, 2, 512, 2); +MACE_BM_FC(1, 7, 7, 512, 2048); } // namespace test } // namespace ops diff --git a/mace/ops/image_to_buffer.cc b/mace/ops/image_to_buffer.cc index 02bcc5f7..168f75b6 100644 --- a/mace/ops/image_to_buffer.cc +++ b/mace/ops/image_to_buffer.cc @@ -18,17 +18,17 @@ namespace mace { namespace ops { void Register_ImageToBuffer(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ImageToBufferOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ImageToBufferOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ImageToBufferOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ImageToBufferOp); } } // namespace ops diff --git a/mace/ops/image_to_buffer.h b/mace/ops/image_to_buffer.h index 1365e1a8..9d742033 100644 --- a/mace/ops/image_to_buffer.h +++ b/mace/ops/image_to_buffer.h @@ -41,8 +41,8 @@ class ImageToBufferOp : public Operator { kernels::ImageToBufferFunctor functor_; protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc index 25a58aa1..8517c014 100644 --- a/mace/ops/local_response_norm.cc +++ b/mace/ops/local_response_norm.cc @@ -18,11 +18,11 @@ namespace mace { namespace ops { void Register_LocalResponseNorm(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - LocalResponseNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + LocalResponseNormOp); } } // namespace ops diff --git a/mace/ops/local_response_norm.h b/mace/ops/local_response_norm.h index 502a3404..6f2bd691 100644 --- a/mace/ops/local_response_norm.h +++ b/mace/ops/local_response_norm.h @@ -53,8 +53,8 @@ 
class LocalResponseNormOp : public Operator { kernels::LocalResponseNormFunctor functor_; protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/local_response_norm_benchmark.cc b/mace/ops/local_response_norm_benchmark.cc index 689eea72..ee15c3e0 100644 --- a/mace/ops/local_response_norm_benchmark.cc +++ b/mace/ops/local_response_norm_benchmark.cc @@ -55,32 +55,34 @@ static void LocalResponseNorm( net.Sync(); } -#define BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\ - int iters) { \ +#define MACE_BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void \ + MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ LocalResponseNorm(iters, N, C, H, W); \ } \ - BENCHMARK(BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_LOCAL_RESPONSE_NORM(N, C, H, W) \ - BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU); +#define MACE_BM_LOCAL_RESPONSE_NORM(N, C, H, W) \ + MACE_BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU); -BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512); -BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128); -BM_LOCAL_RESPONSE_NORM(1, 3, 512, 512); -BM_LOCAL_RESPONSE_NORM(1, 32, 112, 112); -BM_LOCAL_RESPONSE_NORM(1, 64, 256, 256); -BM_LOCAL_RESPONSE_NORM(1, 64, 512, 512); -BM_LOCAL_RESPONSE_NORM(1, 128, 56, 56); -BM_LOCAL_RESPONSE_NORM(1, 128, 256, 256); -BM_LOCAL_RESPONSE_NORM(1, 256, 14, 14); -BM_LOCAL_RESPONSE_NORM(1, 512, 14, 14); -BM_LOCAL_RESPONSE_NORM(1, 1024, 7, 7); -BM_LOCAL_RESPONSE_NORM(32, 1, 256, 256); -BM_LOCAL_RESPONSE_NORM(32, 3, 256, 256); +MACE_BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512); +MACE_BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128); +MACE_BM_LOCAL_RESPONSE_NORM(1, 3, 512, 512); +MACE_BM_LOCAL_RESPONSE_NORM(1, 32, 112, 112); +MACE_BM_LOCAL_RESPONSE_NORM(1, 64, 256, 256); +MACE_BM_LOCAL_RESPONSE_NORM(1, 64, 512, 512); +MACE_BM_LOCAL_RESPONSE_NORM(1, 128, 56, 56); +MACE_BM_LOCAL_RESPONSE_NORM(1, 128, 256, 256); +MACE_BM_LOCAL_RESPONSE_NORM(1, 256, 14, 14); +MACE_BM_LOCAL_RESPONSE_NORM(1, 512, 14, 14); +MACE_BM_LOCAL_RESPONSE_NORM(1, 1024, 7, 7); +MACE_BM_LOCAL_RESPONSE_NORM(32, 1, 256, 256); +MACE_BM_LOCAL_RESPONSE_NORM(32, 3, 256, 256); } // namespace test } // namespace ops diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index b65bf484..fa342659 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_MatMul(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - MatMulOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + MatMulOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - MatMulOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + MatMulOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - 
MatMulOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + MatMulOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index cc9b86da..382fdf7c 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -67,27 +67,28 @@ void MatMulBenchmark( } } // namespace -#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \ - static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \ +#define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \ + static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t macc = static_cast(iters) * N * C * H * W; \ const int64_t tot = static_cast(iters) * N * (C * H + H * W); \ mace::testing::MaccProcessed(macc); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ MatMulBenchmark(iters, N, H, C, W); \ } \ - BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE) -#define BM_MATMUL(N, H, C, W) \ - BM_MATMUL_MACRO(N, H, C, W, float, CPU); \ - BM_MATMUL_MACRO(N, H, C, W, float, GPU); \ - BM_MATMUL_MACRO(N, H, C, W, half, GPU); +#define MACE_BM_MATMUL(N, H, C, W) \ + MACE_BM_MATMUL_MACRO(N, H, C, W, float, CPU); \ + MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU); \ + MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU); -BM_MATMUL(16, 32, 128, 49); -BM_MATMUL(16, 32, 128, 961); -BM_MATMUL(16, 32, 128, 3969); -BM_MATMUL(16, 128, 128, 49); -BM_MATMUL(16, 128, 128, 961); -BM_MATMUL(16, 128, 128, 3969); +MACE_BM_MATMUL(16, 32, 128, 49); +MACE_BM_MATMUL(16, 32, 128, 961); +MACE_BM_MATMUL(16, 32, 128, 3969); +MACE_BM_MATMUL(16, 128, 128, 49); +MACE_BM_MATMUL(16, 128, 128, 961); +MACE_BM_MATMUL(16, 128, 128, 3969); } // namespace test } // namespace ops diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index 8bcf2028..6875de6a 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -18,23 +18,23 @@ namespace mace { namespace ops { void Register_Pad(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PadOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + PadOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PadOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PadOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + PadOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + PadOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index 4be091db..c5172f8c 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -65,25 +65,25 @@ void Pad(int iters, int batch, int height, } } // namespace -#define BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE) \ - static void BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - Pad(iters, N, H, W, C, PAD); \ - } \ - 
BENCHMARK(BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE) +#define MACE_BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE) \ + static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + Pad(iters, N, H, W, C, PAD); \ + } \ + MACE_BENCHMARK(MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE) -#define BM_PAD(N, H, W, C, PAD) \ - BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \ - BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \ - BM_PAD_MACRO(N, H, W, C, PAD, half, GPU); +#define MACE_BM_PAD(N, H, W, C, PAD) \ + MACE_BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \ + MACE_BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \ + MACE_BM_PAD_MACRO(N, H, W, C, PAD, half, GPU); -BM_PAD(1, 512, 512, 1, 2); -BM_PAD(1, 112, 112, 64, 1); -BM_PAD(1, 256, 256, 32, 2); -BM_PAD(1, 512, 512, 16, 2); +MACE_BM_PAD(1, 512, 512, 1, 2); +MACE_BM_PAD(1, 112, 112, 64, 1); +MACE_BM_PAD(1, 256, 256, 32, 2); +MACE_BM_PAD(1, 512, 512, 16, 2); } // namespace test } // namespace ops diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index ac15bb1b..25cd44aa 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Pooling(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PoolingOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + PoolingOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PoolingOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + PoolingOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PoolingOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + PoolingOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h index af46c0cd..fac4e1dd 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/pooling.h @@ -52,8 +52,8 @@ class PoolingOp : public ConvPool2dOpBase { PoolingType pooling_type_; kernels::PoolingFunctor functor_; - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index 1f767c22..d0da9b47 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -87,29 +87,29 @@ void Pooling(int iters, } } // namespace -#define BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE) \ - static void \ - BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\ - ##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(float))); \ - Pooling(iters, N, C, H, W, KE, STRIDE, Padding::PA, \ - PoolingType::PO); \ - } \ - BENCHMARK( \ - BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\ +#define MACE_BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE) \ + static void \ + 
MACE_BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\ + ##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(float))); \ + Pooling(iters, N, C, H, W, KE, STRIDE, Padding::PA, \ + PoolingType::PO); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\ ##DEVICE) -#define BM_POOLING(N, C, H, W, K, S, PA, PO) \ - BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \ - BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU); +#define MACE_BM_POOLING(N, C, H, W, K, S, PA, PO) \ + MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \ + MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU); -BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX); -BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX); -BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX); -BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX); +MACE_BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX); +MACE_BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX); +MACE_BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX); +MACE_BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX); } // namespace test } // namespace ops diff --git a/mace/ops/proposal.cc b/mace/ops/proposal.cc index 98afe966..4558bbb3 100644 --- a/mace/ops/proposal.cc +++ b/mace/ops/proposal.cc @@ -18,11 +18,11 @@ namespace mace { namespace ops { void Register_Proposal(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ProposalOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ProposalOp); } } // namespace ops diff --git a/mace/ops/proposal.h b/mace/ops/proposal.h index 36021add..1afabb8f 100644 --- a/mace/ops/proposal.h +++ b/mace/ops/proposal.h @@ -49,8 +49,8 @@ class ProposalOp : public Operator { kernels::ProposalFunctor functor_; protected: - OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO); - OP_OUTPUT_TAGS(ROIS); + MACE_OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO); + MACE_OP_OUTPUT_TAGS(ROIS); }; } // namespace ops diff --git a/mace/ops/psroi_align.cc b/mace/ops/psroi_align.cc index d21db0cc..17a02d61 100644 --- a/mace/ops/psroi_align.cc +++ b/mace/ops/psroi_align.cc @@ -18,11 +18,11 @@ namespace mace { namespace ops { void Register_PSROIAlign(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PSROIAlignOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + PSROIAlignOp); } } // namespace ops diff --git a/mace/ops/psroi_align.h b/mace/ops/psroi_align.h index 82bd0c44..b7595682 100644 --- a/mace/ops/psroi_align.h +++ b/mace/ops/psroi_align.h @@ -43,8 +43,8 @@ class PSROIAlignOp : public Operator { kernels::PSROIAlignFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, ROIS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, ROIS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/quantize.cc b/mace/ops/quantize.cc index 5121552e..ffd4cd78 100644 --- a/mace/ops/quantize.cc +++ b/mace/ops/quantize.cc @@ -18,27 +18,27 @@ namespace mace { namespace ops { void Register_Quantize(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - QuantizeOp); + 
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + QuantizeOp); } void Register_Dequantize(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DequantizeOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + DequantizeOp); } void Register_Requantize(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - RequantizeOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + RequantizeOp); } } // namespace ops diff --git a/mace/ops/quantize.h b/mace/ops/quantize.h index 212d3593..bf0d1534 100644 --- a/mace/ops/quantize.h +++ b/mace/ops/quantize.h @@ -50,8 +50,8 @@ class QuantizeOp : public Operator { kernels::QuantizeFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX); - OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); + MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX); + MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); }; template @@ -79,8 +79,8 @@ class DequantizeOp : public Operator { kernels::DequantizeFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; template @@ -131,8 +131,8 @@ class RequantizeOp : public Operator { kernels::RequantizeFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX, RERANGE_MIN, RERANGE_MAX); - OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); + MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX, RERANGE_MIN, RERANGE_MAX); + MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); }; } // namespace ops diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc index 708e7e09..4390c520 100644 --- a/mace/ops/reshape.cc +++ b/mace/ops/reshape.cc @@ -18,11 +18,11 @@ namespace mace { namespace ops { void Register_Reshape(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReshapeOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ReshapeOp); } } // namespace ops diff --git a/mace/ops/reshape.h b/mace/ops/reshape.h index 371e3b40..fe1df988 100644 --- a/mace/ops/reshape.h +++ b/mace/ops/reshape.h @@ -69,8 +69,8 @@ class ReshapeOp : public Operator { kernels::ReshapeFunctor functor_; private: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 65a84926..e18d7038 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_ResizeBilinear(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ResizeBilinearOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); 
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         ResizeBilinearOp<DeviceType::GPU, float>);
 
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    ResizeBilinearOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         ResizeBilinearOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc
index ad954561..1f217801 100644
--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -82,34 +82,33 @@ void ResizeBilinearBenchmark(int iters,
 }
 }  // namespace
 
-#define BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE)         \
-  static void                                                                \
-      BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
-          ##DEVICE(                                                          \
-          int iters) {                                                       \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3;  \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0;       \
-    mace::testing::MaccProcessed(macc);                                      \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
-    ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1);      \
-  }                                                                          \
-  BENCHMARK(                                                                 \
-      BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
+#define MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE)    \
+  static void                                                                \
+      MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
+          ##DEVICE(                                                          \
+          int iters) {                                                       \
+    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3;  \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0;       \
+    mace::testing::MaccProcessed(macc);                                      \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
+    ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1);      \
+  }                                                                          \
+  MACE_BENCHMARK(                                                            \
+      MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
           ##DEVICE)
 
-#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1)                 \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU);    \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU);    \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
+#define MACE_BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1)                 \
+  MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU);    \
+  MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU);    \
+  MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
 
-BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480);
-
-BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15);
-BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30);
-BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60);
-BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480);
-BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480);
-BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016);
+MACE_BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480);
+MACE_BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15);
+MACE_BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30);
+MACE_BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60);
+MACE_BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480);
+MACE_BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480);
+MACE_BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016);
 
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc
index a146e00b..a9b1c9bd 100644
--- a/mace/ops/slice.cc
+++ b/mace/ops/slice.cc
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 
 void Register_Slice(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SliceOp<DeviceType::CPU, float>);
+
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + SliceOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SliceOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + SliceOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SliceOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + SliceOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/slice.h b/mace/ops/slice.h index 567707ff..d917f132 100644 --- a/mace/ops/slice.h +++ b/mace/ops/slice.h @@ -46,7 +46,7 @@ class SliceOp : public Operator { kernels::SliceFunctor functor_; private: - OP_INPUT_TAGS(INPUT); + MACE_OP_INPUT_TAGS(INPUT); }; } // namespace ops diff --git a/mace/ops/slice_benchmark.cc b/mace/ops/slice_benchmark.cc index b05138ae..c02dbf5c 100644 --- a/mace/ops/slice_benchmark.cc +++ b/mace/ops/slice_benchmark.cc @@ -73,26 +73,28 @@ void BMSliceHelper(int iters, } } // namespace -#define BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE) \ +#define MACE_BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE) \ static void \ - BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE(int iters) { \ + MACE_BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * H * W * C; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BMSliceHelper(iters, {N, H, W, C}, NO); \ } \ - BENCHMARK(BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE) -#define BM_SLICE(N, H, W, C, NO) \ - BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \ - BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \ - BM_SLICE_MACRO(N, H, W, C, NO, half, GPU); +#define MACE_BM_SLICE(N, H, W, C, NO) \ + MACE_BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \ + MACE_BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \ + MACE_BM_SLICE_MACRO(N, H, W, C, NO, half, GPU); -BM_SLICE(1, 32, 32, 32, 2); -BM_SLICE(1, 32, 32, 128, 2); -BM_SLICE(1, 32, 32, 256, 2); -BM_SLICE(1, 128, 128, 32, 2); -BM_SLICE(1, 128, 128, 128, 2); +MACE_BM_SLICE(1, 32, 32, 32, 2); +MACE_BM_SLICE(1, 32, 32, 128, 2); +MACE_BM_SLICE(1, 32, 32, 256, 2); +MACE_BM_SLICE(1, 128, 128, 32, 2); +MACE_BM_SLICE(1, 128, 128, 128, 2); } // namespace test } // namespace ops diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 2e2e9e7f..eff2b415 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Softmax(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + SoftmaxOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + SoftmaxOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::GPU) - .TypeConstraint("T") 
-                        .Build(),
-                    SoftmaxOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         SoftmaxOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
diff --git a/mace/ops/softmax.h b/mace/ops/softmax.h
index a4459e52..c6b81d9e 100644
--- a/mace/ops/softmax.h
+++ b/mace/ops/softmax.h
@@ -40,8 +40,8 @@ class SoftmaxOp : public Operator<D, T> {
   kernels::SoftmaxFunctor<D, T> functor_;
 
  protected:
-  OP_INPUT_TAGS(LOGITS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(LOGITS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 
 }  // namespace ops
diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc
index b62eadcb..2e9aed7c 100644
--- a/mace/ops/softmax_benchmark.cc
+++ b/mace/ops/softmax_benchmark.cc
@@ -70,26 +70,26 @@ void SoftmaxBenchmark(
 }
 }  // namespace
 
-#define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                   \
-  static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
-      int iters) {                                                   \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot);                               \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
-    SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);               \
-  }                                                                  \
-  BENCHMARK(BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                  \
+  static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+      int iters) {                                                       \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;     \
+    mace::testing::MaccProcessed(tot);                                   \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                  \
+    SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                   \
+  }                                                                      \
+  MACE_BENCHMARK(MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_SOFTMAX(N, C, H, W)              \
-  BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \
-  BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \
-  BM_SOFTMAX_MACRO(N, C, H, W, half, GPU);
+#define MACE_BM_SOFTMAX(N, C, H, W)              \
+  MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \
+  MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \
+  MACE_BM_SOFTMAX_MACRO(N, C, H, W, half, GPU);
 
-BM_SOFTMAX(1, 2, 512, 512);
-BM_SOFTMAX(1, 3, 512, 512);
-BM_SOFTMAX(1, 4, 512, 512);
-BM_SOFTMAX(1, 10, 256, 256);
-BM_SOFTMAX(1, 1024, 7, 7);
+MACE_BM_SOFTMAX(1, 2, 512, 512);
+MACE_BM_SOFTMAX(1, 3, 512, 512);
+MACE_BM_SOFTMAX(1, 4, 512, 512);
+MACE_BM_SOFTMAX(1, 10, 256, 256);
+MACE_BM_SOFTMAX(1, 1024, 7, 7);
 
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc
index aac19327..ca905e78 100644
--- a/mace/ops/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -18,23 +18,23 @@ namespace mace {
 namespace ops {
 
 void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SpaceToBatchNDOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         SpaceToBatchNDOp<DeviceType::CPU, float>);
 
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SpaceToBatchNDOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         SpaceToBatchNDOp<DeviceType::GPU, float>);
 
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    SpaceToBatchNDOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         SpaceToBatchNDOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h
index ebeb7975..faef4b57 100644
--- a/mace/ops/space_to_batch.h
+++ b/mace/ops/space_to_batch.h
@@ -45,8 +45,8 @@ class SpaceToBatchNDOp : public Operator<D, T> {
   kernels::SpaceToBatchFunctor<D, T> functor_;
 
  protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 
 }  // namespace ops
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index 524a32ce..272c487c 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -64,27 +64,27 @@ void BMSpaceToBatch(
 }
 }  // namespace
 
-#define BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE)             \
-  static void                                                                \
-      BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \
-      int iters) {                                                           \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::MaccProcessed(tot);                                       \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
-    BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE);                  \
-  }                                                                          \
-  BENCHMARK(                                                                 \
-      BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
+#define MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE)       \
+  static void                                                               \
+  MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\
+      int iters) {                                                          \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;        \
+    mace::testing::MaccProcessed(tot);                                      \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
+    BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE);                 \
+  }                                                                         \
+  MACE_BENCHMARK(                                                           \
+      MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
 
-#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE)              \
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
+#define MACE_BM_SPACE_TO_BATCH(N, H, W, C, SHAPE)              \
+  MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
+  MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
 
-BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
-BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
+MACE_BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
 
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc
index 2932f8ea..18072265 100644
--- a/mace/ops/space_to_depth.cc
+++ b/mace/ops/space_to_depth.cc
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 
 void Register_SpaceToDepth(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SpaceToDepthOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         SpaceToDepthOp<DeviceType::CPU, float>);
 
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SpaceToDepthOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         SpaceToDepthOp<DeviceType::GPU, float>);
 
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    SpaceToDepthOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         SpaceToDepthOp<DeviceType::GPU, half>);
 
 #endif  // MACE_ENABLE_OPENCL
 }
diff --git a/mace/ops/space_to_depth.h b/mace/ops/space_to_depth.h
index 33c3d1a5..cece41f8 100644
--- a/mace/ops/space_to_depth.h
+++ b/mace/ops/space_to_depth.h
@@ -62,8 +62,8 @@ class SpaceToDepthOp : public Operator<D, T> {
   }
 
  protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 
  private:
   kernels::DepthToSpaceOpFunctor<D, T> functor_;
diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc
index 66c90b64..97d3cb03 100644
--- a/mace/ops/space_to_depth_benchmark.cc
+++ b/mace/ops/space_to_depth_benchmark.cc
@@ -69,25 +69,26 @@ void SpaceToDepth(
 }
 }  // namespace
 
 
-#define BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE)                  \
-  static void                                                                 \
-      BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE(      \
-      int iters) {                                                            \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
-    SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
-  }                                                                           \
-  BENCHMARK(BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
+#define MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE)             \
+  static void                                                                 \
+      MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
+      int iters) {                                                            \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
+    mace::testing::MaccProcessed(tot);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
+  }                                                                           \
+  MACE_BENCHMARK(                                                             \
+      MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
 
-#define BM_SPACE_TO_DEPTH(N, C, H, W, G)              \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU); \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);
+#define MACE_BM_SPACE_TO_DEPTH(N, C, H, W, G)              \
+  MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU); \
+  MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \
+  MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);
 
-BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
-BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
-BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);
+MACE_BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
+MACE_BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
+MACE_BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);
 
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc
index 2ed6103d..9571fcbf 100644
--- a/mace/ops/transpose.cc
+++ b/mace/ops/transpose.cc
@@ -18,11 +18,11 @@ namespace mace {
 namespace ops {
 
 void Register_Transpose(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    TransposeOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         TransposeOp<DeviceType::CPU, float>);
 }
 
 }  // namespace ops
diff --git a/mace/ops/transpose.h b/mace/ops/transpose.h
index 877078d0..5fb497ac 100644
--- a/mace/ops/transpose.h
+++ b/mace/ops/transpose.h
@@ -50,8 +50,8 @@ class TransposeOp : public Operator<D, T> {
   std::vector<int> dims_;
   kernels::TransposeFunctor<D, T> functor_;
 
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 
 }  // namespace mace
diff --git a/mace/ops/transpose_benchmark.cc b/mace/ops/transpose_benchmark.cc
index 24e6f2ff..aaf3faaa 100644
--- a/mace/ops/transpose_benchmark.cc
+++ b/mace/ops/transpose_benchmark.cc
@@ -55,41 +55,41 @@ void TransposeBenchmark(int iters,
 }
 }  // namespace
 
-#define BM_TRANSPOSE2D_MACRO(H, W, TYPE, DEVICE)                 \
-  static void BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE(      \
+#define MACE_BM_TRANSPOSE2D_MACRO(H, W, TYPE, DEVICE)            \
+  static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \
       int iters) {                                               \
     const int64_t tot = static_cast<int64_t>(iters) * H * W;     \
     mace::testing::MaccProcessed(tot);                           \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));           \
     TransposeBenchmark<DEVICE, TYPE>(iters, {H, W}, {1, 0});     \
   }                                                              \
-  BENCHMARK(BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE)
-
-#define BM_TRANSPOSE2D(H, W) \
-  BM_TRANSPOSE2D_MACRO(H, W, float, CPU);
-
-#define BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, TYPE, DEVICE)        \
-  static void                                                                 \
-      BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##DEVICE( \
-      int iters) {                                                            \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
-    TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3});  \
-  }                                                                           \
-  BENCHMARK(                                                                  \
-      BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##DEVICE)
-
-#define BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \
-  BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU);
-
-
-BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2);
-BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1);
-BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2);
-BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1);
-BM_TRANSPOSE2D(128, 128);
-BM_TRANSPOSE2D(512, 512);
+  MACE_BENCHMARK(MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE)
+
+#define MACE_BM_TRANSPOSE2D(H, W) \
+  MACE_BM_TRANSPOSE2D_MACRO(H, W, float, CPU);
+
+#define MACE_BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, TYPE, DEVICE)   \
+  static void                                                                 \
+  MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
+      DEVICE(int iters) {                                                     \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
+    mace::testing::MaccProcessed(tot);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3});  \
+  }                                                                           \
+  MACE_BENCHMARK(                                                             \
+      MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
+      DEVICE)
+
+#define MACE_BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \
+  MACE_BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU);
+
+MACE_BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2);
+MACE_BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1);
+MACE_BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2);
+MACE_BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1);
+MACE_BM_TRANSPOSE2D(128, 128);
+MACE_BM_TRANSPOSE2D(512, 512);
 
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/winograd_inverse_transform.cc b/mace/ops/winograd_inverse_transform.cc
index 763f1f16..f84b69a2 100644
--- a/mace/ops/winograd_inverse_transform.cc
+++ b/mace/ops/winograd_inverse_transform.cc
@@ -19,17 +19,17 @@ namespace ops {
 
 void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    WinogradInverseTransformOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         WinogradInverseTransformOp<DeviceType::GPU, float>);
 
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    WinogradInverseTransformOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         WinogradInverseTransformOp<DeviceType::GPU, half>);
 #else
   MACE_UNUSED(op_registry);
 #endif  // MACE_ENABLE_OPENCL
diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h
index 71807b31..d08ce0f6 100644
--- a/mace/ops/winograd_inverse_transform.h
+++ b/mace/ops/winograd_inverse_transform.h
@@ -49,8 +49,8 @@ class WinogradInverseTransformOp : public Operator<D, T> {
   kernels::WinogradInverseTransformFunctor<D, T> functor_;
 
  protected:
-  OP_INPUT_TAGS(INPUT, BIAS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, BIAS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 
 }  // namespace ops
diff --git a/mace/ops/winograd_transform.cc b/mace/ops/winograd_transform.cc
index 8181cba9..24f82255 100644
--- a/mace/ops/winograd_transform.cc
+++ b/mace/ops/winograd_transform.cc
@@ -19,17 +19,17 @@ namespace ops {
 
 void Register_WinogradTransform(OperatorRegistry *op_registry) {
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    WinogradTransformOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         WinogradTransformOp<DeviceType::GPU, float>);
 
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    WinogradTransformOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         WinogradTransformOp<DeviceType::GPU, half>);
 #else
   MACE_UNUSED(op_registry);
 #endif  // MACE_ENABLE_OPENCL
diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h
index e9a3afbe..90bb5501 100644
--- a/mace/ops/winograd_transform.h
+++ b/mace/ops/winograd_transform.h
@@ -43,8 +43,8 @@ class WinogradTransformOp : public Operator<D, T> {
   kernels::WinogradTransformFunctor<D, T> functor_;
 
  protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 
 }  // namespace ops
diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc
index bf33332b..658b0ea2 100644
--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -51,22 +51,24 @@ void BMWinogradTransform(
 }
 }  // namespace
 
-#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)                 \
-  static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
+#define MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)       \
+  static void                                                            \
+      MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
       int iters) {                                                       \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;     \
     mace::testing::MaccProcessed(tot);                                   \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                  \
     BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C);                \
   }                                                                      \
-  BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(                                                        \
+      MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
 
-#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
-  BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
+#define MACE_BM_WINOGRAD_TRANSFORM(N, H, W, C) \
+  MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
 
-BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
-BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
-BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
+MACE_BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
+MACE_BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
+MACE_BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
 
 namespace {
 template <DeviceType D, typename T>
@@ -103,24 +105,24 @@ void BMWinogradInverseTransform(
 }
 }  // namespace
 
-#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)         \
+#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)    \
   static void                                                                 \
-      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
-      int iters) {                                                            \
+  MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\
+      int iters) {                                                            \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
     mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C);              \
   }                                                                           \
-  BENCHMARK(                                                                  \
-      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(                                                             \
+      MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
 
-#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
-  BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
+#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
+  MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
 
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
+MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
+MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
+MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
 
 }  // namespace test
 }  // namespace ops
-- 
GitLab