diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc
index 54a9efc9a4b6fd9587210e4588236bb487d46d5b..60fb38f7d71895db95ccd1ec88a765b5fecfc5cc 100644
--- a/mace/core/arg_helper.cc
+++ b/mace/core/arg_helper.cc
@@ -95,5 +95,4 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false)
 MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
 MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
 #undef MACE_GET_REPEATED_ARGUMENT_FUNC
-
 }  // namespace mace
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 06a20d882bca3a91e03299a787179378b5fe96df..3a2285d97d6948fbd6fe1b9a23d8ace9585a8c27 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -124,20 +124,19 @@ class Operator : public OperatorBase {
   ~Operator() noexcept override {}
 };
 
-// OP_INPUT_TAGS and OP_OUTPUT_TAGS are optional features to name the indices of
-// the
-// operator's inputs and outputs, in order to avoid confusion. For example, for
-// a fully convolution layer that has input, weight and bias, you can define its
-// input tags as:
-// OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
+// MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the
+// indices of the operator's inputs and outputs, in order to avoid confusion.
+// For example, for a fully convolution layer that has input, weight and bias,
+// you can define its input tags as:
+// MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
 // And in the code, instead of doing
 // auto& weight = Input(1);
 // you can now do
 // auto& weight = Input(WEIGHT);
 // to make it more clear.
-#define OP_INPUT_TAGS(first_input, ...) \
+#define MACE_OP_INPUT_TAGS(first_input, ...) \
   enum _InputTags { first_input = 0, __VA_ARGS__ }
-#define OP_OUTPUT_TAGS(first_input, ...) \
+#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
   enum _OutputTags { first_input = 0, __VA_ARGS__ }
 
 class OpKeyBuilder {
@@ -186,7 +185,7 @@ MACE_DECLARE_REGISTRY(OpRegistry,
                       const OperatorDef &,
                       Workspace *);
 
-#define REGISTER_OPERATOR(op_registry, name, ...) \
+#define MACE_REGISTER_OPERATOR(op_registry, name, ...) \
   MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)
 
 }  // namespace mace
diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc b/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
index 7293a0aa42c5079f466a24343ed0c03ae76bd020..5443969510f2fc566d4763ba82ab95cd381fcd20 100644
--- a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
+++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
@@ -32,7 +32,7 @@ inline int64_t NowMicros() {
 
 namespace mace {
 
-#define MAX_NODE 2048
+#define MACE_MAX_NODE 2048
 
 enum {
   NN_GRAPH_PERFEVENT_CYCLES = 0,
@@ -229,13 +229,13 @@ bool HexagonControlWrapper::TeardownGraph() {
   return hexagon_nn_teardown(nn_id_) == 0;
 }
 
-#define PRINT_BUFSIZE (2 * 1024 * 1024)
+#define MACE_PRINT_BUFSIZE (2 * 1024 * 1024)
 
 void HexagonControlWrapper::PrintLog() {
   char *buf;
-  if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
+  if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
   MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char *>(buf),
-                               PRINT_BUFSIZE) == 0,
+                               MACE_PRINT_BUFSIZE) == 0,
              "print log error");
   LOG(INFO) << std::string(buf);
   delete[] buf;
@@ -244,9 +244,9 @@ void HexagonControlWrapper::PrintLog() {
 void HexagonControlWrapper::PrintGraph() {
   LOG(INFO) << "Print Graph";
   char *buf;
-  if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
+  if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
   MACE_CHECK(hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char *>(buf),
-                                 PRINT_BUFSIZE) == 0,
+                                 MACE_PRINT_BUFSIZE) == 0,
              "print graph error");
   LOG(INFO) << std::string(buf);
   delete[] buf;
@@ -265,9 +265,9 @@ void HexagonControlWrapper::SetGraphMode(int mode) {
 
 void HexagonControlWrapper::GetPerfInfo() {
   LOG(INFO) << "Get perf info";
-  std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
+  std::vector<hexagon_nn_perfinfo> perf_info(MACE_MAX_NODE);
   unsigned int n_items = 0;
-  MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE,
+  MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MACE_MAX_NODE,
                                      &n_items) == 0,
              "get perf info error");
 
@@ -284,8 +284,8 @@ void HexagonControlWrapper::GetPerfInfo() {
                            perf_info[i].counter_lo) *
                           1.0f / perf_info[i].executions;
-      char node_type_buf[MAX_NODE];
-      hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE);
+      char node_type_buf[MACE_MAX_NODE];
+      hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MACE_MAX_NODE);
       std::string node_type(node_type_buf);
       LOG(INFO) << "node id: " << perf_info[i].node_id
                 << ", node type: " << node_type
diff --git a/mace/core/runtime/hexagon/hexagon_nn_ops.h b/mace/core/runtime/hexagon/hexagon_nn_ops.h
index deaba66543205864847ab2f90abc145137d48561..7f800550a2d0e3c6409dbe7edc43c940488ea40d 100644
--- a/mace/core/runtime/hexagon/hexagon_nn_ops.h
+++ b/mace/core/runtime/hexagon/hexagon_nn_ops.h
@@ -22,7 +22,7 @@
 
 namespace mace {
 
-#define OP_INVALID -1
+#define MACE_OP_INVALID -1
 
 typedef enum op_type_enum {
 #define DEF_OP(NAME, ...)
OP_##NAME, @@ -48,7 +48,7 @@ class OpMap { return op_map_[op_type]; } else { LOG(ERROR) << "DSP unsupoorted op type: " << op_type; - return OP_INVALID; + return MACE_OP_INVALID; } } diff --git a/mace/core/tensor.h b/mace/core/tensor.h index f2f9763a1caee9c7749adba1208eff5eab595755..0090e9580ccced3776de6bfdd6d2d2e01e9446d6 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -30,9 +30,9 @@ #ifdef MACE_ENABLE_NEON // Avoid over-bound accessing memory -#define EXTRA_BUFFER_PAD_SIZE 64 +#define MACE_EXTRA_BUFFER_PAD_SIZE 64 #else -#define EXTRA_BUFFER_PAD_SIZE 0 +#define MACE_EXTRA_BUFFER_PAD_SIZE 0 #endif namespace mace { @@ -210,16 +210,16 @@ class Tensor { image_shape_.clear(); if (buffer_ != nullptr) { MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage."); - if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) { + if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) { LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to " - << raw_size() + EXTRA_BUFFER_PAD_SIZE; - return buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE); + << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE; + return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE); } return MaceStatus::MACE_SUCCESS; } else { MACE_CHECK(is_buffer_owner_); buffer_ = new Buffer(allocator_); - return buffer_->Allocate(raw_size() + EXTRA_BUFFER_PAD_SIZE); + return buffer_->Allocate(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE); } } diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h index 54edb681bcd5a80ab8534afc653055f7e5419761..96b883a4e58af23838132f548611b969c5812558 100644 --- a/mace/core/testing/test_benchmark.h +++ b/mace/core/testing/test_benchmark.h @@ -21,8 +21,8 @@ #include #define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c -#define BENCHMARK(n) \ - static ::mace::testing::Benchmark *MACE_BENCHMARK_CONCAT( \ +#define MACE_BENCHMARK(n) \ + static ::mace::testing::Benchmark *MACE_BENCHMARK_CONCAT( \ __benchmark_, n, __LINE__) = (new ::mace::testing::Benchmark(#n, (n))) namespace mace { diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 02354d504f9daad6e49700e23a02674bc5357d0f..3d2566e6007b1e0d918fa6e0b5e3d2c85b9c5138 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -169,7 +169,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, std::unique_ptr tensor_buf( new Buffer(GetDeviceAllocator(device_type))); MaceStatus status = tensor_buf->Allocate( - mem_block.x() * GetEnumTypeSize(dtype) + EXTRA_BUFFER_PAD_SIZE); + mem_block.x() * GetEnumTypeSize(dtype) + + MACE_EXTRA_BUFFER_PAD_SIZE); if (status != MaceStatus::MACE_SUCCESS) { return status; } diff --git a/mace/kernels/arm/conv_2d_neon_5x5.cc b/mace/kernels/arm/conv_2d_neon_5x5.cc index 61672bd435ef0d49790d0f55f69e4be5355d8a12..7a32a2910f3129724516c82c6bc178a602403853 100644 --- a/mace/kernels/arm/conv_2d_neon_5x5.cc +++ b/mace/kernels/arm/conv_2d_neon_5x5.cc @@ -21,7 +21,7 @@ namespace mace { namespace kernels { -#define Conv2dNeonK5x5SnLoadCalc4 \ +#define MACE_Conv2dNeonK5x5SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf10, vf20, vf30; \ float32x2_t vf01, vf11, vf21, vf31; \ @@ -62,7 +62,7 @@ namespace kernels { vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \ vo3 = vmlaq_lane_f32(vo3, vi4, vf31, 1); -#define Conv2dNeonK5x5SnLoadCalc1 \ +#define MACE_Conv2dNeonK5x5SnLoadCalc1 \ /* load filter (1 outch x 1 height x 4 width) */ \ float32x4_t vf00; \ float32x2_t vf01; \ @@ -138,7 +138,7 @@ void 
Conv2dNeonK5x5S1(const float *input, vi2 = vextq_f32(vi0, vi4, 2); vi3 = vextq_f32(vi0, vi4, 3); - Conv2dNeonK5x5SnLoadCalc4; + MACE_Conv2dNeonK5x5SnLoadCalc4; in_offset += in_width; filter_ptr0 += 5; @@ -194,7 +194,7 @@ void Conv2dNeonK5x5S1(const float *input, vi2 = vextq_f32(vi0, vi4, 2); vi3 = vextq_f32(vi0, vi4, 3); - Conv2dNeonK5x5SnLoadCalc1; + MACE_Conv2dNeonK5x5SnLoadCalc1; in_offset += in_width; filter_ptr0 += 5; diff --git a/mace/kernels/arm/conv_2d_neon_7x7.cc b/mace/kernels/arm/conv_2d_neon_7x7.cc index b6c2d5fd897f55af5add7a2b8699618d10b62bfc..8488127b6acb754f8cd66dc0827f240ee8bb9d4f 100644 --- a/mace/kernels/arm/conv_2d_neon_7x7.cc +++ b/mace/kernels/arm/conv_2d_neon_7x7.cc @@ -21,7 +21,7 @@ namespace mace { namespace kernels { -#define Conv2dArmv8NeonK7x7SnLoadCalc4 \ +#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ float32x4_t vf10, vf11; \ @@ -72,7 +72,7 @@ namespace kernels { vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); \ vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); -#define Conv2dArmv8NeonK7x7SnLoadCalc1 \ +#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc1 \ /* load filter (1 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ vf00 = vld1q_f32(filter_ptr0); \ @@ -87,7 +87,7 @@ namespace kernels { vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \ vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); -#define Conv2dArmv7NeonK7x7SnLoadCalc4 \ +#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ float32x4_t vf10, vf11; \ @@ -138,7 +138,7 @@ namespace kernels { vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); \ vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#define Conv2dArmv7NeonK7x7SnLoadCalc1 \ +#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc1 \ /* load filter (1 outch x 1 height x 4 width) */ \ float32x4_t vf00, vf01; \ vf00 = vld1q_f32(filter_ptr0); \ @@ -220,9 +220,9 @@ void Conv2dNeonK7x7S1(const float *input, vi6 = vextq_f32(vi4, vi8, 2); #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - Conv2dArmv7NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; @@ -284,9 +284,9 @@ void Conv2dNeonK7x7S1(const float *input, vi6 = vextq_f32(vi4, vi8, 2); #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; #else - Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif in_offset += in_width; @@ -381,9 +381,9 @@ void Conv2dNeonK7x7S2(const float *input, vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - Conv2dArmv7NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; @@ -450,9 +450,9 @@ void Conv2dNeonK7x7S2(const float *input, vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; #else - Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif in_offset += in_width; @@ -547,9 +547,9 @@ void Conv2dNeonK7x7S3(const float *input, vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - Conv2dArmv7NeonK7x7SnLoadCalc4; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; @@ -616,9 +616,9 @@ void Conv2dNeonK7x7S3(const float 
*input, vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] #if defined(__aarch64__) - Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; #else - Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif in_offset += in_width; diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 85db043c2e99657a3d311062c19ecf77e7379a7c..99973a4579067c0ba1bf9b8cd193ee8954ed9bb9 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -465,7 +465,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { padded_input_size = batch * input_channels * (input_height + pad_top + pad_bottom) * (input_width + pad_left + pad_right) * sizeof(float) + - EXTRA_BUFFER_PAD_SIZE; + MACE_EXTRA_BUFFER_PAD_SIZE; total_scratch_size += padded_input_size; } if (extra_output_height != height || extra_output_width != width) { diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc index b7a63a91fc91595496a39dfff1c881f4aeb929c1..0d385401d815eb8169e552d36cde52348207aca2 100644 --- a/mace/kernels/conv_pool_2d_util.cc +++ b/mace/kernels/conv_pool_2d_util.cc @@ -314,7 +314,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor, // Skip the padded top rows if (padding_same_value) { -#define COPY_INPUT \ +#define MACE_COPY_INPUT \ std::fill(output_data, output_data + padded_left, input[0]); \ output_data += padded_left; \ memcpy(output_data, input, width * sizeof(float)); \ @@ -328,20 +328,20 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor, for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { for (int k = 0; k < padded_top; ++k) { - COPY_INPUT; + MACE_COPY_INPUT; } for (int k = 0; k < height; ++k) { - COPY_INPUT; + MACE_COPY_INPUT; input += width; } input -= width; for (int k = 0; k < padded_bottom; ++k) { - COPY_INPUT; + MACE_COPY_INPUT; } input += width; } } -#undef COPY_INPUT +#undef MACE_COPY_INPUT } else { output_data += padded_top * output_width; for (int i = 0; i < batch; ++i) { diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc index 946b77af37d56f25b4c36a675a2d5074997fb6e2..f45c2824509c0abbac078b8523e02d879c9a1e19 100644 --- a/mace/kernels/opencl/deconv_2d_opencl.cc +++ b/mace/kernels/opencl/deconv_2d_opencl.cc @@ -43,9 +43,10 @@ void Deconv2dOpencl(cl::Kernel *kernel, const index_t channel_blocks = RoundUpDiv4(channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels); MACE_CHECK(stride > 0, "stride should > 0."); -#define WIDTH_BLK 5 +#define MACE_WIDTH_BLK 5 const index_t n_strides = (width + stride - 1) / stride; - const index_t width_blocks = ((n_strides + WIDTH_BLK -1)/ WIDTH_BLK) * stride; + const index_t width_blocks = + ((n_strides + MACE_WIDTH_BLK -1)/ MACE_WIDTH_BLK) * stride; const float stride_r = 1.f / static_cast(stride); const int padding_h = (paddings[0]+1) >> 1; const int padding_w = (paddings[0]+1) >> 1; diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index 467a595309c32f26de3ae271040045092252d840..1a6bbe6e01dc9c2ccd8425e65b8357d6757b13ac 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -125,7 +125,7 @@ bool BufferToImageOpImpl(Tensor *buffer, class OutOfRangeCheckTest : public ::testing::Test { protected: virtual void SetUp() { - setenv("MACE_OUT_OF_RANGE_CHECK", "1", 1); + setenv("OUT_OF_RANGE_CHECK", "1", 1); } }; diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 
0c90bc9e561bf13da75fad0c0b10bf85eedf30bf..37fd8117b83ab511e425e40eb185246a3856a172 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Activation(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ActivationOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ActivationOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ActivationOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ActivationOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ActivationOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ActivationOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc index e4ff005a45d2809cd0ded47741c59332965f0128..341b5f713fe86a87b6a3a4c1ee3c556ff8d2ebbf 100644 --- a/mace/ops/activation_benchmark.cc +++ b/mace/ops/activation_benchmark.cc @@ -71,25 +71,26 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) { } } // namespace -#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ +#define MACE_BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ReluBenchmark(iters, N, C, H, W); \ } \ - BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_RELU(N, C, H, W) \ - BM_RELU_MACRO(N, C, H, W, float, CPU); \ - BM_RELU_MACRO(N, C, H, W, float, GPU); \ - BM_RELU_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_RELU(N, C, H, W) \ + MACE_BM_RELU_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_RELU_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_RELU_MACRO(N, C, H, W, half, GPU); -BM_RELU(1, 1, 512, 512); -BM_RELU(1, 3, 128, 128); -BM_RELU(1, 3, 512, 512); -BM_RELU(1, 32, 112, 112); -BM_RELU(1, 64, 256, 256); +MACE_BM_RELU(1, 1, 512, 512); +MACE_BM_RELU(1, 3, 128, 128); +MACE_BM_RELU(1, 3, 512, 512); +MACE_BM_RELU(1, 32, 112, 112); +MACE_BM_RELU(1, 64, 256, 256); namespace { template @@ -138,25 +139,26 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) { } } // namespace -#define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ +#define MACE_BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ReluxBenchmark(iters, N, C, H, W); \ } \ - BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_RELUX(N, C, H, W) \ - 
BM_RELUX_MACRO(N, C, H, W, float, CPU); \ - BM_RELUX_MACRO(N, C, H, W, float, GPU); \ - BM_RELUX_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_RELUX(N, C, H, W) \ + MACE_BM_RELUX_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_RELUX_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_RELUX_MACRO(N, C, H, W, half, GPU); -BM_RELUX(1, 1, 512, 512); -BM_RELUX(1, 3, 128, 128); -BM_RELUX(1, 3, 512, 512); -BM_RELUX(1, 32, 112, 112); -BM_RELUX(1, 64, 256, 256); +MACE_BM_RELUX(1, 1, 512, 512); +MACE_BM_RELUX(1, 3, 128, 128); +MACE_BM_RELUX(1, 3, 512, 512); +MACE_BM_RELUX(1, 32, 112, 112); +MACE_BM_RELUX(1, 64, 256, 256); namespace { template @@ -212,25 +214,26 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) { } } // namespace -#define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ +#define MACE_BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ PreluBenchmark(iters, N, C, H, W); \ } \ - BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_PRELU(N, C, H, W) \ - BM_PRELU_MACRO(N, C, H, W, float, CPU); \ - BM_PRELU_MACRO(N, C, H, W, float, GPU); \ - BM_PRELU_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_PRELU(N, C, H, W) \ + MACE_BM_PRELU_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_PRELU_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_PRELU_MACRO(N, C, H, W, half, GPU); -BM_PRELU(1, 1, 512, 512); -BM_PRELU(1, 3, 128, 128); -BM_PRELU(1, 3, 512, 512); -BM_PRELU(1, 32, 112, 112); -BM_PRELU(1, 64, 256, 256); +MACE_BM_PRELU(1, 1, 512, 512); +MACE_BM_PRELU(1, 3, 128, 128); +MACE_BM_PRELU(1, 3, 512, 512); +MACE_BM_PRELU(1, 32, 112, 112); +MACE_BM_PRELU(1, 64, 256, 256); namespace { template @@ -277,25 +280,26 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) { } } // namespace -#define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ +#define MACE_BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ TanhBenchmark(iters, N, C, H, W); \ } \ - BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_TANH(N, C, H, W) \ - BM_TANH_MACRO(N, C, H, W, float, CPU); \ - BM_TANH_MACRO(N, C, H, W, float, GPU); \ - BM_TANH_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_TANH(N, C, H, W) \ + MACE_BM_TANH_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_TANH_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_TANH_MACRO(N, C, H, W, half, GPU); -BM_TANH(1, 1, 512, 512); -BM_TANH(1, 3, 128, 128); -BM_TANH(1, 3, 512, 512); -BM_TANH(1, 32, 112, 112); -BM_TANH(1, 64, 256, 256); +MACE_BM_TANH(1, 1, 512, 512); +MACE_BM_TANH(1, 3, 128, 128); +MACE_BM_TANH(1, 3, 512, 512); +MACE_BM_TANH(1, 32, 112, 112); +MACE_BM_TANH(1, 64, 256, 256); namespace { template @@ -343,26 +347,26 @@ void SigmoidBenchmark( } } // namespace -#define BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void 
BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - SigmoidBenchmark(iters, N, C, H, W); \ - } \ - BENCHMARK(BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) - -#define BM_SIGMOID(N, C, H, W) \ - BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \ - BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \ - BM_SIGMOID_MACRO(N, C, H, W, half, GPU); - -BM_SIGMOID(1, 1, 512, 512); -BM_SIGMOID(1, 3, 128, 128); -BM_SIGMOID(1, 3, 512, 512); -BM_SIGMOID(1, 32, 112, 112); -BM_SIGMOID(1, 64, 256, 256); +#define MACE_BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + SigmoidBenchmark(iters, N, C, H, W); \ + } \ + MACE_BENCHMARK(MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define MACE_BM_SIGMOID(N, C, H, W) \ + MACE_BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_SIGMOID_MACRO(N, C, H, W, half, GPU); + +MACE_BM_SIGMOID(1, 1, 512, 512); +MACE_BM_SIGMOID(1, 3, 128, 128); +MACE_BM_SIGMOID(1, 3, 512, 512); +MACE_BM_SIGMOID(1, 32, 112, 112); +MACE_BM_SIGMOID(1, 64, 256, 256); } // namespace test } // namespace ops diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 1ad27c2f53e3a9493c4cd6cc3c05500f400ec35f..6bfc4c09503501c0d4aa601335edbfb6fc86453c 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_AddN(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - AddNOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + AddNOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - AddNOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + AddNOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - AddNOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + AddNOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index 36c9948e9e3e80bf91c31a7ad27a41b88a1bf4dc..1b72c79124cb2cd94fc53687e243fa804023d563 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -70,26 +70,28 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { } } // namespace -#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \ - static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ - int iters) { \ +#define MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \ + static void \ + MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * INPUTS * N * H * W * C; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ AddNBenchmark(iters, INPUTS, N, H, W, C); \ } \ - 
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) -#define BM_ADDN(INPUTS, N, H, W, C) \ - BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \ - BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \ - BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU); +#define MACE_BM_ADDN(INPUTS, N, H, W, C) \ + MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \ + MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \ + MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU); -BM_ADDN(2, 1, 256, 256, 32); -BM_ADDN(2, 1, 128, 128, 32); -BM_ADDN(4, 1, 128, 128, 3); -BM_ADDN(2, 1, 256, 256, 3); -BM_ADDN(2, 1, 512, 512, 3); +MACE_BM_ADDN(2, 1, 256, 256, 32); +MACE_BM_ADDN(2, 1, 128, 128, 32); +MACE_BM_ADDN(4, 1, 128, 128, 3); +MACE_BM_ADDN(2, 1, 256, 256, 3); +MACE_BM_ADDN(2, 1, 512, 512, 3); } // namespace test } // namespace ops diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index c9e6db59887fdbadfa8b9abdcd9a57df07abfb9b..fe63559285dc91f061199d4747feee5e06f2d8c3 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_BatchNorm(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + BatchNormOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BatchNormOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BatchNormOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h index 966b039f7f5a0974b70b6f0e9865e87e43c30c5a..5963ee4b1634534536a6aef36f2e31cd5bf211a6 100644 --- a/mace/ops/batch_norm.h +++ b/mace/ops/batch_norm.h @@ -60,8 +60,8 @@ class BatchNormOp : public Operator { kernels::BatchNormFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index f07966061f1bd0fd58806d439679f43ff68c3d99..648ddfca6b1606cd52f5343767d48533766d1143 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -95,34 +95,34 @@ void BatchNorm( } } // namespace -#define BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BatchNorm(iters, N, C, H, W); \ - } \ - BENCHMARK(BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) +#define MACE_BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N 
* C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BatchNorm(iters, N, C, H, W); \ + } \ + MACE_BENCHMARK(MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_BATCH_NORM(N, C, H, W) \ - BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \ - BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \ - BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_BATCH_NORM(N, C, H, W) \ + MACE_BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU); -BM_BATCH_NORM(1, 1, 512, 512); -BM_BATCH_NORM(1, 3, 128, 128); -BM_BATCH_NORM(1, 3, 512, 512); -BM_BATCH_NORM(1, 32, 112, 112); -BM_BATCH_NORM(1, 64, 256, 256); -BM_BATCH_NORM(1, 64, 512, 512); -BM_BATCH_NORM(1, 128, 56, 56); -BM_BATCH_NORM(1, 128, 256, 256); -BM_BATCH_NORM(1, 256, 14, 14); -BM_BATCH_NORM(1, 512, 14, 14); -BM_BATCH_NORM(1, 1024, 7, 7); -BM_BATCH_NORM(32, 1, 256, 256); -BM_BATCH_NORM(32, 3, 256, 256); +MACE_BM_BATCH_NORM(1, 1, 512, 512); +MACE_BM_BATCH_NORM(1, 3, 128, 128); +MACE_BM_BATCH_NORM(1, 3, 512, 512); +MACE_BM_BATCH_NORM(1, 32, 112, 112); +MACE_BM_BATCH_NORM(1, 64, 256, 256); +MACE_BM_BATCH_NORM(1, 64, 512, 512); +MACE_BM_BATCH_NORM(1, 128, 56, 56); +MACE_BM_BATCH_NORM(1, 128, 256, 256); +MACE_BM_BATCH_NORM(1, 256, 14, 14); +MACE_BM_BATCH_NORM(1, 512, 14, 14); +MACE_BM_BATCH_NORM(1, 1024, 7, 7); +MACE_BM_BATCH_NORM(32, 1, 256, 256); +MACE_BM_BATCH_NORM(32, 3, 256, 256); } // namespace test } // namespace ops diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index e5c137e04c86499e9a505ce8b735dc380baf2f67..50bc84ed2f56d46c62ccd3356f6023978373fc6b 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -18,22 +18,22 @@ namespace mace { namespace ops { void Register_BatchToSpaceND(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + BatchToSpaceNDOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BatchToSpaceNDOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BatchToSpaceNDOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h index b95d0c3361fbad30f7af29271eee7b2e4e8fd0f2..4f688dc427b0bfa1150e2ce340f9d9a899a10a65 100644 --- a/mace/ops/batch_to_space.h +++ b/mace/ops/batch_to_space.h @@ -44,8 +44,8 @@ class BatchToSpaceNDOp : public Operator { kernels::SpaceToBatchFunctor functor_; protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc index fdb7331706afc7635f103bf02e29b30ca29c8cbb..c6b3e25a5b95208db2042384e00460333040ba96 100644 --- 
a/mace/ops/batch_to_space_benchmark.cc +++ b/mace/ops/batch_to_space_benchmark.cc @@ -64,25 +64,26 @@ void BMBatchToSpace( } } // namespace -#define BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE) \ - static void \ - BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BMBatchToSpace(iters, N, C, H, W, ARG); \ - } \ - BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE) +#define MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE) \ + static void \ + MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMBatchToSpace(iters, N, C, H, W, ARG); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE) -#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \ - BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \ - BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU); +#define MACE_BM_BATCH_TO_SPACE(N, H, W, C, ARG) \ + MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \ + MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU); -BM_BATCH_TO_SPACE(128, 8, 8, 128, 2); -BM_BATCH_TO_SPACE(4, 128, 128, 32, 2); -BM_BATCH_TO_SPACE(16, 64, 64, 32, 4); -BM_BATCH_TO_SPACE(64, 32, 32, 32, 8); +MACE_BM_BATCH_TO_SPACE(128, 8, 8, 128, 2); +MACE_BM_BATCH_TO_SPACE(4, 128, 128, 32, 2); +MACE_BM_BATCH_TO_SPACE(16, 64, 64, 32, 4); +MACE_BM_BATCH_TO_SPACE(64, 32, 32, 32, 8); } // namespace test } // namespace ops diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index dd4e20f4cfd8bb7fed973ca60fd2f5a334876afd..deb67368c9964e68c95ef411c16046e9be7506bc 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_BiasAdd(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + BiasAddOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BiasAddOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BiasAddOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/bias_add.h b/mace/ops/bias_add.h index f69f18e6d47147e7677cd139de7a2adc48881453..1f242253785a63e4de3cc22f1b5f4fea494e5792 100644 --- a/mace/ops/bias_add.h +++ b/mace/ops/bias_add.h @@ -46,8 +46,8 @@ class BiasAddOp : public Operator { kernels::BiasAddFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, BIAS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 
851c8a17fc28fb50dc8f33b6472b00f618e9adfe..c0f6ad72b688eab70b8e0d4eb389230ad072048b 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -72,34 +72,34 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { } } // namespace -#define BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BiasAdd(iters, N, C, H, W); \ - } \ - BENCHMARK(BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) +#define MACE_BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BiasAdd(iters, N, C, H, W); \ + } \ + MACE_BENCHMARK(MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_BIAS_ADD(N, C, H, W) \ - BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \ - BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \ - BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_BIAS_ADD(N, C, H, W) \ + MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \ + MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU); -BM_BIAS_ADD(1, 1, 512, 512); -BM_BIAS_ADD(1, 3, 128, 128); -BM_BIAS_ADD(1, 3, 512, 512); -BM_BIAS_ADD(1, 32, 112, 112); -BM_BIAS_ADD(1, 64, 256, 256); -BM_BIAS_ADD(1, 64, 512, 512); -BM_BIAS_ADD(1, 128, 56, 56); -BM_BIAS_ADD(1, 128, 256, 256); -BM_BIAS_ADD(1, 256, 14, 14); -BM_BIAS_ADD(1, 512, 14, 14); -BM_BIAS_ADD(1, 1024, 7, 7); -BM_BIAS_ADD(32, 1, 256, 256); -BM_BIAS_ADD(32, 3, 256, 256); +MACE_BM_BIAS_ADD(1, 1, 512, 512); +MACE_BM_BIAS_ADD(1, 3, 128, 128); +MACE_BM_BIAS_ADD(1, 3, 512, 512); +MACE_BM_BIAS_ADD(1, 32, 112, 112); +MACE_BM_BIAS_ADD(1, 64, 256, 256); +MACE_BM_BIAS_ADD(1, 64, 512, 512); +MACE_BM_BIAS_ADD(1, 128, 56, 56); +MACE_BM_BIAS_ADD(1, 128, 256, 256); +MACE_BM_BIAS_ADD(1, 256, 14, 14); +MACE_BM_BIAS_ADD(1, 512, 14, 14); +MACE_BM_BIAS_ADD(1, 1024, 7, 7); +MACE_BM_BIAS_ADD(32, 1, 256, 256); +MACE_BM_BIAS_ADD(32, 3, 256, 256); } // namespace test } // namespace ops diff --git a/mace/ops/buffer_to_image.cc b/mace/ops/buffer_to_image.cc index 5a567d7495f4a1abe5cad4eb7389bf19a36e21fb..04cb9b8292340004600353c760a6dd43e8555104 100644 --- a/mace/ops/buffer_to_image.cc +++ b/mace/ops/buffer_to_image.cc @@ -18,17 +18,17 @@ namespace mace { namespace ops { void Register_BufferToImage(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferToImageOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BufferToImageOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferToImageOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + BufferToImageOp); } } // namespace ops diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_to_image.h index ae8de6965d7647e7a500d908231279795d1841c9..6d1d03957c9e2863d1fd123ef3be541c0e87bc48 100644 --- a/mace/ops/buffer_to_image.h +++ b/mace/ops/buffer_to_image.h 
@@ -42,8 +42,8 @@ class BufferToImageOp : public Operator { kernels::BufferToImageFunctor functor_; protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index 3ef8ce789bb989cf2f54d922a908819f1c6c966c..7d94c525f7a59011d53f3e6f9f78538e1f91b271 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -54,36 +54,36 @@ void FilterBufferToImage(int iters, } } // namespace -#define BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \ - static void BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \ +#define MACE_BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \ + static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * O * I * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ FilterBufferToImage(iters, O, I, H, W); \ } \ - BENCHMARK(BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_B2I(O, I, H, W) \ - BM_B2I_MACRO(O, I, H, W, float, GPU); \ - BM_B2I_MACRO(O, I, H, W, half, GPU); +#define MACE_BM_B2I(O, I, H, W) \ + MACE_BM_B2I_MACRO(O, I, H, W, float, GPU); \ + MACE_BM_B2I_MACRO(O, I, H, W, half, GPU); -BM_B2I(5, 3, 3, 3); -BM_B2I(5, 3, 7, 7); -BM_B2I(32, 16, 1, 1); -BM_B2I(32, 16, 3, 3); -BM_B2I(32, 16, 5, 5); -BM_B2I(32, 16, 7, 7); -BM_B2I(64, 32, 1, 1); -BM_B2I(64, 32, 3, 3); -BM_B2I(64, 32, 5, 5); -BM_B2I(64, 32, 7, 7); -BM_B2I(128, 64, 1, 1); -BM_B2I(128, 64, 3, 3); -BM_B2I(128, 32, 1, 1); -BM_B2I(128, 32, 3, 3); -BM_B2I(256, 32, 1, 1); -BM_B2I(256, 32, 3, 3); +MACE_BM_B2I(5, 3, 3, 3); +MACE_BM_B2I(5, 3, 7, 7); +MACE_BM_B2I(32, 16, 1, 1); +MACE_BM_B2I(32, 16, 3, 3); +MACE_BM_B2I(32, 16, 5, 5); +MACE_BM_B2I(32, 16, 7, 7); +MACE_BM_B2I(64, 32, 1, 1); +MACE_BM_B2I(64, 32, 3, 3); +MACE_BM_B2I(64, 32, 5, 5); +MACE_BM_B2I(64, 32, 7, 7); +MACE_BM_B2I(128, 64, 1, 1); +MACE_BM_B2I(128, 64, 3, 3); +MACE_BM_B2I(128, 32, 1, 1); +MACE_BM_B2I(128, 32, 3, 3); +MACE_BM_B2I(256, 32, 1, 1); +MACE_BM_B2I(256, 32, 3, 3); } // namespace test } // namespace ops diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index f7d23e31ae0788dd29d0b60661894667b76aacce..f3311be64271876ebec1b7967d38faecdfe1f200 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_ChannelShuffle(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ChannelShuffleOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ChannelShuffleOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ChannelShuffleOp); #endif 
// MACE_ENABLE_OPENCL } diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h index 246b286b182e86166a93e1e1e8b5bcc8f31c110c..bd9234c1abab8c3c6391f781a4b7177c1a82d5b1 100644 --- a/mace/ops/channel_shuffle.h +++ b/mace/ops/channel_shuffle.h @@ -50,8 +50,8 @@ class ChannelShuffleOp : public Operator { protected: const int group_; - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); private: kernels::ChannelShuffleFunctor functor_; diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc index 205d74e54145a801962c577e39f11085748af883..49f494c8a3bf23f64253ea68d314cbdc484d8f1a 100644 --- a/mace/ops/channel_shuffle_benchmark.cc +++ b/mace/ops/channel_shuffle_benchmark.cc @@ -69,25 +69,26 @@ void ChannelShuffle( } } // namespace -#define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \ - static void \ - BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - ChannelShuffle(iters, N, C, H, W, G); \ - } \ - BENCHMARK(BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE) +#define MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \ + static void \ + MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + ChannelShuffle(iters, N, C, H, W, G); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE) -#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \ - BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \ - BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \ - BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU); +#define MACE_BM_CHANNEL_SHUFFLE(N, C, H, W, G) \ + MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \ + MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \ + MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU); -BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8); -BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8); -BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8); +MACE_BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8); +MACE_BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8); +MACE_BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8); } // namespace test } // namespace ops diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 7f6f6fd893527e41b79e61997dd60460ed2ab81d..0275d497ca27e9d8285603c1ee8e2efae9065632 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Concat(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ConcatOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ConcatOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ConcatOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ConcatOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ConcatOp); + MACE_REGISTER_OPERATOR(op_registry, 
OpKeyBuilder("Concat") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ConcatOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/concat.h b/mace/ops/concat.h index 514771050b31bca99017d677b5c6f27ba16ba20e..be76371494a2116f180420ddadf75090bb103b54 100644 --- a/mace/ops/concat.h +++ b/mace/ops/concat.h @@ -51,7 +51,7 @@ class ConcatOp : public Operator { kernels::ConcatFunctor functor_; private: - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index af6f2943e2eb123c25beed279c9a8cad6f8a7e13..faf784c55d1aecc188572b96516d57596a7d8eba 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -52,16 +52,16 @@ void ConcatHelper(int iters, int concat_dim, int dim1) { } } // namespace -#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \ - static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \ - ConcatHelper(iters, DIM0, DIM1); \ - } \ - BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1) +#define MACE_BM_CONCAT_CPU_MACRO(DIM0, DIM1) \ + static void MACE_BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \ + ConcatHelper(iters, DIM0, DIM1); \ + } \ + MACE_BENCHMARK(MACE_BM_CONCAT_CPU_##DIM0##_##DIM1) -BM_CONCAT_CPU_MACRO(0, 1000); -BM_CONCAT_CPU_MACRO(0, 100000); -BM_CONCAT_CPU_MACRO(1, 1000); -BM_CONCAT_CPU_MACRO(1, 100000); +MACE_BM_CONCAT_CPU_MACRO(0, 1000); +MACE_BM_CONCAT_CPU_MACRO(0, 100000); +MACE_BM_CONCAT_CPU_MACRO(1, 1000); +MACE_BM_CONCAT_CPU_MACRO(1, 100000); namespace { template @@ -106,22 +106,22 @@ void OpenclConcatHelper(int iters, } } // namespace -#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \ - static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \ - std::vector shape = {N, H, W, C}; \ - OpenclConcatHelper(iters, shape, shape, 3); \ - } \ - BENCHMARK(BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE) - -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, float); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, float); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, float); - -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, half); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half); -BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half); +#define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \ + static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\ + std::vector shape = {N, H, W, C}; \ + OpenclConcatHelper(iters, shape, shape, 3); \ + } \ + MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE) + +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, float); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, float); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, float); + +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, half); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half); +MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half); } // namespace test } // namespace ops diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index e5f01d26cd5f1a1b1e4bb428b74342e0d529caec..29d3ac7159c1ca952065c2b7f9bbf28c67fcf9dd 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Conv2D(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") + .Device(DeviceType::CPU) + 
.TypeConstraint("T") + .Build(), + Conv2dOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + Conv2dOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + Conv2dOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h index 5e691268e4fe62a40e88e953deaf2b1232d8d27f..b15045cd18884e7112c19da7a6c5bdeab53560f0 100644 --- a/mace/ops/conv_2d.h +++ b/mace/ops/conv_2d.h @@ -54,8 +54,8 @@ class Conv2dOp : public ConvPool2dOpBase { kernels::Conv2dFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, FILTER, BIAS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index c0e5e28d7ed20b0aa3efc2f19d304723609dc083..63e8869ac52dd91356e99b634fb8eb33b627eeb4 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -105,11 +105,11 @@ void Conv2d(int iters, // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is // in-order. -#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, \ - DEVICE) \ +#define MACE_BM_CONV_2D_MACRO( \ + N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, DEVICE) \ static void \ - BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\ - ##_##P##_##OC##_##TYPE##_##DEVICE( \ + MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##\ + DILATION##_##P##_##OC##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ int64_t pad_h = 0, pad_w = 0; \ @@ -128,54 +128,53 @@ void Conv2d(int iters, Conv2d(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \ mace::Padding::P, OC); \ } \ - BENCHMARK( \ - BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\ - ##_##P##_##OC##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##\ + DILATION##_##P##_##OC##_##TYPE##_##DEVICE) -#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU); +#define MACE_BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \ + MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \ + MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \ + MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU); // Filter sizes and data alignments -BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128); -BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128); -BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128); -BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128); 
+MACE_BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128); +MACE_BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128); +MACE_BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128); +MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128); // 3 channels input -BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3); -BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32); -BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32); +MACE_BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3); +MACE_BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32); +MACE_BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32); // Dilations -BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32); -BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32); +MACE_BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32); +MACE_BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32); // MobileNet -BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128); -BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024); +MACE_BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128); +MACE_BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024); -BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32); -BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32); +MACE_BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32); +MACE_BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32); -BM_CONV_2D(1, 192, 17, 17, 1, 7, 1, 1, SAME, 192); -BM_CONV_2D(1, 192, 17, 17, 7, 1, 1, 1, SAME, 192); -BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192); - -BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2); -BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2); -BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2); +MACE_BM_CONV_2D(1, 192, 17, 17, 1, 7, 1, 1, SAME, 192); +MACE_BM_CONV_2D(1, 192, 17, 17, 7, 1, 1, 1, SAME, 192); +MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192); +MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2); +MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2); +MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2); } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 4666296d8bf873c9dce2b855ee97fc92b5301cfc..342e27aa13151837febf2256927787d5205585ab 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Deconv2D(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + Deconv2dOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + Deconv2dOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + Deconv2dOp); #endif // 
MACE_ENABLE_OPENCL } diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h index c73bea865e8c3de3454e2b1c453b68d2cfac6431..1c52e10c80f1544eadb3faa0f85d05be0b70dadd 100644 --- a/mace/ops/deconv_2d.h +++ b/mace/ops/deconv_2d.h @@ -49,8 +49,8 @@ class Deconv2dOp : public ConvPool2dOpBase { kernels::Deconv2dFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, FILTER, BIAS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index af8d3b87e853b8330df962bf4841f9fbdff3c296..2a414e3c5e7b2e8a83855a6bec7ea9aa606d8d41 100644 --- a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -94,11 +94,11 @@ static void Deconv2d(int iters, // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is // in-order. -#define BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, OH, OW, P, OC, TYPE, \ - DEVICE) \ +#define MACE_BM_DECONV_2D_MACRO( \ + N, C, H, W, KH, KW, STRIDE, OH, OW, P, OC, TYPE, DEVICE) \ static void \ - BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_##OW\ - ##_##P##_##OC##_##TYPE##_##DEVICE( \ + MACE_BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_\ + ##OW##_##P##_##OC##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ int64_t oh = OH; \ @@ -110,30 +110,30 @@ static void Deconv2d(int iters, Deconv2d(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \ mace::Padding::P, OC); \ } \ - BENCHMARK( \ - BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_##OW##\ - _##P##_##OC##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_\ + ##OW##_##P##_##OC##_##TYPE##_##DEVICE) // TODO(liutuo): add cpu benchmark when optimized. 
-#define BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \ - BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \ - BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU); +#define MACE_BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \ + MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \ + MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU); -BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256); -BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128); +MACE_BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256); +MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128); -BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128); -BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32); -BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32); -BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32); -BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32); +MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128); +MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32); +MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32); +MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32); +MACE_BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32); -BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3); +MACE_BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3); -BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128); -BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128); -BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32); -BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32); +MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128); +MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128); +MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32); +MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32); } // namespace test } // namespace ops diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index 2f0e38c4a3ea48f4c09119f0c791f34f8fe8f9fc..682a6770f9fa743f5cf17750ba18307d3eed4fb2 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_DepthToSpace(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DepthToSpaceOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + DepthToSpaceOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthToSpaceOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + DepthToSpaceOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthToSpaceOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + DepthToSpaceOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h index 0df9ecbdc9a3292fb87d592199ee024438358511..4be3f2a0dc08128eec9ca7141df414ab73c9bf81 100644 --- a/mace/ops/depth_to_space.h +++ b/mace/ops/depth_to_space.h @@ -55,8 +55,8 @@ class DepthToSpaceOp : public Operator { protected: const int block_size_; - 
OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); private: kernels::DepthToSpaceOpFunctor functor_; diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc index bf05f6929cdd292beaa69854d1ef04b1ed159cda..431151671dac90fb6b4e535db62ea256ff17d794 100644 --- a/mace/ops/depth_to_space_benchmark.cc +++ b/mace/ops/depth_to_space_benchmark.cc @@ -69,25 +69,26 @@ void DepthToSpace( } } // namespace -#define BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE) \ - static void \ - BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - DepthToSpace(iters, N, C, H, W, G); \ - } \ - BENCHMARK(BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE) +#define MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE) \ + static void \ + MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + DepthToSpace(iters, N, C, H, W, G); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE) -#define BM_DEPTH_TO_SPACE(N, C, H, W, G) \ - BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \ - BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \ - BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU); +#define MACE_BM_DEPTH_TO_SPACE(N, C, H, W, G) \ + MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \ + MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \ + MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU); -BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4); -BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4); -BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4); +MACE_BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4); +MACE_BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4); +MACE_BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4); } // namespace test } // namespace ops diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index fc0205dbaa6c6f545b62d8a71460aae71e8d804e..cdb53595088bed8b163a74bf54707b3d0f129ab7 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_DepthwiseConv2d(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + DepthwiseConv2dOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + DepthwiseConv2dOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + DepthwiseConv2dOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h index 
2d6b1388ae73b3808ecf07335d3bd06687ad3486..2762aea5f48114413b90cc0250ab010de4486244 100644 --- a/mace/ops/depthwise_conv2d.h +++ b/mace/ops/depthwise_conv2d.h @@ -55,8 +55,8 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase { kernels::DepthwiseConv2dFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, FILTER, BIAS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index acee2265f17fb1cead26efd84c9173ee4bd73672..ea847fd103f6c5af97ee7cea7a2cac7762c2a3c7 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ b/mace/ops/depthwise_conv2d_benchmark.cc @@ -101,61 +101,61 @@ void DepthwiseConv2d(int iters, } } // namespace -#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \ - DEVICE) \ - static void \ - BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\ - ##P##_##M##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t dilation = 1; \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - int64_t pad_h = 0, pad_w = 0; \ - if (P == SAME) { \ - pad_h = KH / 2; \ - pad_w = KW / 2; \ - } \ - int64_t oh = \ - (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \ - int64_t ow = \ - (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \ - const int64_t macc = \ - static_cast(iters) * N * C * M * oh * ow * (KH * KW + 1); \ - mace::testing::MaccProcessed(macc); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - DepthwiseConv2d(iters, N, C, H, W, KH, KW, STRIDE, \ - mace::Padding::P, M); \ - } \ - BENCHMARK( \ - BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\ - ##P##_##M##_##TYPE##_##DEVICE) +#define MACE_BM_DEPTHWISE_CONV_2D_MACRO( \ + N, C, H, W, KH, KW, STRIDE, P, M, TYPE, DEVICE) \ + static void \ + MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE\ + ##_##P##_##M##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t dilation = 1; \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + int64_t pad_h = 0, pad_w = 0; \ + if (P == SAME) { \ + pad_h = KH / 2; \ + pad_w = KW / 2; \ + } \ + int64_t oh = \ + (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \ + int64_t ow = \ + (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \ + const int64_t macc = \ + static_cast(iters) * N * C * M * oh * ow * (KH * KW + 1); \ + mace::testing::MaccProcessed(macc); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + DepthwiseConv2d(iters, N, C, H, W, KH, KW, STRIDE, \ + mace::Padding::P, M); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE\ + ##_##P##_##M##_##TYPE##_##DEVICE) -#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \ - BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ - BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \ - BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU); +#define MACE_BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \ + MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ + MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \ + MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU); -BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1); 
-BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1); -BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1); +MACE_BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1); } // namespace test diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 427716eadb1c6ee50b40c74461835e31c7419049..bbb214352d3007e53d87ac859050bee5146d59d0 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Eltwise(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + EltwiseOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + EltwiseOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") 
+ .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + EltwiseOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h index 95c48b78bc16265a15700dc4323e3505990795ac..efa87dd9ca5db73c6a01fcf843de7550bd033918 100644 --- a/mace/ops/eltwise.h +++ b/mace/ops/eltwise.h @@ -43,7 +43,7 @@ class EltwiseOp : public Operator { kernels::EltwiseFunctor functor_; private: - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 55308069e4d79801b2d67dfeb9bcf9ee01f2ad9b..d12c97b3aa87886b5c90db60a02a792c7b0d561e 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -76,30 +76,31 @@ void EltwiseBenchmark( } } // namespace -#define BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \ - static void \ - BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * H * W * C; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - EltwiseBenchmark( \ - iters, static_cast(ELT_TYPE), N, H, W, C); \ - } \ - BENCHMARK(BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) +#define MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \ + static void \ + MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W * C; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + EltwiseBenchmark( \ + iters, static_cast(ELT_TYPE), N, H, W, C); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) -#define BM_ELTWISE(ELT_TYPE, N, H, W, C) \ - BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \ - BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \ - BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU); +#define MACE_BM_ELTWISE(ELT_TYPE, N, H, W, C) \ + MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \ + MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \ + MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU); -BM_ELTWISE(2, 1, 128, 128, 32); -BM_ELTWISE(2, 1, 240, 240, 256); -BM_ELTWISE(2, 1, 256, 256, 32); -BM_ELTWISE(0, 1, 128, 128, 32); -BM_ELTWISE(0, 1, 240, 240, 256); -BM_ELTWISE(5, 1, 128, 128, 32); -BM_ELTWISE(5, 1, 240, 240, 256); +MACE_BM_ELTWISE(2, 1, 128, 128, 32); +MACE_BM_ELTWISE(2, 1, 240, 240, 256); +MACE_BM_ELTWISE(2, 1, 256, 256, 32); +MACE_BM_ELTWISE(0, 1, 128, 128, 32); +MACE_BM_ELTWISE(0, 1, 240, 240, 256); +MACE_BM_ELTWISE(5, 1, 128, 128, 32); +MACE_BM_ELTWISE(5, 1, 240, 240, 256); } // namespace test } // namespace ops diff --git a/mace/ops/folded_batch_norm.cc b/mace/ops/folded_batch_norm.cc index 6c46195d88579771ab45f5abe3ceb43189a75678..ace0b857d2c3a8a8997559424e536974c6ae634b 100644 --- a/mace/ops/folded_batch_norm.cc +++ b/mace/ops/folded_batch_norm.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + FoldedBatchNormOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - 
.Build(), - FoldedBatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + FoldedBatchNormOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + FoldedBatchNormOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/folded_batch_norm.h b/mace/ops/folded_batch_norm.h index c9047ef82bbb38ab8e0de38bec1a661510fb9064..03543c1c8848c6bb0ffd40d03c11d0edd9851fa3 100644 --- a/mace/ops/folded_batch_norm.h +++ b/mace/ops/folded_batch_norm.h @@ -56,8 +56,8 @@ class FoldedBatchNormOp : public Operator { kernels::BatchNormFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index d747916c1e29fcac8a04fb6e4642ce0a5f34e658..3147a598abf43682b0b599bd443c765b620f09ef 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_FullyConnected(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + FullyConnectedOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + FullyConnectedOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + FullyConnectedOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/fully_connected.h b/mace/ops/fully_connected.h index c2539169856fb55de9a1e502ff00cd6a9129d36b..a4b17d4e446f99fb7ea33e8b4762c68f6abeabab 100644 --- a/mace/ops/fully_connected.h +++ b/mace/ops/fully_connected.h @@ -72,8 +72,8 @@ class FullyConnectedOp : public Operator { kernels::FullyConnectedFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index fd1210b6786a0009ab0d814a6ab3ee0dee9e85c5..021b6396a7786ea322936f53617e7b12f4fae1b5 100644 --- a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -82,28 +82,28 @@ void FCBenchmark( } } // namespace -#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \ - static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t macc = \ - static_cast(iters) * N * C * H * W * OC + OC; \ - const int64_t tot = \ - static_cast(iters) * (N + OC) * C * H * W + OC; \ - mace::testing::MaccProcessed(macc); \ - 
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - FCBenchmark(iters, N, H, W, C, OC); \ - } \ - BENCHMARK(BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE) +#define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \ + static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t macc = \ + static_cast(iters) * N * C * H * W * OC + OC; \ + const int64_t tot = \ + static_cast(iters) * (N + OC) * C * H * W + OC; \ + mace::testing::MaccProcessed(macc); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + FCBenchmark(iters, N, H, W, C, OC); \ + } \ + MACE_BENCHMARK(MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE) -#define BM_FC(N, H, W, C, OC) \ - BM_FC_MACRO(N, H, W, C, OC, float, CPU); \ - BM_FC_MACRO(N, H, W, C, OC, float, GPU); \ - BM_FC_MACRO(N, H, W, C, OC, half, GPU); +#define MACE_BM_FC(N, H, W, C, OC) \ + MACE_BM_FC_MACRO(N, H, W, C, OC, float, CPU); \ + MACE_BM_FC_MACRO(N, H, W, C, OC, float, GPU); \ + MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU); -BM_FC(1, 16, 16, 32, 32); -BM_FC(1, 8, 8, 32, 1000); -BM_FC(1, 2, 2, 512, 2); -BM_FC(1, 7, 7, 512, 2048); +MACE_BM_FC(1, 16, 16, 32, 32); +MACE_BM_FC(1, 8, 8, 32, 1000); +MACE_BM_FC(1, 2, 2, 512, 2); +MACE_BM_FC(1, 7, 7, 512, 2048); } // namespace test } // namespace ops diff --git a/mace/ops/image_to_buffer.cc b/mace/ops/image_to_buffer.cc index 02bcc5f70cb6c04e8a9d3c0e20832b26c7ed2ea8..168f75b6dcbff8e233375cc547b89a7bc56e3d9f 100644 --- a/mace/ops/image_to_buffer.cc +++ b/mace/ops/image_to_buffer.cc @@ -18,17 +18,17 @@ namespace mace { namespace ops { void Register_ImageToBuffer(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ImageToBufferOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ImageToBufferOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ImageToBufferOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ImageToBufferOp); } } // namespace ops diff --git a/mace/ops/image_to_buffer.h b/mace/ops/image_to_buffer.h index 1365e1a8d369c5f07192962b84ef075ed7f5de73..9d7420336f305e24f6342f065b1c08e2b952457e 100644 --- a/mace/ops/image_to_buffer.h +++ b/mace/ops/image_to_buffer.h @@ -41,8 +41,8 @@ class ImageToBufferOp : public Operator { kernels::ImageToBufferFunctor functor_; protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc index 25a58aa1d3b03e1ae09a1f52cfe3351ab0eb803d..8517c0140aba91bfbf79cfeaa4df4918b72d0f9b 100644 --- a/mace/ops/local_response_norm.cc +++ b/mace/ops/local_response_norm.cc @@ -18,11 +18,11 @@ namespace mace { namespace ops { void Register_LocalResponseNorm(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - LocalResponseNormOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + LocalResponseNormOp); } } // namespace ops diff --git a/mace/ops/local_response_norm.h b/mace/ops/local_response_norm.h index 
502a340439a2f563647221e7c4b8e964ea3f4048..6f2bd69150a661c0b01c2849bb63d5aa2775a792 100644 --- a/mace/ops/local_response_norm.h +++ b/mace/ops/local_response_norm.h @@ -53,8 +53,8 @@ class LocalResponseNormOp : public Operator { kernels::LocalResponseNormFunctor functor_; protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/local_response_norm_benchmark.cc b/mace/ops/local_response_norm_benchmark.cc index 689eea72a736702c58e45f38f8bc745d06cee828..ee15c3e01223164f5978415deb9f862e45fa320b 100644 --- a/mace/ops/local_response_norm_benchmark.cc +++ b/mace/ops/local_response_norm_benchmark.cc @@ -55,32 +55,34 @@ static void LocalResponseNorm( net.Sync(); } -#define BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\ - int iters) { \ +#define MACE_BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void \ + MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ LocalResponseNorm(iters, N, C, H, W); \ } \ - BENCHMARK(BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_LOCAL_RESPONSE_NORM(N, C, H, W) \ - BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU); +#define MACE_BM_LOCAL_RESPONSE_NORM(N, C, H, W) \ + MACE_BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU); -BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512); -BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128); -BM_LOCAL_RESPONSE_NORM(1, 3, 512, 512); -BM_LOCAL_RESPONSE_NORM(1, 32, 112, 112); -BM_LOCAL_RESPONSE_NORM(1, 64, 256, 256); -BM_LOCAL_RESPONSE_NORM(1, 64, 512, 512); -BM_LOCAL_RESPONSE_NORM(1, 128, 56, 56); -BM_LOCAL_RESPONSE_NORM(1, 128, 256, 256); -BM_LOCAL_RESPONSE_NORM(1, 256, 14, 14); -BM_LOCAL_RESPONSE_NORM(1, 512, 14, 14); -BM_LOCAL_RESPONSE_NORM(1, 1024, 7, 7); -BM_LOCAL_RESPONSE_NORM(32, 1, 256, 256); -BM_LOCAL_RESPONSE_NORM(32, 3, 256, 256); +MACE_BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512); +MACE_BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128); +MACE_BM_LOCAL_RESPONSE_NORM(1, 3, 512, 512); +MACE_BM_LOCAL_RESPONSE_NORM(1, 32, 112, 112); +MACE_BM_LOCAL_RESPONSE_NORM(1, 64, 256, 256); +MACE_BM_LOCAL_RESPONSE_NORM(1, 64, 512, 512); +MACE_BM_LOCAL_RESPONSE_NORM(1, 128, 56, 56); +MACE_BM_LOCAL_RESPONSE_NORM(1, 128, 256, 256); +MACE_BM_LOCAL_RESPONSE_NORM(1, 256, 14, 14); +MACE_BM_LOCAL_RESPONSE_NORM(1, 512, 14, 14); +MACE_BM_LOCAL_RESPONSE_NORM(1, 1024, 7, 7); +MACE_BM_LOCAL_RESPONSE_NORM(32, 1, 256, 256); +MACE_BM_LOCAL_RESPONSE_NORM(32, 3, 256, 256); } // namespace test } // namespace ops diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index b65bf4841b16f28807ae19dbc97e0a1b52e48c19..fa342659ef4f0bf771a1edc644cc9a9d87932a0d 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_MatMul(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - MatMulOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + MatMulOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - 
.Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - MatMulOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + MatMulOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - MatMulOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + MatMulOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index cc9b86dab67a48b2eeee9eea3352528e04d0cd94..382fdf7c8829a5887f7920a74285ca0a33178c4b 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -67,27 +67,28 @@ void MatMulBenchmark( } } // namespace -#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \ - static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \ +#define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \ + static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t macc = static_cast(iters) * N * C * H * W; \ const int64_t tot = static_cast(iters) * N * (C * H + H * W); \ mace::testing::MaccProcessed(macc); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ MatMulBenchmark(iters, N, H, C, W); \ } \ - BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE) + MACE_BENCHMARK(MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE) -#define BM_MATMUL(N, H, C, W) \ - BM_MATMUL_MACRO(N, H, C, W, float, CPU); \ - BM_MATMUL_MACRO(N, H, C, W, float, GPU); \ - BM_MATMUL_MACRO(N, H, C, W, half, GPU); +#define MACE_BM_MATMUL(N, H, C, W) \ + MACE_BM_MATMUL_MACRO(N, H, C, W, float, CPU); \ + MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU); \ + MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU); -BM_MATMUL(16, 32, 128, 49); -BM_MATMUL(16, 32, 128, 961); -BM_MATMUL(16, 32, 128, 3969); -BM_MATMUL(16, 128, 128, 49); -BM_MATMUL(16, 128, 128, 961); -BM_MATMUL(16, 128, 128, 3969); +MACE_BM_MATMUL(16, 32, 128, 49); +MACE_BM_MATMUL(16, 32, 128, 961); +MACE_BM_MATMUL(16, 32, 128, 3969); +MACE_BM_MATMUL(16, 128, 128, 49); +MACE_BM_MATMUL(16, 128, 128, 961); +MACE_BM_MATMUL(16, 128, 128, 3969); } // namespace test } // namespace ops diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index 8bcf2028426a1dbe6085074811fb1e01c9bd4e60..6875de6ab314b4d2ed183d7087e89774ebfacaed 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -18,23 +18,23 @@ namespace mace { namespace ops { void Register_Pad(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PadOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + PadOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PadOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PadOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + PadOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + PadOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index 4be091dbf27880c6f396c08fd497bafbd6d00573..c5172f8ca9600ea9225fb3929078df5d0ee3d7a4 100644 --- 
a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -65,25 +65,25 @@ void Pad(int iters, int batch, int height, } } // namespace -#define BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE) \ - static void BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - Pad(iters, N, H, W, C, PAD); \ - } \ - BENCHMARK(BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE) +#define MACE_BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE) \ + static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + Pad(iters, N, H, W, C, PAD); \ + } \ + MACE_BENCHMARK(MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE) -#define BM_PAD(N, H, W, C, PAD) \ - BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \ - BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \ - BM_PAD_MACRO(N, H, W, C, PAD, half, GPU); +#define MACE_BM_PAD(N, H, W, C, PAD) \ + MACE_BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \ + MACE_BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \ + MACE_BM_PAD_MACRO(N, H, W, C, PAD, half, GPU); -BM_PAD(1, 512, 512, 1, 2); -BM_PAD(1, 112, 112, 64, 1); -BM_PAD(1, 256, 256, 32, 2); -BM_PAD(1, 512, 512, 16, 2); +MACE_BM_PAD(1, 512, 512, 1, 2); +MACE_BM_PAD(1, 112, 112, 64, 1); +MACE_BM_PAD(1, 256, 256, 32, 2); +MACE_BM_PAD(1, 512, 512, 16, 2); } // namespace test } // namespace ops diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index ac15bb1b8cb63fbccefc28145f556e90d87b003a..25cd44aad70a3052da27aa6e61b9c173edb27058 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Pooling(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PoolingOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + PoolingOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PoolingOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + PoolingOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PoolingOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + PoolingOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h index af46c0cdbdbad2c4e6cecd8524bdd8abdd202991..fac4e1dd53b62c811aa40f2b7dfe7b96c1610213 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/pooling.h @@ -52,8 +52,8 @@ class PoolingOp : public ConvPool2dOpBase { PoolingType pooling_type_; kernels::PoolingFunctor functor_; - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index 1f767c229fd0bc1db2652b7e7d5035804d95831d..d0da9b47b52735698be2d62473a58428731197e2 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -87,29 +87,29 @@ void Pooling(int 
iters, } } // namespace -#define BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE) \ - static void \ - BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\ - ##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(float))); \ - Pooling(iters, N, C, H, W, KE, STRIDE, Padding::PA, \ - PoolingType::PO); \ - } \ - BENCHMARK( \ - BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\ +#define MACE_BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE) \ + static void \ + MACE_BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\ + ##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(float))); \ + Pooling(iters, N, C, H, W, KE, STRIDE, Padding::PA, \ + PoolingType::PO); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\ ##DEVICE) -#define BM_POOLING(N, C, H, W, K, S, PA, PO) \ - BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \ - BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU); +#define MACE_BM_POOLING(N, C, H, W, K, S, PA, PO) \ + MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \ + MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU); -BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX); -BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX); -BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX); -BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX); +MACE_BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX); +MACE_BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX); +MACE_BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX); +MACE_BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX); } // namespace test } // namespace ops diff --git a/mace/ops/proposal.cc b/mace/ops/proposal.cc index 98afe966fc46579bf367be8287294e8f5a0b7402..4558bbb3d3bad5e9214303fb5f16401bac48308b 100644 --- a/mace/ops/proposal.cc +++ b/mace/ops/proposal.cc @@ -18,11 +18,11 @@ namespace mace { namespace ops { void Register_Proposal(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ProposalOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ProposalOp); } } // namespace ops diff --git a/mace/ops/proposal.h b/mace/ops/proposal.h index 36021adda48258f4381527b8631470b6686b02b1..1afabb8fe36800a4e09af30d0e14dd9586256376 100644 --- a/mace/ops/proposal.h +++ b/mace/ops/proposal.h @@ -49,8 +49,8 @@ class ProposalOp : public Operator { kernels::ProposalFunctor functor_; protected: - OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO); - OP_OUTPUT_TAGS(ROIS); + MACE_OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO); + MACE_OP_OUTPUT_TAGS(ROIS); }; } // namespace ops diff --git a/mace/ops/psroi_align.cc b/mace/ops/psroi_align.cc index d21db0cc7349e70cf5badfa144c4804c61bdf3b8..17a02d61da819ef88beb84f24ffb9b2dc6176901 100644 --- a/mace/ops/psroi_align.cc +++ b/mace/ops/psroi_align.cc @@ -18,11 +18,11 @@ namespace mace { namespace ops { void Register_PSROIAlign(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PSROIAlignOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + PSROIAlignOp); } 
} // namespace ops diff --git a/mace/ops/psroi_align.h b/mace/ops/psroi_align.h index 82bd0c44400e2c9dec442c5f733f7a354284f19a..b75956827b56b1c27c033fc2855eebd751cfb236 100644 --- a/mace/ops/psroi_align.h +++ b/mace/ops/psroi_align.h @@ -43,8 +43,8 @@ class PSROIAlignOp : public Operator { kernels::PSROIAlignFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, ROIS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, ROIS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/quantize.cc b/mace/ops/quantize.cc index 5121552ee845f3a836e83a5f1fca88ee8ef554f2..ffd4cd78c0f68cada47f75ca7ec8413e5b830b41 100644 --- a/mace/ops/quantize.cc +++ b/mace/ops/quantize.cc @@ -18,27 +18,27 @@ namespace mace { namespace ops { void Register_Quantize(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - QuantizeOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + QuantizeOp); } void Register_Dequantize(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DequantizeOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + DequantizeOp); } void Register_Requantize(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - RequantizeOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + RequantizeOp); } } // namespace ops diff --git a/mace/ops/quantize.h b/mace/ops/quantize.h index 212d35937b38c28897c09177f73e068424afefa7..bf0d1534c38326a55f29161c79052c3e43b4d2db 100644 --- a/mace/ops/quantize.h +++ b/mace/ops/quantize.h @@ -50,8 +50,8 @@ class QuantizeOp : public Operator { kernels::QuantizeFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX); - OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); + MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX); + MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); }; template @@ -79,8 +79,8 @@ class DequantizeOp : public Operator { kernels::DequantizeFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; template @@ -131,8 +131,8 @@ class RequantizeOp : public Operator { kernels::RequantizeFunctor functor_; protected: - OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX, RERANGE_MIN, RERANGE_MAX); - OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); + MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX, RERANGE_MIN, RERANGE_MAX); + MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); }; } // namespace ops diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc index 708e7e09421554cb70336f33c7b01aa53e3c2a25..4390c520dd2daf7d7d67fa2e393ace58ec392b61 100644 --- a/mace/ops/reshape.cc +++ b/mace/ops/reshape.cc @@ -18,11 +18,11 @@ namespace mace { namespace ops { void Register_Reshape(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReshapeOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ReshapeOp); } } // namespace ops diff --git a/mace/ops/reshape.h b/mace/ops/reshape.h index 
371e3b4071d24e242fbc050e18d00cabb31b3ed4..fe1df988b5dcf911f4bceb5fa122ea9487ec6712 100644 --- a/mace/ops/reshape.h +++ b/mace/ops/reshape.h @@ -69,8 +69,8 @@ class ReshapeOp : public Operator { kernels::ReshapeFunctor functor_; private: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 65a84926eae43765ffedd42c2cfbbdf31bcc6fc6..e18d70387345fc1bb857deba2bbaa9945c054c53 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_ResizeBilinear(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + ResizeBilinearOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ResizeBilinearOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + ResizeBilinearOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index ad9545619757f6a3e349c57b9193e64f3497a691..1f21780135a1cca24788630a796c6a6028b31a53 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -82,34 +82,33 @@ void ResizeBilinearBenchmark(int iters, } } // namespace -#define BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE) \ - static void \ - BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ - ##DEVICE( \ - int iters) { \ - const int64_t macc = static_cast(iters) * N * C * H1 * W1 * 3; \ - const int64_t tot = static_cast(iters) * N * C * H0 * W0; \ - mace::testing::MaccProcessed(macc); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - ResizeBilinearBenchmark(iters, N, C, H0, W0, H1, W1); \ - } \ - BENCHMARK( \ - BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ +#define MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE) \ + static void \ + MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ + ##DEVICE( \ + int iters) { \ + const int64_t macc = static_cast(iters) * N * C * H1 * W1 * 3; \ + const int64_t tot = static_cast(iters) * N * C * H0 * W0; \ + mace::testing::MaccProcessed(macc); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + ResizeBilinearBenchmark(iters, N, C, H0, W0, H1, W1); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ ##DEVICE) -#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \ - BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \ - BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \ - BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU); +#define MACE_BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \ + MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); 
\ + MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \ + MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU); -BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480); - -BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15); -BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30); -BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60); -BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480); -BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480); -BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016); +MACE_BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480); +MACE_BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15); +MACE_BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30); +MACE_BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60); +MACE_BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480); +MACE_BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480); +MACE_BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016); } // namespace test } // namespace ops diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc index a146e00be53168e32882c65a79a0bfc2108afebf..a9b1c9bd2c494721801345c7910b0768cc0c6f16 100644 --- a/mace/ops/slice.cc +++ b/mace/ops/slice.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Slice(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SliceOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + SliceOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SliceOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + SliceOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SliceOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + SliceOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/slice.h b/mace/ops/slice.h index 567707ff753f0aad953febd3cb192822c59ee55e..d917f132558bab7b5d5f25480df4d02026b56de3 100644 --- a/mace/ops/slice.h +++ b/mace/ops/slice.h @@ -46,7 +46,7 @@ class SliceOp : public Operator { kernels::SliceFunctor functor_; private: - OP_INPUT_TAGS(INPUT); + MACE_OP_INPUT_TAGS(INPUT); }; } // namespace ops diff --git a/mace/ops/slice_benchmark.cc b/mace/ops/slice_benchmark.cc index b05138aee2218e7e727d4446f828028404ce1def..c02dbf5c08b8aa35ac4946a04161a70dbbe69b18 100644 --- a/mace/ops/slice_benchmark.cc +++ b/mace/ops/slice_benchmark.cc @@ -73,26 +73,28 @@ void BMSliceHelper(int iters, } } // namespace -#define BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE) \ +#define MACE_BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE) \ static void \ - BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE(int iters) { \ + MACE_BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE( \ + int iters) { \ const int64_t tot = static_cast(iters) * N * H * W * C; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BMSliceHelper(iters, {N, H, W, C}, NO); \ } \ - BENCHMARK(BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE) + MACE_BENCHMARK( \ + MACE_BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE) -#define BM_SLICE(N, H, W, C, NO) \ - BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \ - BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \ - BM_SLICE_MACRO(N, H, W, C, NO, half, GPU); +#define MACE_BM_SLICE(N, H, W, C, NO) \ + 
MACE_BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \ + MACE_BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \ + MACE_BM_SLICE_MACRO(N, H, W, C, NO, half, GPU); -BM_SLICE(1, 32, 32, 32, 2); -BM_SLICE(1, 32, 32, 128, 2); -BM_SLICE(1, 32, 32, 256, 2); -BM_SLICE(1, 128, 128, 32, 2); -BM_SLICE(1, 128, 128, 128, 2); +MACE_BM_SLICE(1, 32, 32, 32, 2); +MACE_BM_SLICE(1, 32, 32, 128, 2); +MACE_BM_SLICE(1, 32, 32, 256, 2); +MACE_BM_SLICE(1, 128, 128, 32, 2); +MACE_BM_SLICE(1, 128, 128, 128, 2); } // namespace test } // namespace ops diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 2e2e9e7ff5b44a6038c8d80a5c598db78c31061c..eff2b41565ad40140798a566576c08cdfd3c9822 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -18,24 +18,24 @@ namespace mace { namespace ops { void Register_Softmax(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + SoftmaxOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + SoftmaxOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + SoftmaxOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/softmax.h b/mace/ops/softmax.h index a4459e52ec10a44bc7745e083212479829f3df5a..c6b81d9e1a831a1e4e1a1e0c99eaed9839ef3193 100644 --- a/mace/ops/softmax.h +++ b/mace/ops/softmax.h @@ -40,8 +40,8 @@ class SoftmaxOp : public Operator { kernels::SoftmaxFunctor functor_; protected: - OP_INPUT_TAGS(LOGITS); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(LOGITS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc index b62eadcb81faaae7e33dda91e2fbf2c9b0640ada..2e9aed7c52b88dc5c7d9d618c90d50231d2181d4 100644 --- a/mace/ops/softmax_benchmark.cc +++ b/mace/ops/softmax_benchmark.cc @@ -70,26 +70,26 @@ void SoftmaxBenchmark( } } // namespace -#define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - SoftmaxBenchmark(iters, N, C, H, W); \ - } \ - BENCHMARK(BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) +#define MACE_BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + SoftmaxBenchmark(iters, N, C, H, W); \ + } \ + MACE_BENCHMARK(MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) -#define BM_SOFTMAX(N, C, H, W) \ - BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \ - BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \ - BM_SOFTMAX_MACRO(N, C, H, W, half, GPU); +#define MACE_BM_SOFTMAX(N, C, H, W) \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \ + 
MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \ + MACE_BM_SOFTMAX_MACRO(N, C, H, W, half, GPU); -BM_SOFTMAX(1, 2, 512, 512); -BM_SOFTMAX(1, 3, 512, 512); -BM_SOFTMAX(1, 4, 512, 512); -BM_SOFTMAX(1, 10, 256, 256); -BM_SOFTMAX(1, 1024, 7, 7); +MACE_BM_SOFTMAX(1, 2, 512, 512); +MACE_BM_SOFTMAX(1, 3, 512, 512); +MACE_BM_SOFTMAX(1, 4, 512, 512); +MACE_BM_SOFTMAX(1, 10, 256, 256); +MACE_BM_SOFTMAX(1, 1024, 7, 7); } // namespace test } // namespace ops diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index aac1932739bb7292efaf0feb6f0e596f40843a62..ca905e785ee884ebc6e6f81d9e202fe48a6720a5 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -18,23 +18,23 @@ namespace mace { namespace ops { void Register_SpaceToBatchND(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + SpaceToBatchNDOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + SpaceToBatchNDOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") + .Device(DeviceType::GPU) + .TypeConstraint("T") + .Build(), + SpaceToBatchNDOp); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h index ebeb7975aa5574885692525b1e6e683b020f447d..faef4b575e31abb8ab9f8c0418528f25d1fdc530 100644 --- a/mace/ops/space_to_batch.h +++ b/mace/ops/space_to_batch.h @@ -45,8 +45,8 @@ class SpaceToBatchNDOp : public Operator { kernels::SpaceToBatchFunctor functor_; protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc index 524a32cec2849657c74d37453047e33909cfc6f5..272c487c74e74764c24f7d5edc0de35eceac8dd6 100644 --- a/mace/ops/space_to_batch_benchmark.cc +++ b/mace/ops/space_to_batch_benchmark.cc @@ -64,27 +64,27 @@ void BMSpaceToBatch( } } // namespace -#define BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE) \ - static void \ - BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BMSpaceToBatch(iters, N, H, W, C, SHAPE); \ - } \ - BENCHMARK( \ - BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE) +#define MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE) \ + static void \ + MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMSpaceToBatch(iters, N, H, W, C, SHAPE); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE) -#define BM_SPACE_TO_BATCH(N, H, W, C, 
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
+#define MACE_BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
+  MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
+  MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
-BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
-BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
+MACE_BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc
index 2932f8ea3eb9c65da90f076f24441dba06499968..1807226505a1bd73aad7c1426cbf0cf37d74e108 100644
--- a/mace/ops/space_to_depth.cc
+++ b/mace/ops/space_to_depth.cc
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_SpaceToDepth(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint("T")
-                        .Build(),
-                    SpaceToDepthOp);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint("T")
+                             .Build(),
+                         SpaceToDepthOp);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint("T")
-                        .Build(),
-                    SpaceToDepthOp);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint("T")
+                             .Build(),
+                         SpaceToDepthOp);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint("T")
-                        .Build(),
-                    SpaceToDepthOp);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint("T")
+                             .Build(),
+                         SpaceToDepthOp);
 #endif  // MACE_ENABLE_OPENCL
 }
diff --git a/mace/ops/space_to_depth.h b/mace/ops/space_to_depth.h
index 33c3d1a5c60163b9a752676b96ab170da6a489f2..cece41f897bf5547e9a9d7c3b6994cd6df9d92c4 100644
--- a/mace/ops/space_to_depth.h
+++ b/mace/ops/space_to_depth.h
@@ -62,8 +62,8 @@ class SpaceToDepthOp : public Operator {
   }
  protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
  private:
   kernels::DepthToSpaceOpFunctor functor_;
diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc
index 66c90b641b7ad030f63ba89d80dafed6fe4e458a..97d3cb033ef6a54222a38e16194009ad97b09eae 100644
--- a/mace/ops/space_to_depth_benchmark.cc
+++ b/mace/ops/space_to_depth_benchmark.cc
@@ -69,25 +69,26 @@ void SpaceToDepth(
 }
 }  // namespace
-#define BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE) \
-  static void \
-      BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
-          int iters) { \
-    const int64_t tot = static_cast(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot); \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
-    SpaceToDepth(iters, N, C, H, W, G); \
-  } \
-  BENCHMARK(BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
+#define MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE) \
+  static void \
+      MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
+          int iters) { \
+    const int64_t tot = static_cast(iters) * N * C * H * W; \
+    mace::testing::MaccProcessed(tot); \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
+    SpaceToDepth(iters, N, C, H, W, G); \
+  } \
+  MACE_BENCHMARK( \
+      MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
-#define BM_SPACE_TO_DEPTH(N, C, H, W, G) \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU); \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);
+#define MACE_BM_SPACE_TO_DEPTH(N, C, H, W, G) \
+  MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU); \
+  MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \
+  MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);
-BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
-BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
-BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);
+MACE_BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
+MACE_BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
+MACE_BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc
index 2ed6103d75cc06a01e97810d10f2ae72375792e5..9571fcbf5b3a608ea503ba11caac18cc28cac5be 100644
--- a/mace/ops/transpose.cc
+++ b/mace/ops/transpose.cc
@@ -18,11 +18,11 @@ namespace mace {
 namespace ops {
 void Register_Transpose(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint("T")
-                        .Build(),
-                    TransposeOp);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint("T")
+                             .Build(),
+                         TransposeOp);
 }
 }  // namespace ops
diff --git a/mace/ops/transpose.h b/mace/ops/transpose.h
index 877078d069201a3ca761be59f84b184cbbfb1ffc..5fb497ac83feeb5103166298d6c42ca8ea35da0c 100644
--- a/mace/ops/transpose.h
+++ b/mace/ops/transpose.h
@@ -50,8 +50,8 @@ class TransposeOp : public Operator {
   std::vector dims_;
   kernels::TransposeFunctor functor_;
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace mace
diff --git a/mace/ops/transpose_benchmark.cc b/mace/ops/transpose_benchmark.cc
index 24e6f2ffe44499de11da8fd2eb22a1010401c6b6..aaf3faaa703f71a17a29154304c42edcaa70e01c 100644
--- a/mace/ops/transpose_benchmark.cc
+++ b/mace/ops/transpose_benchmark.cc
@@ -55,41 +55,41 @@ void TransposeBenchmark(int iters,
 }
 }  // namespace
-#define BM_TRANSPOSE2D_MACRO(H, W, TYPE, DEVICE) \
-  static void BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \
+#define MACE_BM_TRANSPOSE2D_MACRO(H, W, TYPE, DEVICE) \
+  static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \
       int iters) { \
     const int64_t tot = static_cast(iters) * H * W; \
     mace::testing::MaccProcessed(tot); \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
     TransposeBenchmark(iters, {H, W}, {1, 0}); \
   } \
-  BENCHMARK(BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE)
-
-#define BM_TRANSPOSE2D(H, W) \
-  BM_TRANSPOSE2D_MACRO(H, W, float, CPU);
-
-#define BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, TYPE, DEVICE) \
-  static void \
-      BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##DEVICE( \
-          int iters) { \
-    const int64_t tot = static_cast(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot); \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
-    TransposeBenchmark(iters, {N, C, H, W}, {D0, D1, D2, D3}); \
-  } \
-  BENCHMARK( \
-      BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##DEVICE)
-
-#define BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \
-  BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU);
-
-
-BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2);
-BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1);
-BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2);
-BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1);
-BM_TRANSPOSE2D(128, 128);
-BM_TRANSPOSE2D(512, 512);
+  MACE_BENCHMARK(MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE)
+
+#define MACE_BM_TRANSPOSE2D(H, W) \
+  MACE_BM_TRANSPOSE2D_MACRO(H, W, float, CPU);
+
+#define MACE_BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, TYPE, DEVICE) \
+  static void \
+  MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
+  DEVICE(int iters) { \
+    const int64_t tot = static_cast(iters) * N * C * H * W; \
+    mace::testing::MaccProcessed(tot); \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
+    TransposeBenchmark(iters, {N, C, H, W}, {D0, D1, D2, D3}); \
+  } \
+  MACE_BENCHMARK( \
+  MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
+  DEVICE)
+
+#define MACE_BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \
+  MACE_BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU);
+
+MACE_BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2);
+MACE_BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1);
+MACE_BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2);
+MACE_BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1);
+MACE_BM_TRANSPOSE2D(128, 128);
+MACE_BM_TRANSPOSE2D(512, 512);
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/winograd_inverse_transform.cc b/mace/ops/winograd_inverse_transform.cc
index 763f1f165fc0ed88a9c8b4f50a91cde40c5688f8..f84b69a2074823bd9c97df1a5ba14acd7719ce02 100644
--- a/mace/ops/winograd_inverse_transform.cc
+++ b/mace/ops/winograd_inverse_transform.cc
@@ -19,17 +19,17 @@ namespace ops {
 void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint("T")
-                        .Build(),
-                    WinogradInverseTransformOp);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint("T")
+                             .Build(),
+                         WinogradInverseTransformOp);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint("T")
-                        .Build(),
-                    WinogradInverseTransformOp);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint("T")
+                             .Build(),
+                         WinogradInverseTransformOp);
 #else
   MACE_UNUSED(op_registry);
 #endif  // MACE_ENABLE_OPENCL
diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h
index 71807b31c5f1e654950fd475fed6278a09accfd9..d08ce0f6baf98a9bb3d4d818d11ed7c7a5980467 100644
--- a/mace/ops/winograd_inverse_transform.h
+++ b/mace/ops/winograd_inverse_transform.h
@@ -49,8 +49,8 @@ class WinogradInverseTransformOp : public Operator {
   kernels::WinogradInverseTransformFunctor functor_;
  protected:
-  OP_INPUT_TAGS(INPUT, BIAS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, BIAS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
diff --git a/mace/ops/winograd_transform.cc b/mace/ops/winograd_transform.cc
index 8181cba947b3f4719d799d73b31fe9870e58e20e..24f822551ac536931e661d4ae2193d8509096fd5 100644
--- a/mace/ops/winograd_transform.cc
+++ b/mace/ops/winograd_transform.cc
@@ -19,17 +19,17 @@ namespace ops {
 void Register_WinogradTransform(OperatorRegistry *op_registry) {
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint("T")
-                        .Build(),
-                    WinogradTransformOp);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint("T")
+                             .Build(),
+                         WinogradTransformOp);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint("T")
-                        .Build(),
-                    WinogradTransformOp);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint("T")
+                             .Build(),
+                         WinogradTransformOp);
 #else
   MACE_UNUSED(op_registry);
 #endif  // MACE_ENABLE_OPENCL
diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h
index e9a3afbecce12c8c00b71231b88831882c5de73f..90bb5501399853092e2003b058d9ad0f6d8d154b 100644
--- a/mace/ops/winograd_transform.h
+++ b/mace/ops/winograd_transform.h
@@ -43,8 +43,8 @@ class WinogradTransformOp : public Operator {
   kernels::WinogradTransformFunctor functor_;
  protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc
index bf33332bb28db0a8040f0696dd4fc9e297019677..658b0ea293ce8654af52fc980f21af848857bbd0 100644
--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -51,22 +51,24 @@ void BMWinogradTransform(
 }
 }  // namespace
-#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
-  static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
+#define MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
+  static void \
+  MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
       int iters) { \
     const int64_t tot = static_cast(iters) * N * C * H * W; \
     mace::testing::MaccProcessed(tot); \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
     BMWinogradTransform(iters, N, H, W, C); \
   } \
-  BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK( \
+      MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
-#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
-  BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
+#define MACE_BM_WINOGRAD_TRANSFORM(N, H, W, C) \
+  MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
-BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
-BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
-BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
+MACE_BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
+MACE_BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
+MACE_BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
 namespace {
 template
@@ -103,24 +105,24 @@ void BMWinogradInverseTransform(
 }
 }  // namespace
-#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
+#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
   static void \
-      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
-          int iters) { \
+      MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\
+          int iters) { \
     const int64_t tot = static_cast(iters) * N * C * H * W; \
     mace::testing::MaccProcessed(tot); \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
     BMWinogradInverseTransform(iters, N, H, W, C); \
   } \
-  BENCHMARK( \
-      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK( \
+      MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
-#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
-  BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
+#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
+  MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
+MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
+MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
+MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
 }  // namespace test
 }  // namespace ops