Commit ccaec70c authored by Liangliang He

Merge branch 'add_MACE_prefix_for_macros' into 'master'

add MACE_ prefix for macros

See merge request !517
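The change is mechanical: macros defined by MACE gain a MACE_ prefix so they cannot collide with identically named macros from other headers. A minimal sketch of what the rename means at a call site, using names taken from the diff below (surrounding code elided with "..."):

    // before this commit
    OP_INPUT_TAGS(INPUT, FILTER, BIAS);
    OP_OUTPUT_TAGS(OUTPUT);
    REGISTER_OPERATOR(op_registry, ..., Conv2dOp<DeviceType::CPU, float>);
    BM_RELU(1, 1, 512, 512);

    // after this commit
    MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
    MACE_OP_OUTPUT_TAGS(OUTPUT);
    MACE_REGISTER_OPERATOR(op_registry, ..., Conv2dOp<DeviceType::CPU, float>);
    MACE_BM_RELU(1, 1, 512, 512);

Only the macro names change; the expansions and arguments stay the same.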
@@ -95,5 +95,4 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false)
 MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
 MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
 #undef MACE_GET_REPEATED_ARGUMENT_FUNC
 } // namespace mace
@@ -124,20 +124,19 @@ class Operator : public OperatorBase {
 ~Operator() noexcept override {}
 };
-// OP_INPUT_TAGS and OP_OUTPUT_TAGS are optional features to name the indices of
-// the
-// operator's inputs and outputs, in order to avoid confusion. For example, for
-// a fully convolution layer that has input, weight and bias, you can define its
-// input tags as:
-// OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
+// MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the
+// indices of the operator's inputs and outputs, in order to avoid confusion.
+// For example, for a fully convolution layer that has input, weight and bias,
+// you can define its input tags as:
+// MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
 // And in the code, instead of doing
 // auto& weight = Input(1);
 // you can now do
 // auto& weight = Input(WEIGHT);
 // to make it more clear.
-#define OP_INPUT_TAGS(first_input, ...) \
+#define MACE_OP_INPUT_TAGS(first_input, ...) \
 enum _InputTags { first_input = 0, __VA_ARGS__ }
-#define OP_OUTPUT_TAGS(first_input, ...) \
+#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
 enum _OutputTags { first_input = 0, __VA_ARGS__ }
 class OpKeyBuilder {
@@ -186,7 +185,7 @@ MACE_DECLARE_REGISTRY(OpRegistry,
 const OperatorDef &,
 Workspace *);
-#define REGISTER_OPERATOR(op_registry, name, ...) \
+#define MACE_REGISTER_OPERATOR(op_registry, name, ...) \
 MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)
 } // namespace mace
...
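A hedged illustration of the renamed tag macros from the hunk above (the expansions are read off the #define lines, and the Input(WEIGHT) usage mirrors the header comment; the surrounding operator class is elided and hypothetical):

    MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
    // expands to: enum _InputTags { INPUT = 0, WEIGHT, BIAS }
    MACE_OP_OUTPUT_TAGS(OUTPUT);
    // expands to: enum _OutputTags { OUTPUT = 0 }

    // so, inside the operator, reads become self-documenting:
    auto& weight = Input(WEIGHT);  // equivalent to Input(1), per the comment above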
@@ -32,7 +32,7 @@ inline int64_t NowMicros() {
 namespace mace {
-#define MAX_NODE 2048
+#define MACE_MAX_NODE 2048
 enum {
 NN_GRAPH_PERFEVENT_CYCLES = 0,
@@ -229,13 +229,13 @@ bool HexagonControlWrapper::TeardownGraph() {
 return hexagon_nn_teardown(nn_id_) == 0;
 }
-#define PRINT_BUFSIZE (2 * 1024 * 1024)
+#define MACE_PRINT_BUFSIZE (2 * 1024 * 1024)
 void HexagonControlWrapper::PrintLog() {
 char *buf;
-if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
+if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
 MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char *>(buf),
-PRINT_BUFSIZE) == 0,
+MACE_PRINT_BUFSIZE) == 0,
 "print log error");
 LOG(INFO) << std::string(buf);
 delete[] buf;
@@ -244,9 +244,9 @@ void HexagonControlWrapper::PrintLog() {
 void HexagonControlWrapper::PrintGraph() {
 LOG(INFO) << "Print Graph";
 char *buf;
-if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
+if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
 MACE_CHECK(hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char *>(buf),
-PRINT_BUFSIZE) == 0,
+MACE_PRINT_BUFSIZE) == 0,
 "print graph error");
 LOG(INFO) << std::string(buf);
 delete[] buf;
@@ -265,9 +265,9 @@ void HexagonControlWrapper::SetGraphMode(int mode) {
 void HexagonControlWrapper::GetPerfInfo() {
 LOG(INFO) << "Get perf info";
-std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
+std::vector<hexagon_nn_perfinfo> perf_info(MACE_MAX_NODE);
 unsigned int n_items = 0;
-MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE,
+MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MACE_MAX_NODE,
 &n_items) == 0,
 "get perf info error");
@@ -284,8 +284,8 @@ void HexagonControlWrapper::GetPerfInfo() {
 perf_info[i].counter_lo) *
 1.0f / perf_info[i].executions;
-char node_type_buf[MAX_NODE];
-hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE);
+char node_type_buf[MACE_MAX_NODE];
+hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MACE_MAX_NODE);
 std::string node_type(node_type_buf);
 LOG(INFO) << "node id: " << perf_info[i].node_id
 << ", node type: " << node_type
...
@@ -22,7 +22,7 @@
 namespace mace {
-#define OP_INVALID -1
+#define MACE_OP_INVALID -1
 typedef enum op_type_enum {
 #define DEF_OP(NAME, ...) OP_##NAME,
@@ -48,7 +48,7 @@ class OpMap {
 return op_map_[op_type];
 } else {
 LOG(ERROR) << "DSP unsupoorted op type: " << op_type;
-return OP_INVALID;
+return MACE_OP_INVALID;
 }
 }
...
@@ -30,9 +30,9 @@
 #ifdef MACE_ENABLE_NEON
 // Avoid over-bound accessing memory
-#define EXTRA_BUFFER_PAD_SIZE 64
+#define MACE_EXTRA_BUFFER_PAD_SIZE 64
 #else
-#define EXTRA_BUFFER_PAD_SIZE 0
+#define MACE_EXTRA_BUFFER_PAD_SIZE 0
 #endif
 namespace mace {
@@ -210,16 +210,16 @@ class Tensor {
 image_shape_.clear();
 if (buffer_ != nullptr) {
 MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
-if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
+if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
 LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to "
-<< raw_size() + EXTRA_BUFFER_PAD_SIZE;
-return buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE);
+<< raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
+return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
 }
 return MaceStatus::MACE_SUCCESS;
 } else {
 MACE_CHECK(is_buffer_owner_);
 buffer_ = new Buffer(allocator_);
-return buffer_->Allocate(raw_size() + EXTRA_BUFFER_PAD_SIZE);
+return buffer_->Allocate(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
 }
 }
...
@@ -21,8 +21,8 @@
 #include <vector>
 #define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
-#define BENCHMARK(n) \
+#define MACE_BENCHMARK(n) \
 static ::mace::testing::Benchmark *MACE_BENCHMARK_CONCAT( \
 __benchmark_, n, __LINE__) = (new ::mace::testing::Benchmark(#n, (n)))
 namespace mace {
...
@@ -169,7 +169,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
 std::unique_ptr<BufferBase> tensor_buf(
 new Buffer(GetDeviceAllocator(device_type)));
 MaceStatus status = tensor_buf->Allocate(
-mem_block.x() * GetEnumTypeSize(dtype) + EXTRA_BUFFER_PAD_SIZE);
+mem_block.x() * GetEnumTypeSize(dtype)
+ + MACE_EXTRA_BUFFER_PAD_SIZE);
 if (status != MaceStatus::MACE_SUCCESS) {
 return status;
 }
...
@@ -21,7 +21,7 @@
 namespace mace {
 namespace kernels {
-#define Conv2dNeonK5x5SnLoadCalc4 \
+#define MACE_Conv2dNeonK5x5SnLoadCalc4 \
 /* load filter (4 outch x 1 height x 4 width) */ \
 float32x4_t vf00, vf10, vf20, vf30; \
 float32x2_t vf01, vf11, vf21, vf31; \
@@ -62,7 +62,7 @@ namespace kernels {
 vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
 vo3 = vmlaq_lane_f32(vo3, vi4, vf31, 1);
-#define Conv2dNeonK5x5SnLoadCalc1 \
+#define MACE_Conv2dNeonK5x5SnLoadCalc1 \
 /* load filter (1 outch x 1 height x 4 width) */ \
 float32x4_t vf00; \
 float32x2_t vf01; \
@@ -138,7 +138,7 @@ void Conv2dNeonK5x5S1(const float *input,
 vi2 = vextq_f32(vi0, vi4, 2);
 vi3 = vextq_f32(vi0, vi4, 3);
-Conv2dNeonK5x5SnLoadCalc4;
+MACE_Conv2dNeonK5x5SnLoadCalc4;
 in_offset += in_width;
 filter_ptr0 += 5;
@@ -194,7 +194,7 @@ void Conv2dNeonK5x5S1(const float *input,
 vi2 = vextq_f32(vi0, vi4, 2);
 vi3 = vextq_f32(vi0, vi4, 3);
-Conv2dNeonK5x5SnLoadCalc1;
+MACE_Conv2dNeonK5x5SnLoadCalc1;
 in_offset += in_width;
 filter_ptr0 += 5;
...
@@ -21,7 +21,7 @@
 namespace mace {
 namespace kernels {
-#define Conv2dArmv8NeonK7x7SnLoadCalc4 \
+#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \
 /* load filter (4 outch x 1 height x 4 width) */ \
 float32x4_t vf00, vf01; \
 float32x4_t vf10, vf11; \
@@ -72,7 +72,7 @@ namespace kernels {
 vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); \
 vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
-#define Conv2dArmv8NeonK7x7SnLoadCalc1 \
+#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc1 \
 /* load filter (1 outch x 1 height x 4 width) */ \
 float32x4_t vf00, vf01; \
 vf00 = vld1q_f32(filter_ptr0); \
@@ -87,7 +87,7 @@ namespace kernels {
 vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \
 vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
-#define Conv2dArmv7NeonK7x7SnLoadCalc4 \
+#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc4 \
 /* load filter (4 outch x 1 height x 4 width) */ \
 float32x4_t vf00, vf01; \
 float32x4_t vf10, vf11; \
@@ -138,7 +138,7 @@ namespace kernels {
 vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); \
 vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
-#define Conv2dArmv7NeonK7x7SnLoadCalc1 \
+#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc1 \
 /* load filter (1 outch x 1 height x 4 width) */ \
 float32x4_t vf00, vf01; \
 vf00 = vld1q_f32(filter_ptr0); \
@@ -220,9 +220,9 @@ void Conv2dNeonK7x7S1(const float *input,
 vi6 = vextq_f32(vi4, vi8, 2);
 #if defined(__aarch64__)
-Conv2dArmv8NeonK7x7SnLoadCalc4;
+MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
 #else
-Conv2dArmv7NeonK7x7SnLoadCalc4;
+MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
 #endif
 in_offset += in_width;
@@ -284,9 +284,9 @@ void Conv2dNeonK7x7S1(const float *input,
 vi6 = vextq_f32(vi4, vi8, 2);
 #if defined(__aarch64__)
-Conv2dArmv8NeonK7x7SnLoadCalc1;
+MACE_Conv2dArmv8NeonK7x7SnLoadCalc1;
 #else
-Conv2dArmv7NeonK7x7SnLoadCalc1;
+MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
 #endif
 in_offset += in_width;
@@ -381,9 +381,9 @@ void Conv2dNeonK7x7S2(const float *input,
 vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12]
 #if defined(__aarch64__)
-Conv2dArmv8NeonK7x7SnLoadCalc4;
+MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
 #else
-Conv2dArmv7NeonK7x7SnLoadCalc4;
+MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
 #endif
 in_offset += in_width;
@@ -450,9 +450,9 @@ void Conv2dNeonK7x7S2(const float *input,
 vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12]
 #if defined(__aarch64__)
-Conv2dArmv8NeonK7x7SnLoadCalc1;
+MACE_Conv2dArmv8NeonK7x7SnLoadCalc1;
 #else
-Conv2dArmv7NeonK7x7SnLoadCalc1;
+MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
 #endif
 in_offset += in_width;
@@ -547,9 +547,9 @@ void Conv2dNeonK7x7S3(const float *input,
 vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15]
 #if defined(__aarch64__)
-Conv2dArmv8NeonK7x7SnLoadCalc4;
+MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
 #else
-Conv2dArmv7NeonK7x7SnLoadCalc4;
+MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
 #endif
 in_offset += in_width;
@@ -616,9 +616,9 @@ void Conv2dNeonK7x7S3(const float *input,
 vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15]
 #if defined(__aarch64__)
-Conv2dArmv8NeonK7x7SnLoadCalc1;
+MACE_Conv2dArmv8NeonK7x7SnLoadCalc1;
 #else
-Conv2dArmv7NeonK7x7SnLoadCalc1;
+MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
 #endif
 in_offset += in_width;
...
@@ -465,7 +465,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
 padded_input_size =
 batch * input_channels * (input_height + pad_top + pad_bottom)
 * (input_width + pad_left + pad_right) * sizeof(float) +
-EXTRA_BUFFER_PAD_SIZE;
+MACE_EXTRA_BUFFER_PAD_SIZE;
 total_scratch_size += padded_input_size;
 }
 if (extra_output_height != height || extra_output_width != width) {
...
@@ -314,7 +314,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
 // Skip the padded top rows
 if (padding_same_value) {
-#define COPY_INPUT \
+#define MACE_COPY_INPUT \
 std::fill(output_data, output_data + padded_left, input[0]); \
 output_data += padded_left; \
 memcpy(output_data, input, width * sizeof(float)); \
@@ -328,20 +328,20 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
 for (int i = 0; i < batch; ++i) {
 for (int j = 0; j < channels; ++j) {
 for (int k = 0; k < padded_top; ++k) {
-COPY_INPUT;
+MACE_COPY_INPUT;
 }
 for (int k = 0; k < height; ++k) {
-COPY_INPUT;
+MACE_COPY_INPUT;
 input += width;
 }
 input -= width;
 for (int k = 0; k < padded_bottom; ++k) {
-COPY_INPUT;
+MACE_COPY_INPUT;
 }
 input += width;
 }
 }
-#undef COPY_INPUT
+#undef MACE_COPY_INPUT
 } else {
 output_data += padded_top * output_width;
 for (int i = 0; i < batch; ++i) {
...
@@ -43,9 +43,10 @@ void Deconv2dOpencl(cl::Kernel *kernel,
 const index_t channel_blocks = RoundUpDiv4(channels);
 const index_t input_channel_blocks = RoundUpDiv4(input_channels);
 MACE_CHECK(stride > 0, "stride should > 0.");
-#define WIDTH_BLK 5
+#define MACE_WIDTH_BLK 5
 const index_t n_strides = (width + stride - 1) / stride;
-const index_t width_blocks = ((n_strides + WIDTH_BLK -1)/ WIDTH_BLK) * stride;
+const index_t width_blocks =
+((n_strides + MACE_WIDTH_BLK -1)/ MACE_WIDTH_BLK) * stride;
 const float stride_r = 1.f / static_cast<float>(stride);
 const int padding_h = (paddings[0]+1) >> 1;
 const int padding_w = (paddings[0]+1) >> 1;
...
@@ -125,7 +125,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
 class OutOfRangeCheckTest : public ::testing::Test {
 protected:
 virtual void SetUp() {
-setenv("MACE_OUT_OF_RANGE_CHECK", "1", 1);
+setenv("OUT_OF_RANGE_CHECK", "1", 1);
 }
 };
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_Activation(OperatorRegistry *op_registry) {
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
 .Device(DeviceType::CPU)
 .TypeConstraint<float>("T")
 .Build(),
 ActivationOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
 .Device(DeviceType::GPU)
 .TypeConstraint<float>("T")
 .Build(),
 ActivationOp<DeviceType::GPU, float>);
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
 .Device(DeviceType::GPU)
 .TypeConstraint<half>("T")
 .Build(),
 ActivationOp<DeviceType::GPU, half>);
 #endif // MACE_ENABLE_OPENCL
 }
...
@@ -71,25 +71,26 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
 }
 } // namespace
-#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \
-static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
+#define MACE_BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \
+static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
 } \
-BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_RELU(N, C, H, W) \
-BM_RELU_MACRO(N, C, H, W, float, CPU); \
-BM_RELU_MACRO(N, C, H, W, float, GPU); \
-BM_RELU_MACRO(N, C, H, W, half, GPU);
-BM_RELU(1, 1, 512, 512);
-BM_RELU(1, 3, 128, 128);
-BM_RELU(1, 3, 512, 512);
-BM_RELU(1, 32, 112, 112);
-BM_RELU(1, 64, 256, 256);
+MACE_BENCHMARK(MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_RELU(N, C, H, W) \
+MACE_BM_RELU_MACRO(N, C, H, W, float, CPU); \
+MACE_BM_RELU_MACRO(N, C, H, W, float, GPU); \
+MACE_BM_RELU_MACRO(N, C, H, W, half, GPU);
+MACE_BM_RELU(1, 1, 512, 512);
+MACE_BM_RELU(1, 3, 128, 128);
+MACE_BM_RELU(1, 3, 512, 512);
+MACE_BM_RELU(1, 32, 112, 112);
+MACE_BM_RELU(1, 64, 256, 256);
 namespace {
 template <DeviceType D, typename T>
@@ -138,25 +139,26 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
 }
 } // namespace
-#define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \
-static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
+#define MACE_BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \
+static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
 } \
-BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_RELUX(N, C, H, W) \
-BM_RELUX_MACRO(N, C, H, W, float, CPU); \
-BM_RELUX_MACRO(N, C, H, W, float, GPU); \
-BM_RELUX_MACRO(N, C, H, W, half, GPU);
-BM_RELUX(1, 1, 512, 512);
-BM_RELUX(1, 3, 128, 128);
-BM_RELUX(1, 3, 512, 512);
-BM_RELUX(1, 32, 112, 112);
-BM_RELUX(1, 64, 256, 256);
+MACE_BENCHMARK(MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_RELUX(N, C, H, W) \
+MACE_BM_RELUX_MACRO(N, C, H, W, float, CPU); \
+MACE_BM_RELUX_MACRO(N, C, H, W, float, GPU); \
+MACE_BM_RELUX_MACRO(N, C, H, W, half, GPU);
+MACE_BM_RELUX(1, 1, 512, 512);
+MACE_BM_RELUX(1, 3, 128, 128);
+MACE_BM_RELUX(1, 3, 512, 512);
+MACE_BM_RELUX(1, 32, 112, 112);
+MACE_BM_RELUX(1, 64, 256, 256);
 namespace {
 template <DeviceType D, typename T>
@@ -212,25 +214,26 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
 }
 } // namespace
-#define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \
-static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
+#define MACE_BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \
+static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
 } \
-BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_PRELU(N, C, H, W) \
-BM_PRELU_MACRO(N, C, H, W, float, CPU); \
-BM_PRELU_MACRO(N, C, H, W, float, GPU); \
-BM_PRELU_MACRO(N, C, H, W, half, GPU);
-BM_PRELU(1, 1, 512, 512);
-BM_PRELU(1, 3, 128, 128);
-BM_PRELU(1, 3, 512, 512);
-BM_PRELU(1, 32, 112, 112);
-BM_PRELU(1, 64, 256, 256);
+MACE_BENCHMARK(MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_PRELU(N, C, H, W) \
+MACE_BM_PRELU_MACRO(N, C, H, W, float, CPU); \
+MACE_BM_PRELU_MACRO(N, C, H, W, float, GPU); \
+MACE_BM_PRELU_MACRO(N, C, H, W, half, GPU);
+MACE_BM_PRELU(1, 1, 512, 512);
+MACE_BM_PRELU(1, 3, 128, 128);
+MACE_BM_PRELU(1, 3, 512, 512);
+MACE_BM_PRELU(1, 32, 112, 112);
+MACE_BM_PRELU(1, 64, 256, 256);
 namespace {
 template <DeviceType D, typename T>
@@ -277,25 +280,26 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
 }
 } // namespace
-#define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \
-static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
+#define MACE_BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \
+static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
 } \
-BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_TANH(N, C, H, W) \
-BM_TANH_MACRO(N, C, H, W, float, CPU); \
-BM_TANH_MACRO(N, C, H, W, float, GPU); \
-BM_TANH_MACRO(N, C, H, W, half, GPU);
-BM_TANH(1, 1, 512, 512);
-BM_TANH(1, 3, 128, 128);
-BM_TANH(1, 3, 512, 512);
-BM_TANH(1, 32, 112, 112);
-BM_TANH(1, 64, 256, 256);
+MACE_BENCHMARK(MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_TANH(N, C, H, W) \
+MACE_BM_TANH_MACRO(N, C, H, W, float, CPU); \
+MACE_BM_TANH_MACRO(N, C, H, W, float, GPU); \
+MACE_BM_TANH_MACRO(N, C, H, W, half, GPU);
+MACE_BM_TANH(1, 1, 512, 512);
+MACE_BM_TANH(1, 3, 128, 128);
+MACE_BM_TANH(1, 3, 512, 512);
+MACE_BM_TANH(1, 32, 112, 112);
+MACE_BM_TANH(1, 64, 256, 256);
 namespace {
 template <DeviceType D, typename T>
@@ -343,26 +347,26 @@ void SigmoidBenchmark(
 }
 } // namespace
-#define BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE) \
-static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+#define MACE_BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE) \
+static void MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
 int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
 } \
-BENCHMARK(BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_SIGMOID(N, C, H, W) \
-BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
-BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \
-BM_SIGMOID_MACRO(N, C, H, W, half, GPU);
-BM_SIGMOID(1, 1, 512, 512);
-BM_SIGMOID(1, 3, 128, 128);
-BM_SIGMOID(1, 3, 512, 512);
-BM_SIGMOID(1, 32, 112, 112);
-BM_SIGMOID(1, 64, 256, 256);
+MACE_BENCHMARK(MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_SIGMOID(N, C, H, W) \
+MACE_BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
+MACE_BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \
+MACE_BM_SIGMOID_MACRO(N, C, H, W, half, GPU);
+MACE_BM_SIGMOID(1, 1, 512, 512);
+MACE_BM_SIGMOID(1, 3, 128, 128);
+MACE_BM_SIGMOID(1, 3, 512, 512);
+MACE_BM_SIGMOID(1, 32, 112, 112);
+MACE_BM_SIGMOID(1, 64, 256, 256);
 } // namespace test
 } // namespace ops
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_AddN(OperatorRegistry *op_registry) {
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
 .Device(DeviceType::CPU)
 .TypeConstraint<float>("T")
 .Build(),
 AddNOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
 .Device(DeviceType::GPU)
 .TypeConstraint<float>("T")
 .Build(),
 AddNOp<DeviceType::GPU, float>);
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
 .Device(DeviceType::GPU)
 .TypeConstraint<half>("T")
 .Build(),
 AddNOp<DeviceType::GPU, half>);
 #endif // MACE_ENABLE_OPENCL
 }
...
@@ -70,26 +70,28 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
 }
 } // namespace
-#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
-static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
-int iters) { \
+#define MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
+static void \
+MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
+int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
 } \
-BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
-#define BM_ADDN(INPUTS, N, H, W, C) \
-BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
-BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \
-BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU);
-BM_ADDN(2, 1, 256, 256, 32);
-BM_ADDN(2, 1, 128, 128, 32);
-BM_ADDN(4, 1, 128, 128, 3);
-BM_ADDN(2, 1, 256, 256, 3);
-BM_ADDN(2, 1, 512, 512, 3);
+MACE_BENCHMARK( \
+MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
+#define MACE_BM_ADDN(INPUTS, N, H, W, C) \
+MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
+MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \
+MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU);
+MACE_BM_ADDN(2, 1, 256, 256, 32);
+MACE_BM_ADDN(2, 1, 128, 128, 32);
+MACE_BM_ADDN(4, 1, 128, 128, 3);
+MACE_BM_ADDN(2, 1, 256, 256, 3);
+MACE_BM_ADDN(2, 1, 512, 512, 3);
 } // namespace test
 } // namespace ops
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_BatchNorm(OperatorRegistry *op_registry) {
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
 .Device(DeviceType::CPU)
 .TypeConstraint<float>("T")
 .Build(),
 BatchNormOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
 .Device(DeviceType::GPU)
 .TypeConstraint<float>("T")
 .Build(),
 BatchNormOp<DeviceType::GPU, float>);
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
 .Device(DeviceType::GPU)
 .TypeConstraint<half>("T")
 .Build(),
 BatchNormOp<DeviceType::GPU, half>);
 #endif // MACE_ENABLE_OPENCL
 }
...
@@ -60,8 +60,8 @@ class BatchNormOp : public Operator<D, T> {
 kernels::BatchNormFunctor<D, T> functor_;
 protected:
-OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
-OP_OUTPUT_TAGS(OUTPUT);
+MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
+MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 } // namespace ops
...
@@ -95,34 +95,34 @@ void BatchNorm(
 }
 } // namespace
-#define BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \
-static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+#define MACE_BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \
+static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
 int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
 } \
-BENCHMARK(BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_BATCH_NORM(N, C, H, W) \
-BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
-BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \
-BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU);
-BM_BATCH_NORM(1, 1, 512, 512);
-BM_BATCH_NORM(1, 3, 128, 128);
-BM_BATCH_NORM(1, 3, 512, 512);
-BM_BATCH_NORM(1, 32, 112, 112);
-BM_BATCH_NORM(1, 64, 256, 256);
-BM_BATCH_NORM(1, 64, 512, 512);
-BM_BATCH_NORM(1, 128, 56, 56);
-BM_BATCH_NORM(1, 128, 256, 256);
-BM_BATCH_NORM(1, 256, 14, 14);
-BM_BATCH_NORM(1, 512, 14, 14);
-BM_BATCH_NORM(1, 1024, 7, 7);
-BM_BATCH_NORM(32, 1, 256, 256);
-BM_BATCH_NORM(32, 3, 256, 256);
+MACE_BENCHMARK(MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_BATCH_NORM(N, C, H, W) \
+MACE_BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
+MACE_BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \
+MACE_BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU);
+MACE_BM_BATCH_NORM(1, 1, 512, 512);
+MACE_BM_BATCH_NORM(1, 3, 128, 128);
+MACE_BM_BATCH_NORM(1, 3, 512, 512);
+MACE_BM_BATCH_NORM(1, 32, 112, 112);
+MACE_BM_BATCH_NORM(1, 64, 256, 256);
+MACE_BM_BATCH_NORM(1, 64, 512, 512);
+MACE_BM_BATCH_NORM(1, 128, 56, 56);
+MACE_BM_BATCH_NORM(1, 128, 256, 256);
+MACE_BM_BATCH_NORM(1, 256, 14, 14);
+MACE_BM_BATCH_NORM(1, 512, 14, 14);
+MACE_BM_BATCH_NORM(1, 1024, 7, 7);
+MACE_BM_BATCH_NORM(32, 1, 256, 256);
+MACE_BM_BATCH_NORM(32, 3, 256, 256);
 } // namespace test
 } // namespace ops
...
@@ -18,22 +18,22 @@ namespace mace {
 namespace ops {
 void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
 .Device(DeviceType::CPU)
 .TypeConstraint<float>("T")
 .Build(),
 BatchToSpaceNDOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
 .Device(DeviceType::GPU)
 .TypeConstraint<float>("T")
 .Build(),
 BatchToSpaceNDOp<DeviceType::GPU, float>);
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
 .Device(DeviceType::GPU)
 .TypeConstraint<half>("T")
 .Build(),
 BatchToSpaceNDOp<DeviceType::GPU, half>);
 #endif // MACE_ENABLE_OPENCL
 }
...
@@ -44,8 +44,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
 kernels::SpaceToBatchFunctor<D, T> functor_;
 protected:
-OP_INPUT_TAGS(INPUT);
-OP_OUTPUT_TAGS(OUTPUT);
+MACE_OP_INPUT_TAGS(INPUT);
+MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 } // namespace ops
...
@@ -64,25 +64,26 @@ void BMBatchToSpace(
 }
 } // namespace
-#define BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE) \
+#define MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE) \
 static void \
-BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \
+MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\
 int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
 } \
-BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
-#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
-BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \
-BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU);
-BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
-BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
-BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
-BM_BATCH_TO_SPACE(64, 32, 32, 32, 8);
+MACE_BENCHMARK( \
+MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
+#define MACE_BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
+MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \
+MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU);
+MACE_BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
+MACE_BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
+MACE_BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
+MACE_BM_BATCH_TO_SPACE(64, 32, 32, 32, 8);
 } // namespace test
 } // namespace ops
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_BiasAdd(OperatorRegistry *op_registry) {
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
 .Device(DeviceType::CPU)
 .TypeConstraint<float>("T")
 .Build(),
 BiasAddOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
 .Device(DeviceType::GPU)
 .TypeConstraint<float>("T")
 .Build(),
 BiasAddOp<DeviceType::GPU, float>);
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
 .Device(DeviceType::GPU)
 .TypeConstraint<half>("T")
 .Build(),
 BiasAddOp<DeviceType::GPU, half>);
 #endif // MACE_ENABLE_OPENCL
 }
...
@@ -46,8 +46,8 @@ class BiasAddOp : public Operator<D, T> {
 kernels::BiasAddFunctor<D, T> functor_;
 protected:
-OP_INPUT_TAGS(INPUT, BIAS);
-OP_OUTPUT_TAGS(OUTPUT);
+MACE_OP_INPUT_TAGS(INPUT, BIAS);
+MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 } // namespace ops
...
@@ -72,34 +72,34 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
 }
 } // namespace
-#define BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE) \
-static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+#define MACE_BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE) \
+static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
 int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
 } \
-BENCHMARK(BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_BIAS_ADD(N, C, H, W) \
-BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
-BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
-BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);
-BM_BIAS_ADD(1, 1, 512, 512);
-BM_BIAS_ADD(1, 3, 128, 128);
-BM_BIAS_ADD(1, 3, 512, 512);
-BM_BIAS_ADD(1, 32, 112, 112);
-BM_BIAS_ADD(1, 64, 256, 256);
-BM_BIAS_ADD(1, 64, 512, 512);
-BM_BIAS_ADD(1, 128, 56, 56);
-BM_BIAS_ADD(1, 128, 256, 256);
-BM_BIAS_ADD(1, 256, 14, 14);
-BM_BIAS_ADD(1, 512, 14, 14);
-BM_BIAS_ADD(1, 1024, 7, 7);
-BM_BIAS_ADD(32, 1, 256, 256);
-BM_BIAS_ADD(32, 3, 256, 256);
+MACE_BENCHMARK(MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_BIAS_ADD(N, C, H, W) \
+MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
+MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
+MACE_BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);
+MACE_BM_BIAS_ADD(1, 1, 512, 512);
+MACE_BM_BIAS_ADD(1, 3, 128, 128);
+MACE_BM_BIAS_ADD(1, 3, 512, 512);
+MACE_BM_BIAS_ADD(1, 32, 112, 112);
+MACE_BM_BIAS_ADD(1, 64, 256, 256);
+MACE_BM_BIAS_ADD(1, 64, 512, 512);
+MACE_BM_BIAS_ADD(1, 128, 56, 56);
+MACE_BM_BIAS_ADD(1, 128, 256, 256);
+MACE_BM_BIAS_ADD(1, 256, 14, 14);
+MACE_BM_BIAS_ADD(1, 512, 14, 14);
+MACE_BM_BIAS_ADD(1, 1024, 7, 7);
+MACE_BM_BIAS_ADD(32, 1, 256, 256);
+MACE_BM_BIAS_ADD(32, 3, 256, 256);
 } // namespace test
 } // namespace ops
...
@@ -18,17 +18,17 @@ namespace mace {
 namespace ops {
 void Register_BufferToImage(OperatorRegistry *op_registry) {
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
 .Device(DeviceType::GPU)
 .TypeConstraint<float>("T")
 .Build(),
 BufferToImageOp<DeviceType::GPU, float>);
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
 .Device(DeviceType::GPU)
 .TypeConstraint<half>("T")
 .Build(),
 BufferToImageOp<DeviceType::GPU, half>);
 }
 } // namespace ops
...
@@ -42,8 +42,8 @@ class BufferToImageOp : public Operator<D, T> {
 kernels::BufferToImageFunctor<D, T> functor_;
 protected:
-OP_INPUT_TAGS(INPUT);
-OP_OUTPUT_TAGS(OUTPUT);
+MACE_OP_INPUT_TAGS(INPUT);
+MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 } // namespace ops
...
@@ -54,36 +54,36 @@ void FilterBufferToImage(int iters,
 }
 } // namespace
-#define BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \
-static void BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \
+#define MACE_BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \
+static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \
 int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 FilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W); \
 } \
-BENCHMARK(BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_B2I(O, I, H, W) \
-BM_B2I_MACRO(O, I, H, W, float, GPU); \
-BM_B2I_MACRO(O, I, H, W, half, GPU);
-BM_B2I(5, 3, 3, 3);
-BM_B2I(5, 3, 7, 7);
-BM_B2I(32, 16, 1, 1);
-BM_B2I(32, 16, 3, 3);
-BM_B2I(32, 16, 5, 5);
-BM_B2I(32, 16, 7, 7);
-BM_B2I(64, 32, 1, 1);
-BM_B2I(64, 32, 3, 3);
-BM_B2I(64, 32, 5, 5);
-BM_B2I(64, 32, 7, 7);
-BM_B2I(128, 64, 1, 1);
-BM_B2I(128, 64, 3, 3);
-BM_B2I(128, 32, 1, 1);
-BM_B2I(128, 32, 3, 3);
-BM_B2I(256, 32, 1, 1);
-BM_B2I(256, 32, 3, 3);
+MACE_BENCHMARK(MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_B2I(O, I, H, W) \
+MACE_BM_B2I_MACRO(O, I, H, W, float, GPU); \
+MACE_BM_B2I_MACRO(O, I, H, W, half, GPU);
+MACE_BM_B2I(5, 3, 3, 3);
+MACE_BM_B2I(5, 3, 7, 7);
+MACE_BM_B2I(32, 16, 1, 1);
+MACE_BM_B2I(32, 16, 3, 3);
+MACE_BM_B2I(32, 16, 5, 5);
+MACE_BM_B2I(32, 16, 7, 7);
+MACE_BM_B2I(64, 32, 1, 1);
+MACE_BM_B2I(64, 32, 3, 3);
+MACE_BM_B2I(64, 32, 5, 5);
+MACE_BM_B2I(64, 32, 7, 7);
+MACE_BM_B2I(128, 64, 1, 1);
+MACE_BM_B2I(128, 64, 3, 3);
+MACE_BM_B2I(128, 32, 1, 1);
+MACE_BM_B2I(128, 32, 3, 3);
+MACE_BM_B2I(256, 32, 1, 1);
+MACE_BM_B2I(256, 32, 3, 3);
 } // namespace test
 } // namespace ops
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_ChannelShuffle(OperatorRegistry *op_registry) {
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
 .Device(DeviceType::CPU)
 .TypeConstraint<float>("T")
 .Build(),
 ChannelShuffleOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
 .Device(DeviceType::GPU)
 .TypeConstraint<float>("T")
 .Build(),
 ChannelShuffleOp<DeviceType::GPU, float>);
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
 .Device(DeviceType::GPU)
 .TypeConstraint<half>("T")
 .Build(),
 ChannelShuffleOp<DeviceType::GPU, half>);
 #endif // MACE_ENABLE_OPENCL
 }
...
@@ -50,8 +50,8 @@ class ChannelShuffleOp : public Operator<D, T> {
 protected:
 const int group_;
-OP_INPUT_TAGS(INPUT);
-OP_OUTPUT_TAGS(OUTPUT);
+MACE_OP_INPUT_TAGS(INPUT);
+MACE_OP_OUTPUT_TAGS(OUTPUT);
 private:
 kernels::ChannelShuffleFunctor<D, T> functor_;
...
@@ -69,25 +69,26 @@ void ChannelShuffle(
 }
 } // namespace
-#define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
+#define MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
 static void \
-BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
+MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
 int iters) { \
 const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
 mace::testing::MaccProcessed(tot); \
 mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
 ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G); \
 } \
-BENCHMARK(BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
-#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
-BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
-BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \
-BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU);
-BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
-BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
-BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8);
+MACE_BENCHMARK( \
+MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
+#define MACE_BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
+MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
+MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \
+MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU);
+MACE_BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
+MACE_BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
+MACE_BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8);
 } // namespace test
 } // namespace ops
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_Concat(OperatorRegistry *op_registry) {
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
 .Device(DeviceType::CPU)
 .TypeConstraint<float>("T")
 .Build(),
 ConcatOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
 .Device(DeviceType::GPU)
 .TypeConstraint<float>("T")
 .Build(),
 ConcatOp<DeviceType::GPU, float>);
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
 .Device(DeviceType::GPU)
 .TypeConstraint<half>("T")
 .Build(),
 ConcatOp<DeviceType::GPU, half>);
 #endif // MACE_ENABLE_OPENCL
 }
...
@@ -51,7 +51,7 @@ class ConcatOp : public Operator<D, T> {
 kernels::ConcatFunctor<D, T> functor_;
 private:
-OP_OUTPUT_TAGS(OUTPUT);
+MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 } // namespace ops
...
@@ -52,16 +52,16 @@ void ConcatHelper(int iters, int concat_dim, int dim1) {
 }
 } // namespace
-#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
-static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \
+#define MACE_BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
+static void MACE_BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \
 ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \
 } \
-BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1)
-BM_CONCAT_CPU_MACRO(0, 1000);
-BM_CONCAT_CPU_MACRO(0, 100000);
-BM_CONCAT_CPU_MACRO(1, 1000);
-BM_CONCAT_CPU_MACRO(1, 100000);
+MACE_BENCHMARK(MACE_BM_CONCAT_CPU_##DIM0##_##DIM1)
+MACE_BM_CONCAT_CPU_MACRO(0, 1000);
+MACE_BM_CONCAT_CPU_MACRO(0, 100000);
+MACE_BM_CONCAT_CPU_MACRO(1, 1000);
+MACE_BM_CONCAT_CPU_MACRO(1, 100000);
 namespace {
 template <typename T>
@@ -106,22 +106,22 @@ void OpenclConcatHelper(int iters,
 }
 } // namespace
-#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
-static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
+#define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
+static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\
 std::vector<index_t> shape = {N, H, W, C}; \
 OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
 } \
-BENCHMARK(BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, float);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, float);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, float);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, half);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half);
+MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, float);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, float);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, float);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, half);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half);
 } // namespace test
 } // namespace ops
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_Conv2D(OperatorRegistry *op_registry) {
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
 .Device(DeviceType::CPU)
 .TypeConstraint<float>("T")
 .Build(),
 Conv2dOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
 .Device(DeviceType::GPU)
 .TypeConstraint<float>("T")
 .Build(),
 Conv2dOp<DeviceType::GPU, float>);
-REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
+MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
 .Device(DeviceType::GPU)
 .TypeConstraint<half>("T")
 .Build(),
 Conv2dOp<DeviceType::GPU, half>);
 #endif // MACE_ENABLE_OPENCL
 }
...
...@@ -54,8 +54,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -54,8 +54,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
kernels::Conv2dFunctor<D, T> functor_; kernels::Conv2dFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS); MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -105,11 +105,11 @@ void Conv2d(int iters, ...@@ -105,11 +105,11 @@ void Conv2d(int iters,
// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
// in-order. // in-order.
#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, \ #define MACE_BM_CONV_2D_MACRO( \
DEVICE) \ N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, DEVICE) \
static void \ static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\ MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##\
##_##P##_##OC##_##TYPE##_##DEVICE( \ DILATION##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \ int64_t pad_h = 0, pad_w = 0; \
...@@ -128,54 +128,53 @@ void Conv2d(int iters, ...@@ -128,54 +128,53 @@ void Conv2d(int iters,
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \ Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \
mace::Padding::P, OC); \ mace::Padding::P, OC); \
} \ } \
BENCHMARK( \ MACE_BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\ MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##\
##_##P##_##OC##_##TYPE##_##DEVICE) DILATION##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \ #define MACE_BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \ MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \ MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU); MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU);
// Filter sizes and data alignments // Filter sizes and data alignments
BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128); MACE_BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128); MACE_BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128); MACE_BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, 1, SAME, 128); MACE_BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128); MACE_BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128); MACE_BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128); MACE_BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128); MACE_BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128); MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128); MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128); MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128);
// 3 channels input // 3 channels input
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3); MACE_BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3);
BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32); MACE_BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32);
BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32); MACE_BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32);
// Dilations // Dilations
BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32); MACE_BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32);
BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32); MACE_BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
// MobileNet // MobileNet
BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128); MACE_BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128);
BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024); MACE_BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024);
BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32); MACE_BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32); MACE_BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
BM_CONV_2D(1, 192, 17, 17, 1, 7, 1, 1, SAME, 192); MACE_BM_CONV_2D(1, 192, 17, 17, 1, 7, 1, 1, SAME, 192);
BM_CONV_2D(1, 192, 17, 17, 7, 1, 1, 1, SAME, 192); MACE_BM_CONV_2D(1, 192, 17, 17, 7, 1, 1, 1, SAME, 192);
BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192); MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192);
BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2);
BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2);
BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
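For readers tracing the re-wrapped token pasting above: each MACE_BM_CONV_2D invocation still registers a CPU float, a GPU float and a GPU half benchmark, and only the generated names gain the new prefix. Hand-expanding the first invocation, MACE_BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128), should give names of the form:

MACE_BM_CONV_2D_1_64_32_32_K1x1S1D1_VALID_128_float_CPU
MACE_BM_CONV_2D_1_64_32_32_K1x1S1D1_VALID_128_float_GPU
MACE_BM_CONV_2D_1_64_32_32_K1x1S1D1_VALID_128_half_GPU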
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_Deconv2D(OperatorRegistry *op_registry) { void Register_Deconv2D(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
Deconv2dOp<DeviceType::CPU, float>); Deconv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
Deconv2dOp<DeviceType::GPU, float>); Deconv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
Deconv2dOp<DeviceType::GPU, half>); Deconv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -49,8 +49,8 @@ class Deconv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -49,8 +49,8 @@ class Deconv2dOp : public ConvPool2dOpBase<D, T> {
kernels::Deconv2dFunctor<D, T> functor_; kernels::Deconv2dFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS); MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -94,11 +94,11 @@ static void Deconv2d(int iters, ...@@ -94,11 +94,11 @@ static void Deconv2d(int iters,
// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
// in-order. // in-order.
#define BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, OH, OW, P, OC, TYPE, \ #define MACE_BM_DECONV_2D_MACRO( \
DEVICE) \ N, C, H, W, KH, KW, STRIDE, OH, OW, P, OC, TYPE, DEVICE) \
static void \ static void \
BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_##OW\ MACE_BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_\
##_##P##_##OC##_##TYPE##_##DEVICE( \ ##OW##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t oh = OH; \ int64_t oh = OH; \
...@@ -110,30 +110,30 @@ static void Deconv2d(int iters, ...@@ -110,30 +110,30 @@ static void Deconv2d(int iters,
Deconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \ Deconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \
mace::Padding::P, OC); \ mace::Padding::P, OC); \
} \ } \
BENCHMARK( \ MACE_BENCHMARK( \
BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_##OW##\ MACE_BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_\
_##P##_##OC##_##TYPE##_##DEVICE) ##OW##_##P##_##OC##_##TYPE##_##DEVICE)
// TODO(liutuo): add cpu benchmark when optimized. // TODO(liutuo): add cpu benchmark when optimized.
#define BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \ #define MACE_BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \
BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \ MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU); MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);
BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256); MACE_BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256);
BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128); MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128);
BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128); MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128);
BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32); MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32);
BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32); MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32);
BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32); MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32);
BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32); MACE_BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32);
BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3); MACE_BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3);
BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128); MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128);
BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128); MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128);
BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32); MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32);
BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32); MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_DepthToSpace(OperatorRegistry *op_registry) { void Register_DepthToSpace(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
DepthToSpaceOp<DeviceType::CPU, float>); DepthToSpaceOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
DepthToSpaceOp<DeviceType::GPU, float>); DepthToSpaceOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
DepthToSpaceOp<DeviceType::GPU, half>); DepthToSpaceOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -55,8 +55,8 @@ class DepthToSpaceOp : public Operator<D, T> { ...@@ -55,8 +55,8 @@ class DepthToSpaceOp : public Operator<D, T> {
protected: protected:
const int block_size_; const int block_size_;
OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
private: private:
kernels::DepthToSpaceOpFunctor<D, T> functor_; kernels::DepthToSpaceOpFunctor<D, T> functor_;
......
...@@ -69,25 +69,26 @@ void DepthToSpace( ...@@ -69,25 +69,26 @@ void DepthToSpace(
} }
} // namespace } // namespace
#define BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE) \ #define MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void \ static void \
BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G); \ DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \ } \
BENCHMARK(BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE) MACE_BENCHMARK( \
MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
#define BM_DEPTH_TO_SPACE(N, C, H, W, G) \ #define MACE_BM_DEPTH_TO_SPACE(N, C, H, W, G) \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \ MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \ MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU); MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);
BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4); MACE_BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4); MACE_BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4); MACE_BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_DepthwiseConv2d(OperatorRegistry *op_registry) { void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
DepthwiseConv2dOp<DeviceType::CPU, float>); DepthwiseConv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
DepthwiseConv2dOp<DeviceType::GPU, float>); DepthwiseConv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
DepthwiseConv2dOp<DeviceType::GPU, half>); DepthwiseConv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -55,8 +55,8 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -55,8 +55,8 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
kernels::DepthwiseConv2dFunctor<D, T> functor_; kernels::DepthwiseConv2dFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS); MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -101,61 +101,61 @@ void DepthwiseConv2d(int iters, ...@@ -101,61 +101,61 @@ void DepthwiseConv2d(int iters,
} }
} // namespace } // namespace
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \ #define MACE_BM_DEPTHWISE_CONV_2D_MACRO( \
DEVICE) \ N, C, H, W, KH, KW, STRIDE, P, M, TYPE, DEVICE) \
static void \ static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\ MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE\
##P##_##M##_##TYPE##_##DEVICE( \ ##_##P##_##M##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t dilation = 1; \ const int64_t dilation = 1; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \ int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \ if (P == SAME) { \
pad_h = KH / 2; \ pad_h = KH / 2; \
pad_w = KW / 2; \ pad_w = KW / 2; \
} \ } \
int64_t oh = \ int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \ (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \ int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \ (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \ const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \ static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \ mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \ DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \ mace::Padding::P, M); \
} \ } \
BENCHMARK( \ MACE_BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\ MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE\
##P##_##M##_##TYPE##_##DEVICE) ##_##P##_##M##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \ #define MACE_BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \ MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU); MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1); MACE_BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1);
} // namespace test } // namespace test
......
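A hand-worked example of the metric arithmetic in the depthwise macro above (illustrative only, not part of the patch; values are per iteration, dilation is fixed at 1 as in the macro, and the constexpr names below are just for this sketch):

#include <cstdint>

// MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1):
constexpr std::int64_t kPad = 3 / 2;                          // SAME padding -> 1
constexpr std::int64_t kOutH = (112 + 2 * kPad - 3) / 1 + 1;  // 112
constexpr std::int64_t kOutW = (112 + 2 * kPad - 3) / 1 + 1;  // 112
constexpr std::int64_t kMacc =
    1 * 32 * 1 * kOutH * kOutW * (3 * 3 + 1);                 // 4,014,080 MACCs

So MaccProcessed grows by roughly 4 million multiply-accumulates per iteration for this shape.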
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_Eltwise(OperatorRegistry *op_registry) { void Register_Eltwise(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
EltwiseOp<DeviceType::CPU, float>); EltwiseOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
EltwiseOp<DeviceType::GPU, float>); EltwiseOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
EltwiseOp<DeviceType::GPU, half>); EltwiseOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -43,7 +43,7 @@ class EltwiseOp : public Operator<D, T> { ...@@ -43,7 +43,7 @@ class EltwiseOp : public Operator<D, T> {
kernels::EltwiseFunctor<D, T> functor_; kernels::EltwiseFunctor<D, T> functor_;
private: private:
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -76,30 +76,31 @@ void EltwiseBenchmark( ...@@ -76,30 +76,31 @@ void EltwiseBenchmark(
} }
} // namespace } // namespace
#define BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \ #define MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \
static void \ static void \
BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \ const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \ EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \ iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
} \ } \
BENCHMARK(BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) MACE_BENCHMARK( \
MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ELTWISE(ELT_TYPE, N, H, W, C) \ #define MACE_BM_ELTWISE(ELT_TYPE, N, H, W, C) \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \ MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \ MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU); MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU);
BM_ELTWISE(2, 1, 128, 128, 32); MACE_BM_ELTWISE(2, 1, 128, 128, 32);
BM_ELTWISE(2, 1, 240, 240, 256); MACE_BM_ELTWISE(2, 1, 240, 240, 256);
BM_ELTWISE(2, 1, 256, 256, 32); MACE_BM_ELTWISE(2, 1, 256, 256, 32);
BM_ELTWISE(0, 1, 128, 128, 32); MACE_BM_ELTWISE(0, 1, 128, 128, 32);
BM_ELTWISE(0, 1, 240, 240, 256); MACE_BM_ELTWISE(0, 1, 240, 240, 256);
BM_ELTWISE(5, 1, 128, 128, 32); MACE_BM_ELTWISE(5, 1, 128, 128, 32);
BM_ELTWISE(5, 1, 240, 240, 256); MACE_BM_ELTWISE(5, 1, 240, 240, 256);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
FoldedBatchNormOp<DeviceType::CPU, float>); FoldedBatchNormOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
FoldedBatchNormOp<DeviceType::GPU, float>); FoldedBatchNormOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
FoldedBatchNormOp<DeviceType::GPU, half>); FoldedBatchNormOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -56,8 +56,8 @@ class FoldedBatchNormOp : public Operator<D, T> { ...@@ -56,8 +56,8 @@ class FoldedBatchNormOp : public Operator<D, T> {
kernels::BatchNormFunctor<D, T> functor_; kernels::BatchNormFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_FullyConnected(OperatorRegistry *op_registry) { void Register_FullyConnected(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
FullyConnectedOp<DeviceType::CPU, float>); FullyConnectedOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
FullyConnectedOp<DeviceType::GPU, float>); FullyConnectedOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
FullyConnectedOp<DeviceType::GPU, half>); FullyConnectedOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -72,8 +72,8 @@ class FullyConnectedOp : public Operator<D, T> { ...@@ -72,8 +72,8 @@ class FullyConnectedOp : public Operator<D, T> {
kernels::FullyConnectedFunctor<D, T> functor_; kernels::FullyConnectedFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -82,28 +82,28 @@ void FCBenchmark( ...@@ -82,28 +82,28 @@ void FCBenchmark(
} }
} // namespace } // namespace
#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \ #define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \ static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t macc = \ const int64_t macc = \
static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \ static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
const int64_t tot = \ const int64_t tot = \
static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \ static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
mace::testing::MaccProcessed(macc); \ mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \ FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
} \ } \
BENCHMARK(BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE) MACE_BENCHMARK(MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE)
#define BM_FC(N, H, W, C, OC) \ #define MACE_BM_FC(N, H, W, C, OC) \
BM_FC_MACRO(N, H, W, C, OC, float, CPU); \ MACE_BM_FC_MACRO(N, H, W, C, OC, float, CPU); \
BM_FC_MACRO(N, H, W, C, OC, float, GPU); \ MACE_BM_FC_MACRO(N, H, W, C, OC, float, GPU); \
BM_FC_MACRO(N, H, W, C, OC, half, GPU); MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU);
BM_FC(1, 16, 16, 32, 32); MACE_BM_FC(1, 16, 16, 32, 32);
BM_FC(1, 8, 8, 32, 1000); MACE_BM_FC(1, 8, 8, 32, 1000);
BM_FC(1, 2, 2, 512, 2); MACE_BM_FC(1, 2, 2, 512, 2);
BM_FC(1, 7, 7, 512, 2048); MACE_BM_FC(1, 7, 7, 512, 2048);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
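Similarly, a hand-worked example for the fully-connected macro above (illustrative only, not part of the patch; taking iters == 1 so the values are per iteration, constexpr names are just for this sketch):

#include <cstdint>

// MACE_BM_FC(1, 16, 16, 32, 32), i.e. N=1, H=16, W=16, C=32, OC=32:
constexpr std::int64_t kMacc = std::int64_t{1} * 32 * 16 * 16 * 32 + 32;   // 262,176
constexpr std::int64_t kElems = std::int64_t{1 + 32} * 32 * 16 * 16 + 32;  // 270,368

// BytesProcessed then scales kElems by sizeof(TYPE): 4 bytes for float, 2 for half.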
...@@ -18,17 +18,17 @@ namespace mace { ...@@ -18,17 +18,17 @@ namespace mace {
namespace ops { namespace ops {
void Register_ImageToBuffer(OperatorRegistry *op_registry) { void Register_ImageToBuffer(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ImageToBufferOp<DeviceType::GPU, float>); ImageToBufferOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
ImageToBufferOp<DeviceType::GPU, half>); ImageToBufferOp<DeviceType::GPU, half>);
} }
} // namespace ops } // namespace ops
......
...@@ -41,8 +41,8 @@ class ImageToBufferOp : public Operator<D, T> { ...@@ -41,8 +41,8 @@ class ImageToBufferOp : public Operator<D, T> {
kernels::ImageToBufferFunctor<D, T> functor_; kernels::ImageToBufferFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -18,11 +18,11 @@ namespace mace { ...@@ -18,11 +18,11 @@ namespace mace {
namespace ops { namespace ops {
void Register_LocalResponseNorm(OperatorRegistry *op_registry) { void Register_LocalResponseNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
LocalResponseNormOp<DeviceType::CPU, float>); LocalResponseNormOp<DeviceType::CPU, float>);
} }
} // namespace ops } // namespace ops
......
...@@ -53,8 +53,8 @@ class LocalResponseNormOp : public Operator<D, T> { ...@@ -53,8 +53,8 @@ class LocalResponseNormOp : public Operator<D, T> {
kernels::LocalResponseNormFunctor<D, T> functor_; kernels::LocalResponseNormFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -55,32 +55,34 @@ static void LocalResponseNorm( ...@@ -55,32 +55,34 @@ static void LocalResponseNorm(
net.Sync(); net.Sync();
} }
#define BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \ #define MACE_BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\ static void \
int iters) { \ MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
LocalResponseNorm<DEVICE, TYPE>(iters, N, C, H, W); \ LocalResponseNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
BENCHMARK(BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) MACE_BENCHMARK( \
MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_LOCAL_RESPONSE_NORM(N, C, H, W) \ #define MACE_BM_LOCAL_RESPONSE_NORM(N, C, H, W) \
BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU); MACE_BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU);
BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512); MACE_BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512);
BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128); MACE_BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128);
BM_LOCAL_RESPONSE_NORM(1, 3, 512, 512); MACE_BM_LOCAL_RESPONSE_NORM(1, 3, 512, 512);
BM_LOCAL_RESPONSE_NORM(1, 32, 112, 112); MACE_BM_LOCAL_RESPONSE_NORM(1, 32, 112, 112);
BM_LOCAL_RESPONSE_NORM(1, 64, 256, 256); MACE_BM_LOCAL_RESPONSE_NORM(1, 64, 256, 256);
BM_LOCAL_RESPONSE_NORM(1, 64, 512, 512); MACE_BM_LOCAL_RESPONSE_NORM(1, 64, 512, 512);
BM_LOCAL_RESPONSE_NORM(1, 128, 56, 56); MACE_BM_LOCAL_RESPONSE_NORM(1, 128, 56, 56);
BM_LOCAL_RESPONSE_NORM(1, 128, 256, 256); MACE_BM_LOCAL_RESPONSE_NORM(1, 128, 256, 256);
BM_LOCAL_RESPONSE_NORM(1, 256, 14, 14); MACE_BM_LOCAL_RESPONSE_NORM(1, 256, 14, 14);
BM_LOCAL_RESPONSE_NORM(1, 512, 14, 14); MACE_BM_LOCAL_RESPONSE_NORM(1, 512, 14, 14);
BM_LOCAL_RESPONSE_NORM(1, 1024, 7, 7); MACE_BM_LOCAL_RESPONSE_NORM(1, 1024, 7, 7);
BM_LOCAL_RESPONSE_NORM(32, 1, 256, 256); MACE_BM_LOCAL_RESPONSE_NORM(32, 1, 256, 256);
BM_LOCAL_RESPONSE_NORM(32, 3, 256, 256); MACE_BM_LOCAL_RESPONSE_NORM(32, 3, 256, 256);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_MatMul(OperatorRegistry *op_registry) { void Register_MatMul(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
MatMulOp<DeviceType::CPU, float>); MatMulOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
MatMulOp<DeviceType::GPU, float>); MatMulOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
MatMulOp<DeviceType::GPU, half>); MatMulOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -67,27 +67,28 @@ void MatMulBenchmark( ...@@ -67,27 +67,28 @@ void MatMulBenchmark(
} }
} // namespace } // namespace
#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \ #define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \ static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \ const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::MaccProcessed(macc); \ mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \ MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \ } \
BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE) MACE_BENCHMARK(MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
#define BM_MATMUL(N, H, C, W) \ #define MACE_BM_MATMUL(N, H, C, W) \
BM_MATMUL_MACRO(N, H, C, W, float, CPU); \ MACE_BM_MATMUL_MACRO(N, H, C, W, float, CPU); \
BM_MATMUL_MACRO(N, H, C, W, float, GPU); \ MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU); \
BM_MATMUL_MACRO(N, H, C, W, half, GPU); MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU);
BM_MATMUL(16, 32, 128, 49); MACE_BM_MATMUL(16, 32, 128, 49);
BM_MATMUL(16, 32, 128, 961); MACE_BM_MATMUL(16, 32, 128, 961);
BM_MATMUL(16, 32, 128, 3969); MACE_BM_MATMUL(16, 32, 128, 3969);
BM_MATMUL(16, 128, 128, 49); MACE_BM_MATMUL(16, 128, 128, 49);
BM_MATMUL(16, 128, 128, 961); MACE_BM_MATMUL(16, 128, 128, 961);
BM_MATMUL(16, 128, 128, 3969); MACE_BM_MATMUL(16, 128, 128, 3969);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,23 +18,23 @@ namespace mace { ...@@ -18,23 +18,23 @@ namespace mace {
namespace ops { namespace ops {
void Register_Pad(OperatorRegistry *op_registry) { void Register_Pad(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
PadOp<DeviceType::CPU, float>); PadOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
PadOp<DeviceType::GPU, float>); PadOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
PadOp<DeviceType::GPU, half>); PadOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -65,25 +65,25 @@ void Pad(int iters, int batch, int height, ...@@ -65,25 +65,25 @@ void Pad(int iters, int batch, int height,
} }
} // namespace } // namespace
#define BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE) \ #define MACE_BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE) \
static void BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \ static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD); \ Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD); \
} \ } \
BENCHMARK(BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE) MACE_BENCHMARK(MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE)
#define BM_PAD(N, H, W, C, PAD) \ #define MACE_BM_PAD(N, H, W, C, PAD) \
BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \ MACE_BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \
BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \ MACE_BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \
BM_PAD_MACRO(N, H, W, C, PAD, half, GPU); MACE_BM_PAD_MACRO(N, H, W, C, PAD, half, GPU);
BM_PAD(1, 512, 512, 1, 2); MACE_BM_PAD(1, 512, 512, 1, 2);
BM_PAD(1, 112, 112, 64, 1); MACE_BM_PAD(1, 112, 112, 64, 1);
BM_PAD(1, 256, 256, 32, 2); MACE_BM_PAD(1, 256, 256, 32, 2);
BM_PAD(1, 512, 512, 16, 2); MACE_BM_PAD(1, 512, 512, 16, 2);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_Pooling(OperatorRegistry *op_registry) { void Register_Pooling(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
PoolingOp<DeviceType::CPU, float>); PoolingOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
PoolingOp<DeviceType::GPU, float>); PoolingOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
PoolingOp<DeviceType::GPU, half>); PoolingOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -52,8 +52,8 @@ class PoolingOp : public ConvPool2dOpBase<D, T> { ...@@ -52,8 +52,8 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
PoolingType pooling_type_; PoolingType pooling_type_;
kernels::PoolingFunctor<D, T> functor_; kernels::PoolingFunctor<D, T> functor_;
OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -87,29 +87,29 @@ void Pooling(int iters, ...@@ -87,29 +87,29 @@ void Pooling(int iters,
} }
} // namespace } // namespace
#define BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE) \ #define MACE_BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE) \
static void \ static void \
BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\ MACE_BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\
##DEVICE( \ ##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \ mace::testing::BytesProcessed(tot *(sizeof(float))); \
Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \ Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
PoolingType::PO); \ PoolingType::PO); \
} \ } \
BENCHMARK( \ MACE_BENCHMARK( \
BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\ MACE_BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\
##DEVICE) ##DEVICE)
#define BM_POOLING(N, C, H, W, K, S, PA, PO) \ #define MACE_BM_POOLING(N, C, H, W, K, S, PA, PO) \
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \ MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU); MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU);
BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX); MACE_BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX);
BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX); MACE_BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX);
BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX); MACE_BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX);
BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX); MACE_BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,11 +18,11 @@ namespace mace { ...@@ -18,11 +18,11 @@ namespace mace {
namespace ops { namespace ops {
void Register_Proposal(OperatorRegistry *op_registry) { void Register_Proposal(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ProposalOp<DeviceType::CPU, float>); ProposalOp<DeviceType::CPU, float>);
} }
} // namespace ops } // namespace ops
......
...@@ -49,8 +49,8 @@ class ProposalOp : public Operator<D, T> { ...@@ -49,8 +49,8 @@ class ProposalOp : public Operator<D, T> {
kernels::ProposalFunctor<D, T> functor_; kernels::ProposalFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO); MACE_OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO);
OP_OUTPUT_TAGS(ROIS); MACE_OP_OUTPUT_TAGS(ROIS);
}; };
} // namespace ops } // namespace ops
......
...@@ -18,11 +18,11 @@ namespace mace { ...@@ -18,11 +18,11 @@ namespace mace {
namespace ops { namespace ops {
void Register_PSROIAlign(OperatorRegistry *op_registry) { void Register_PSROIAlign(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
PSROIAlignOp<DeviceType::CPU, float>); PSROIAlignOp<DeviceType::CPU, float>);
} }
} // namespace ops } // namespace ops
......
...@@ -43,8 +43,8 @@ class PSROIAlignOp : public Operator<D, T> { ...@@ -43,8 +43,8 @@ class PSROIAlignOp : public Operator<D, T> {
kernels::PSROIAlignFunctor<D, T> functor_; kernels::PSROIAlignFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT, ROIS); MACE_OP_INPUT_TAGS(INPUT, ROIS);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -18,27 +18,27 @@ namespace mace { ...@@ -18,27 +18,27 @@ namespace mace {
namespace ops { namespace ops {
void Register_Quantize(OperatorRegistry *op_registry) { void Register_Quantize(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T") .TypeConstraint<uint8_t>("T")
.Build(), .Build(),
QuantizeOp<DeviceType::CPU, uint8_t>); QuantizeOp<DeviceType::CPU, uint8_t>);
} }
void Register_Dequantize(OperatorRegistry *op_registry) { void Register_Dequantize(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T") .TypeConstraint<uint8_t>("T")
.Build(), .Build(),
DequantizeOp<DeviceType::CPU, uint8_t>); DequantizeOp<DeviceType::CPU, uint8_t>);
} }
void Register_Requantize(OperatorRegistry *op_registry) { void Register_Requantize(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T") .TypeConstraint<uint8_t>("T")
.Build(), .Build(),
RequantizeOp<DeviceType::CPU, uint8_t>); RequantizeOp<DeviceType::CPU, uint8_t>);
} }
} // namespace ops } // namespace ops
......
...@@ -50,8 +50,8 @@ class QuantizeOp : public Operator<D, T> { ...@@ -50,8 +50,8 @@ class QuantizeOp : public Operator<D, T> {
kernels::QuantizeFunctor<D, T> functor_; kernels::QuantizeFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX); MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX);
OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
}; };
template<DeviceType D, class T> template<DeviceType D, class T>
...@@ -79,8 +79,8 @@ class DequantizeOp : public Operator<D, T> { ...@@ -79,8 +79,8 @@ class DequantizeOp : public Operator<D, T> {
kernels::DequantizeFunctor<D, T> functor_; kernels::DequantizeFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX); MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
template<DeviceType D, class T> template<DeviceType D, class T>
...@@ -131,8 +131,8 @@ class RequantizeOp : public Operator<D, T> { ...@@ -131,8 +131,8 @@ class RequantizeOp : public Operator<D, T> {
kernels::RequantizeFunctor<D, T> functor_; kernels::RequantizeFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX, RERANGE_MIN, RERANGE_MAX); MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX, RERANGE_MIN, RERANGE_MAX);
OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX); MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
}; };
} // namespace ops } // namespace ops
......
...@@ -18,11 +18,11 @@ namespace mace { ...@@ -18,11 +18,11 @@ namespace mace {
namespace ops { namespace ops {
void Register_Reshape(OperatorRegistry *op_registry) { void Register_Reshape(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ReshapeOp<DeviceType::CPU, float>); ReshapeOp<DeviceType::CPU, float>);
} }
} // namespace ops } // namespace ops
......
...@@ -69,8 +69,8 @@ class ReshapeOp : public Operator<D, T> { ...@@ -69,8 +69,8 @@ class ReshapeOp : public Operator<D, T> {
kernels::ReshapeFunctor<D, T> functor_; kernels::ReshapeFunctor<D, T> functor_;
private: private:
OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_ResizeBilinear(OperatorRegistry *op_registry) { void Register_ResizeBilinear(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ResizeBilinearOp<DeviceType::CPU, float>); ResizeBilinearOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ResizeBilinearOp<DeviceType::GPU, float>); ResizeBilinearOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
ResizeBilinearOp<DeviceType::GPU, half>); ResizeBilinearOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -82,34 +82,33 @@ void ResizeBilinearBenchmark(int iters, ...@@ -82,34 +82,33 @@ void ResizeBilinearBenchmark(int iters,
} }
} // namespace } // namespace
#define BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE) \ #define MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE) \
static void \ static void \
BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE( \ ##DEVICE( \
int iters) { \ int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \ const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
mace::testing::MaccProcessed(macc); \ mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \ ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \ } \
BENCHMARK( \ MACE_BENCHMARK( \
BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE) ##DEVICE)
#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \ #define MACE_BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \ MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \ MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU); MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480); MACE_BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480);
BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15); MACE_BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15);
BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30); MACE_BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30);
BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60); MACE_BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60);
BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480); MACE_BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480);
BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480); MACE_BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480);
BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016); MACE_BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_Slice(OperatorRegistry *op_registry) { void Register_Slice(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
SliceOp<DeviceType::CPU, float>); SliceOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
SliceOp<DeviceType::GPU, float>); SliceOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
SliceOp<DeviceType::GPU, half>); SliceOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -46,7 +46,7 @@ class SliceOp : public Operator<D, T> { ...@@ -46,7 +46,7 @@ class SliceOp : public Operator<D, T> {
kernels::SliceFunctor<D, T> functor_; kernels::SliceFunctor<D, T> functor_;
private: private:
OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -73,26 +73,28 @@ void BMSliceHelper(int iters, ...@@ -73,26 +73,28 @@ void BMSliceHelper(int iters,
} }
} // namespace } // namespace
#define BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE) \ #define MACE_BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE) \
static void \ static void \
BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE(int iters) { \ MACE_BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \ const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSliceHelper<DEVICE, TYPE>(iters, {N, H, W, C}, NO); \ BMSliceHelper<DEVICE, TYPE>(iters, {N, H, W, C}, NO); \
} \ } \
BENCHMARK(BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE) MACE_BENCHMARK( \
MACE_BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE)
#define BM_SLICE(N, H, W, C, NO) \ #define MACE_BM_SLICE(N, H, W, C, NO) \
BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \ MACE_BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \
BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \ MACE_BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \
BM_SLICE_MACRO(N, H, W, C, NO, half, GPU); MACE_BM_SLICE_MACRO(N, H, W, C, NO, half, GPU);
BM_SLICE(1, 32, 32, 32, 2); MACE_BM_SLICE(1, 32, 32, 32, 2);
BM_SLICE(1, 32, 32, 128, 2); MACE_BM_SLICE(1, 32, 32, 128, 2);
BM_SLICE(1, 32, 32, 256, 2); MACE_BM_SLICE(1, 32, 32, 256, 2);
BM_SLICE(1, 128, 128, 32, 2); MACE_BM_SLICE(1, 128, 128, 32, 2);
BM_SLICE(1, 128, 128, 128, 2); MACE_BM_SLICE(1, 128, 128, 128, 2);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_Softmax(OperatorRegistry *op_registry) { void Register_Softmax(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
SoftmaxOp<DeviceType::CPU, float>); SoftmaxOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
SoftmaxOp<DeviceType::GPU, float>); SoftmaxOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
SoftmaxOp<DeviceType::GPU, half>); SoftmaxOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -40,8 +40,8 @@ class SoftmaxOp : public Operator<D, T> { ...@@ -40,8 +40,8 @@ class SoftmaxOp : public Operator<D, T> {
kernels::SoftmaxFunctor<D, T> functor_; kernels::SoftmaxFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(LOGITS); MACE_OP_INPUT_TAGS(LOGITS);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -70,26 +70,26 @@ void SoftmaxBenchmark( ...@@ -70,26 +70,26 @@ void SoftmaxBenchmark(
} }
} // namespace } // namespace
#define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \ #define MACE_BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \ SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
BENCHMARK(BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) MACE_BENCHMARK(MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_SOFTMAX(N, C, H, W) \ #define MACE_BM_SOFTMAX(N, C, H, W) \
BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \ MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \
BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \ MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \
BM_SOFTMAX_MACRO(N, C, H, W, half, GPU); MACE_BM_SOFTMAX_MACRO(N, C, H, W, half, GPU);
BM_SOFTMAX(1, 2, 512, 512); MACE_BM_SOFTMAX(1, 2, 512, 512);
BM_SOFTMAX(1, 3, 512, 512); MACE_BM_SOFTMAX(1, 3, 512, 512);
BM_SOFTMAX(1, 4, 512, 512); MACE_BM_SOFTMAX(1, 4, 512, 512);
BM_SOFTMAX(1, 10, 256, 256); MACE_BM_SOFTMAX(1, 10, 256, 256);
BM_SOFTMAX(1, 1024, 7, 7); MACE_BM_SOFTMAX(1, 1024, 7, 7);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,23 +18,23 @@ namespace mace { ...@@ -18,23 +18,23 @@ namespace mace {
namespace ops { namespace ops {
void Register_SpaceToBatchND(OperatorRegistry *op_registry) { void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
SpaceToBatchNDOp<DeviceType::CPU, float>); SpaceToBatchNDOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
SpaceToBatchNDOp<DeviceType::GPU, float>); SpaceToBatchNDOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
SpaceToBatchNDOp<DeviceType::GPU, half>); SpaceToBatchNDOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -45,8 +45,8 @@ class SpaceToBatchNDOp : public Operator<D, T> { ...@@ -45,8 +45,8 @@ class SpaceToBatchNDOp : public Operator<D, T> {
kernels::SpaceToBatchFunctor<D, T> functor_; kernels::SpaceToBatchFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -64,27 +64,27 @@ void BMSpaceToBatch( ...@@ -64,27 +64,27 @@ void BMSpaceToBatch(
} }
} // namespace } // namespace
#define BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE) \ #define MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE) \
static void \ static void \
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \ MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \ BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \
} \ } \
BENCHMARK( \ MACE_BENCHMARK( \
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE) MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \ #define MACE_BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \ MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU); MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
BM_SPACE_TO_BATCH(128, 16, 16, 128, 2); MACE_BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
BM_SPACE_TO_BATCH(1, 256, 256, 32, 2); MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
BM_SPACE_TO_BATCH(1, 256, 256, 16, 2); MACE_BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
BM_SPACE_TO_BATCH(1, 256, 256, 32, 4); MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
BM_SPACE_TO_BATCH(1, 256, 256, 32, 8); MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,24 +18,24 @@ namespace mace { ...@@ -18,24 +18,24 @@ namespace mace {
namespace ops { namespace ops {
void Register_SpaceToDepth(OperatorRegistry *op_registry) { void Register_SpaceToDepth(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
SpaceToDepthOp<DeviceType::CPU, float>); SpaceToDepthOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
SpaceToDepthOp<DeviceType::GPU, float>); SpaceToDepthOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
SpaceToDepthOp<DeviceType::GPU, half>); SpaceToDepthOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -62,8 +62,8 @@ class SpaceToDepthOp : public Operator<D, T> { ...@@ -62,8 +62,8 @@ class SpaceToDepthOp : public Operator<D, T> {
} }
protected: protected:
OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
private: private:
kernels::DepthToSpaceOpFunctor<D, T> functor_; kernels::DepthToSpaceOpFunctor<D, T> functor_;
......
...@@ -69,25 +69,26 @@ void SpaceToDepth( ...@@ -69,25 +69,26 @@ void SpaceToDepth(
} }
} // namespace } // namespace
#define BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE) \ #define MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void \ static void \
BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G); \ SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \ } \
BENCHMARK(BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE) MACE_BENCHMARK( \
MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
#define BM_SPACE_TO_DEPTH(N, C, H, W, G) \ #define MACE_BM_SPACE_TO_DEPTH(N, C, H, W, G) \
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU); \ MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU); \
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \ MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU); MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);
BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4); MACE_BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4); MACE_BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4); MACE_BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -18,11 +18,11 @@ namespace mace { ...@@ -18,11 +18,11 @@ namespace mace {
namespace ops { namespace ops {
void Register_Transpose(OperatorRegistry *op_registry) { void Register_Transpose(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
TransposeOp<DeviceType::CPU, float>); TransposeOp<DeviceType::CPU, float>);
} }
} // namespace ops } // namespace ops
......
...@@ -50,8 +50,8 @@ class TransposeOp : public Operator<D, T> { ...@@ -50,8 +50,8 @@ class TransposeOp : public Operator<D, T> {
std::vector<int> dims_; std::vector<int> dims_;
kernels::TransposeFunctor<D, T> functor_; kernels::TransposeFunctor<D, T> functor_;
OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace mace } // namespace mace
......
...@@ -55,41 +55,41 @@ void TransposeBenchmark(int iters, ...@@ -55,41 +55,41 @@ void TransposeBenchmark(int iters,
} }
} // namespace } // namespace
#define BM_TRANSPOSE2D_MACRO(H, W, TYPE, DEVICE) \ #define MACE_BM_TRANSPOSE2D_MACRO(H, W, TYPE, DEVICE) \
static void BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \ static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {H, W}, {1, 0}); \ TransposeBenchmark<DEVICE, TYPE>(iters, {H, W}, {1, 0}); \
} \ } \
BENCHMARK(BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE) MACE_BENCHMARK(MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE)
#define BM_TRANSPOSE2D(H, W) \ #define MACE_BM_TRANSPOSE2D(H, W) \
BM_TRANSPOSE2D_MACRO(H, W, float, CPU); MACE_BM_TRANSPOSE2D_MACRO(H, W, float, CPU);
#define BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, TYPE, DEVICE) \ #define MACE_BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, TYPE, DEVICE) \
static void \ static void \
BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##DEVICE( \ MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
int iters) { \ DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3}); \ TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3}); \
} \ } \
BENCHMARK( \ MACE_BENCHMARK( \
BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##DEVICE) MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
DEVICE)
#define BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \ #define MACE_BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \
BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU); MACE_BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU);
BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2); MACE_BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2);
BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1); MACE_BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1);
BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2); MACE_BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2);
BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1); MACE_BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1);
BM_TRANSPOSE2D(128, 128); MACE_BM_TRANSPOSE2D(128, 128);
BM_TRANSPOSE2D(512, 512); MACE_BM_TRANSPOSE2D(512, 512);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -19,17 +19,17 @@ namespace ops { ...@@ -19,17 +19,17 @@ namespace ops {
void Register_WinogradInverseTransform(OperatorRegistry *op_registry) { void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
WinogradInverseTransformOp<DeviceType::GPU, float>); WinogradInverseTransformOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
WinogradInverseTransformOp<DeviceType::GPU, half>); WinogradInverseTransformOp<DeviceType::GPU, half>);
#else #else
MACE_UNUSED(op_registry); MACE_UNUSED(op_registry);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -49,8 +49,8 @@ class WinogradInverseTransformOp : public Operator<D, T> { ...@@ -49,8 +49,8 @@ class WinogradInverseTransformOp : public Operator<D, T> {
kernels::WinogradInverseTransformFunctor<D, T> functor_; kernels::WinogradInverseTransformFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT, BIAS); MACE_OP_INPUT_TAGS(INPUT, BIAS);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -19,17 +19,17 @@ namespace ops { ...@@ -19,17 +19,17 @@ namespace ops {
void Register_WinogradTransform(OperatorRegistry *op_registry) { void Register_WinogradTransform(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
WinogradTransformOp<DeviceType::GPU, float>); WinogradTransformOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
WinogradTransformOp<DeviceType::GPU, half>); WinogradTransformOp<DeviceType::GPU, half>);
#else #else
MACE_UNUSED(op_registry); MACE_UNUSED(op_registry);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -43,8 +43,8 @@ class WinogradTransformOp : public Operator<D, T> { ...@@ -43,8 +43,8 @@ class WinogradTransformOp : public Operator<D, T> {
kernels::WinogradTransformFunctor<D, T> functor_; kernels::WinogradTransformFunctor<D, T> functor_;
protected: protected:
OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT); MACE_OP_OUTPUT_TAGS(OUTPUT);
}; };
} // namespace ops } // namespace ops
......
...@@ -51,22 +51,24 @@ void BMWinogradTransform( ...@@ -51,22 +51,24 @@ void BMWinogradTransform(
} }
} // namespace } // namespace
#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ #define MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ static void \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \ BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \ } \
BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) MACE_BENCHMARK( \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \ #define MACE_BM_WINOGRAD_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU); MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
BM_WINOGRAD_TRANSFORM(1, 16, 16, 128); MACE_BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
BM_WINOGRAD_TRANSFORM(1, 64, 64, 128); MACE_BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
BM_WINOGRAD_TRANSFORM(1, 128, 128, 128); MACE_BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
...@@ -103,24 +105,24 @@ void BMWinogradInverseTransform( ...@@ -103,24 +105,24 @@ void BMWinogradInverseTransform(
} }
} // namespace } // namespace
#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ #define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \ static void \
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \ BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \ } \
BENCHMARK( \ MACE_BENCHMARK( \
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \ #define MACE_BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU); MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32); MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32); MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32); MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
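For readers tracing the rename, the sketch below shows roughly what a single instantiation of the renamed softmax benchmark macro expands to. It is an illustrative expansion derived from the macro body shown in the diff above, not part of the change itself; MACE_BENCHMARK (renamed from BENCHMARK) is the registration macro defined elsewhere in the tree, and SoftmaxBenchmark is the helper from the same benchmark file.

// Illustrative expansion of MACE_BM_SOFTMAX_MACRO(1, 2, 512, 512, float, CPU)
// after the rename (a sketch; assumes the DeviceType::CPU enumerator is in
// scope as CPU, as in the benchmark sources above):
static void MACE_BM_SOFTMAX_1_2_512_512_float_CPU(int iters) {
  const int64_t tot = static_cast<int64_t>(iters) * 1 * 2 * 512 * 512;
  mace::testing::MaccProcessed(tot);
  mace::testing::BytesProcessed(tot * (sizeof(float)));
  SoftmaxBenchmark<CPU, float>(iters, 1, 2, 512, 512);
}
MACE_BENCHMARK(MACE_BM_SOFTMAX_1_2_512_512_float_CPU)

As the old and new columns show, the only change throughout this merge request is the MACE_ prefix on the generated symbols and on the registration macros (plus the line re-wrapping that the longer names require); the benchmark and operator-registration bodies are untouched.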