提交 ccaec70c 编写于 作者: Liangliang He

Merge branch 'add_MACE_prefix_for_macros' into 'master'

add MACE_ prefix for macros

See merge request !517
......@@ -95,5 +95,4 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false)
MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
#undef MACE_GET_REPEATED_ARGUMENT_FUNC
} // namespace mace
......@@ -124,20 +124,19 @@ class Operator : public OperatorBase {
~Operator() noexcept override {}
};
// OP_INPUT_TAGS and OP_OUTPUT_TAGS are optional features to name the indices of
// the
// operator's inputs and outputs, in order to avoid confusion. For example, for
// a fully convolution layer that has input, weight and bias, you can define its
// input tags as:
// OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
// MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the
// indices of the operator's inputs and outputs, in order to avoid confusion.
// For example, for a fully convolution layer that has input, weight and bias,
// you can define its input tags as:
// MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
// And in the code, instead of doing
// auto& weight = Input(1);
// you can now do
// auto& weight = Input(WEIGHT);
// to make it more clear.
#define OP_INPUT_TAGS(first_input, ...) \
#define MACE_OP_INPUT_TAGS(first_input, ...) \
enum _InputTags { first_input = 0, __VA_ARGS__ }
#define OP_OUTPUT_TAGS(first_input, ...) \
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ }
class OpKeyBuilder {
......@@ -186,7 +185,7 @@ MACE_DECLARE_REGISTRY(OpRegistry,
const OperatorDef &,
Workspace *);
#define REGISTER_OPERATOR(op_registry, name, ...) \
#define MACE_REGISTER_OPERATOR(op_registry, name, ...) \
MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)
} // namespace mace
......
......@@ -32,7 +32,7 @@ inline int64_t NowMicros() {
namespace mace {
#define MAX_NODE 2048
#define MACE_MAX_NODE 2048
enum {
NN_GRAPH_PERFEVENT_CYCLES = 0,
......@@ -229,13 +229,13 @@ bool HexagonControlWrapper::TeardownGraph() {
return hexagon_nn_teardown(nn_id_) == 0;
}
#define PRINT_BUFSIZE (2 * 1024 * 1024)
#define MACE_PRINT_BUFSIZE (2 * 1024 * 1024)
void HexagonControlWrapper::PrintLog() {
char *buf;
if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char *>(buf),
PRINT_BUFSIZE) == 0,
MACE_PRINT_BUFSIZE) == 0,
"print log error");
LOG(INFO) << std::string(buf);
delete[] buf;
......@@ -244,9 +244,9 @@ void HexagonControlWrapper::PrintLog() {
void HexagonControlWrapper::PrintGraph() {
LOG(INFO) << "Print Graph";
char *buf;
if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
MACE_CHECK(hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char *>(buf),
PRINT_BUFSIZE) == 0,
MACE_PRINT_BUFSIZE) == 0,
"print graph error");
LOG(INFO) << std::string(buf);
delete[] buf;
......@@ -265,9 +265,9 @@ void HexagonControlWrapper::SetGraphMode(int mode) {
void HexagonControlWrapper::GetPerfInfo() {
LOG(INFO) << "Get perf info";
std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
std::vector<hexagon_nn_perfinfo> perf_info(MACE_MAX_NODE);
unsigned int n_items = 0;
MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE,
MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MACE_MAX_NODE,
&n_items) == 0,
"get perf info error");
......@@ -284,8 +284,8 @@ void HexagonControlWrapper::GetPerfInfo() {
perf_info[i].counter_lo) *
1.0f / perf_info[i].executions;
char node_type_buf[MAX_NODE];
hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE);
char node_type_buf[MACE_MAX_NODE];
hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MACE_MAX_NODE);
std::string node_type(node_type_buf);
LOG(INFO) << "node id: " << perf_info[i].node_id
<< ", node type: " << node_type
......
......@@ -22,7 +22,7 @@
namespace mace {
#define OP_INVALID -1
#define MACE_OP_INVALID -1
typedef enum op_type_enum {
#define DEF_OP(NAME, ...) OP_##NAME,
......@@ -48,7 +48,7 @@ class OpMap {
return op_map_[op_type];
} else {
LOG(ERROR) << "DSP unsupoorted op type: " << op_type;
return OP_INVALID;
return MACE_OP_INVALID;
}
}
......
......@@ -30,9 +30,9 @@
#ifdef MACE_ENABLE_NEON
// Avoid over-bound accessing memory
#define EXTRA_BUFFER_PAD_SIZE 64
#define MACE_EXTRA_BUFFER_PAD_SIZE 64
#else
#define EXTRA_BUFFER_PAD_SIZE 0
#define MACE_EXTRA_BUFFER_PAD_SIZE 0
#endif
namespace mace {
......@@ -210,16 +210,16 @@ class Tensor {
image_shape_.clear();
if (buffer_ != nullptr) {
MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to "
<< raw_size() + EXTRA_BUFFER_PAD_SIZE;
return buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE);
<< raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
}
return MaceStatus::MACE_SUCCESS;
} else {
MACE_CHECK(is_buffer_owner_);
buffer_ = new Buffer(allocator_);
return buffer_->Allocate(raw_size() + EXTRA_BUFFER_PAD_SIZE);
return buffer_->Allocate(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
}
}
......
......@@ -21,8 +21,8 @@
#include <vector>
#define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
#define BENCHMARK(n) \
static ::mace::testing::Benchmark *MACE_BENCHMARK_CONCAT( \
#define MACE_BENCHMARK(n) \
static ::mace::testing::Benchmark *MACE_BENCHMARK_CONCAT( \
__benchmark_, n, __LINE__) = (new ::mace::testing::Benchmark(#n, (n)))
namespace mace {
......
......@@ -169,7 +169,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(device_type)));
MaceStatus status = tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype) + EXTRA_BUFFER_PAD_SIZE);
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE);
if (status != MaceStatus::MACE_SUCCESS) {
return status;
}
......
......@@ -21,7 +21,7 @@
namespace mace {
namespace kernels {
#define Conv2dNeonK5x5SnLoadCalc4 \
#define MACE_Conv2dNeonK5x5SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf10, vf20, vf30; \
float32x2_t vf01, vf11, vf21, vf31; \
......@@ -62,7 +62,7 @@ namespace kernels {
vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
vo3 = vmlaq_lane_f32(vo3, vi4, vf31, 1);
#define Conv2dNeonK5x5SnLoadCalc1 \
#define MACE_Conv2dNeonK5x5SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */ \
float32x4_t vf00; \
float32x2_t vf01; \
......@@ -138,7 +138,7 @@ void Conv2dNeonK5x5S1(const float *input,
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
Conv2dNeonK5x5SnLoadCalc4;
MACE_Conv2dNeonK5x5SnLoadCalc4;
in_offset += in_width;
filter_ptr0 += 5;
......@@ -194,7 +194,7 @@ void Conv2dNeonK5x5S1(const float *input,
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
Conv2dNeonK5x5SnLoadCalc1;
MACE_Conv2dNeonK5x5SnLoadCalc1;
in_offset += in_width;
filter_ptr0 += 5;
......
......@@ -21,7 +21,7 @@
namespace mace {
namespace kernels {
#define Conv2dArmv8NeonK7x7SnLoadCalc4 \
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
float32x4_t vf10, vf11; \
......@@ -72,7 +72,7 @@ namespace kernels {
vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); \
vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
#define Conv2dArmv8NeonK7x7SnLoadCalc1 \
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
vf00 = vld1q_f32(filter_ptr0); \
......@@ -87,7 +87,7 @@ namespace kernels {
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); \
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
#define Conv2dArmv7NeonK7x7SnLoadCalc4 \
#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
float32x4_t vf10, vf11; \
......@@ -138,7 +138,7 @@ namespace kernels {
vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); \
vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
#define Conv2dArmv7NeonK7x7SnLoadCalc1 \
#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc1 \
/* load filter (1 outch x 1 height x 4 width) */ \
float32x4_t vf00, vf01; \
vf00 = vld1q_f32(filter_ptr0); \
......@@ -220,9 +220,9 @@ void Conv2dNeonK7x7S1(const float *input,
vi6 = vextq_f32(vi4, vi8, 2);
#if defined(__aarch64__)
Conv2dArmv8NeonK7x7SnLoadCalc4;
MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
#else
Conv2dArmv7NeonK7x7SnLoadCalc4;
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif
in_offset += in_width;
......@@ -284,9 +284,9 @@ void Conv2dNeonK7x7S1(const float *input,
vi6 = vextq_f32(vi4, vi8, 2);
#if defined(__aarch64__)
Conv2dArmv8NeonK7x7SnLoadCalc1;
MACE_Conv2dArmv8NeonK7x7SnLoadCalc1;
#else
Conv2dArmv7NeonK7x7SnLoadCalc1;
MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
#endif
in_offset += in_width;
......@@ -381,9 +381,9 @@ void Conv2dNeonK7x7S2(const float *input,
vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12]
#if defined(__aarch64__)
Conv2dArmv8NeonK7x7SnLoadCalc4;
MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
#else
Conv2dArmv7NeonK7x7SnLoadCalc4;
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif
in_offset += in_width;
......@@ -450,9 +450,9 @@ void Conv2dNeonK7x7S2(const float *input,
vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12]
#if defined(__aarch64__)
Conv2dArmv8NeonK7x7SnLoadCalc1;
MACE_Conv2dArmv8NeonK7x7SnLoadCalc1;
#else
Conv2dArmv7NeonK7x7SnLoadCalc1;
MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
#endif
in_offset += in_width;
......@@ -547,9 +547,9 @@ void Conv2dNeonK7x7S3(const float *input,
vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15]
#if defined(__aarch64__)
Conv2dArmv8NeonK7x7SnLoadCalc4;
MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
#else
Conv2dArmv7NeonK7x7SnLoadCalc4;
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif
in_offset += in_width;
......@@ -616,9 +616,9 @@ void Conv2dNeonK7x7S3(const float *input,
vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15]
#if defined(__aarch64__)
Conv2dArmv8NeonK7x7SnLoadCalc1;
MACE_Conv2dArmv8NeonK7x7SnLoadCalc1;
#else
Conv2dArmv7NeonK7x7SnLoadCalc1;
MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
#endif
in_offset += in_width;
......
......@@ -465,7 +465,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
padded_input_size =
batch * input_channels * (input_height + pad_top + pad_bottom)
* (input_width + pad_left + pad_right) * sizeof(float) +
EXTRA_BUFFER_PAD_SIZE;
MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
}
if (extra_output_height != height || extra_output_width != width) {
......
......@@ -314,7 +314,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
// Skip the padded top rows
if (padding_same_value) {
#define COPY_INPUT \
#define MACE_COPY_INPUT \
std::fill(output_data, output_data + padded_left, input[0]); \
output_data += padded_left; \
memcpy(output_data, input, width * sizeof(float)); \
......@@ -328,20 +328,20 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
for (int k = 0; k < padded_top; ++k) {
COPY_INPUT;
MACE_COPY_INPUT;
}
for (int k = 0; k < height; ++k) {
COPY_INPUT;
MACE_COPY_INPUT;
input += width;
}
input -= width;
for (int k = 0; k < padded_bottom; ++k) {
COPY_INPUT;
MACE_COPY_INPUT;
}
input += width;
}
}
#undef COPY_INPUT
#undef MACE_COPY_INPUT
} else {
output_data += padded_top * output_width;
for (int i = 0; i < batch; ++i) {
......
......@@ -43,9 +43,10 @@ void Deconv2dOpencl(cl::Kernel *kernel,
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
MACE_CHECK(stride > 0, "stride should > 0.");
#define WIDTH_BLK 5
#define MACE_WIDTH_BLK 5
const index_t n_strides = (width + stride - 1) / stride;
const index_t width_blocks = ((n_strides + WIDTH_BLK -1)/ WIDTH_BLK) * stride;
const index_t width_blocks =
((n_strides + MACE_WIDTH_BLK -1)/ MACE_WIDTH_BLK) * stride;
const float stride_r = 1.f / static_cast<float>(stride);
const int padding_h = (paddings[0]+1) >> 1;
const int padding_w = (paddings[0]+1) >> 1;
......
......@@ -125,7 +125,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
class OutOfRangeCheckTest : public ::testing::Test {
protected:
virtual void SetUp() {
setenv("MACE_OUT_OF_RANGE_CHECK", "1", 1);
setenv("OUT_OF_RANGE_CHECK", "1", 1);
}
};
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_Activation(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ActivationOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ActivationOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ActivationOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ActivationOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ActivationOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ActivationOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -71,25 +71,26 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
}
} // namespace
#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
#define MACE_BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
MACE_BENCHMARK(MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_RELU(N, C, H, W) \
BM_RELU_MACRO(N, C, H, W, float, CPU); \
BM_RELU_MACRO(N, C, H, W, float, GPU); \
BM_RELU_MACRO(N, C, H, W, half, GPU);
#define MACE_BM_RELU(N, C, H, W) \
MACE_BM_RELU_MACRO(N, C, H, W, float, CPU); \
MACE_BM_RELU_MACRO(N, C, H, W, float, GPU); \
MACE_BM_RELU_MACRO(N, C, H, W, half, GPU);
BM_RELU(1, 1, 512, 512);
BM_RELU(1, 3, 128, 128);
BM_RELU(1, 3, 512, 512);
BM_RELU(1, 32, 112, 112);
BM_RELU(1, 64, 256, 256);
MACE_BM_RELU(1, 1, 512, 512);
MACE_BM_RELU(1, 3, 128, 128);
MACE_BM_RELU(1, 3, 512, 512);
MACE_BM_RELU(1, 32, 112, 112);
MACE_BM_RELU(1, 64, 256, 256);
namespace {
template <DeviceType D, typename T>
......@@ -138,25 +139,26 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
}
} // namespace
#define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
#define MACE_BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
MACE_BENCHMARK(MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_RELUX(N, C, H, W) \
BM_RELUX_MACRO(N, C, H, W, float, CPU); \
BM_RELUX_MACRO(N, C, H, W, float, GPU); \
BM_RELUX_MACRO(N, C, H, W, half, GPU);
#define MACE_BM_RELUX(N, C, H, W) \
MACE_BM_RELUX_MACRO(N, C, H, W, float, CPU); \
MACE_BM_RELUX_MACRO(N, C, H, W, float, GPU); \
MACE_BM_RELUX_MACRO(N, C, H, W, half, GPU);
BM_RELUX(1, 1, 512, 512);
BM_RELUX(1, 3, 128, 128);
BM_RELUX(1, 3, 512, 512);
BM_RELUX(1, 32, 112, 112);
BM_RELUX(1, 64, 256, 256);
MACE_BM_RELUX(1, 1, 512, 512);
MACE_BM_RELUX(1, 3, 128, 128);
MACE_BM_RELUX(1, 3, 512, 512);
MACE_BM_RELUX(1, 32, 112, 112);
MACE_BM_RELUX(1, 64, 256, 256);
namespace {
template <DeviceType D, typename T>
......@@ -212,25 +214,26 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
}
} // namespace
#define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
#define MACE_BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
MACE_BENCHMARK(MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_PRELU(N, C, H, W) \
BM_PRELU_MACRO(N, C, H, W, float, CPU); \
BM_PRELU_MACRO(N, C, H, W, float, GPU); \
BM_PRELU_MACRO(N, C, H, W, half, GPU);
#define MACE_BM_PRELU(N, C, H, W) \
MACE_BM_PRELU_MACRO(N, C, H, W, float, CPU); \
MACE_BM_PRELU_MACRO(N, C, H, W, float, GPU); \
MACE_BM_PRELU_MACRO(N, C, H, W, half, GPU);
BM_PRELU(1, 1, 512, 512);
BM_PRELU(1, 3, 128, 128);
BM_PRELU(1, 3, 512, 512);
BM_PRELU(1, 32, 112, 112);
BM_PRELU(1, 64, 256, 256);
MACE_BM_PRELU(1, 1, 512, 512);
MACE_BM_PRELU(1, 3, 128, 128);
MACE_BM_PRELU(1, 3, 512, 512);
MACE_BM_PRELU(1, 32, 112, 112);
MACE_BM_PRELU(1, 64, 256, 256);
namespace {
template <DeviceType D, typename T>
......@@ -277,25 +280,26 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
}
} // namespace
#define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
#define MACE_BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
MACE_BENCHMARK(MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_TANH(N, C, H, W) \
BM_TANH_MACRO(N, C, H, W, float, CPU); \
BM_TANH_MACRO(N, C, H, W, float, GPU); \
BM_TANH_MACRO(N, C, H, W, half, GPU);
#define MACE_BM_TANH(N, C, H, W) \
MACE_BM_TANH_MACRO(N, C, H, W, float, CPU); \
MACE_BM_TANH_MACRO(N, C, H, W, float, GPU); \
MACE_BM_TANH_MACRO(N, C, H, W, half, GPU);
BM_TANH(1, 1, 512, 512);
BM_TANH(1, 3, 128, 128);
BM_TANH(1, 3, 512, 512);
BM_TANH(1, 32, 112, 112);
BM_TANH(1, 64, 256, 256);
MACE_BM_TANH(1, 1, 512, 512);
MACE_BM_TANH(1, 3, 128, 128);
MACE_BM_TANH(1, 3, 512, 512);
MACE_BM_TANH(1, 32, 112, 112);
MACE_BM_TANH(1, 64, 256, 256);
namespace {
template <DeviceType D, typename T>
......@@ -343,26 +347,26 @@ void SigmoidBenchmark(
}
} // namespace
#define BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_SIGMOID(N, C, H, W) \
BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \
BM_SIGMOID_MACRO(N, C, H, W, half, GPU);
BM_SIGMOID(1, 1, 512, 512);
BM_SIGMOID(1, 3, 128, 128);
BM_SIGMOID(1, 3, 512, 512);
BM_SIGMOID(1, 32, 112, 112);
BM_SIGMOID(1, 64, 256, 256);
#define MACE_BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
MACE_BENCHMARK(MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define MACE_BM_SIGMOID(N, C, H, W) \
MACE_BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
MACE_BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \
MACE_BM_SIGMOID_MACRO(N, C, H, W, half, GPU);
MACE_BM_SIGMOID(1, 1, 512, 512);
MACE_BM_SIGMOID(1, 3, 128, 128);
MACE_BM_SIGMOID(1, 3, 512, 512);
MACE_BM_SIGMOID(1, 32, 112, 112);
MACE_BM_SIGMOID(1, 64, 256, 256);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_AddN(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -70,26 +70,28 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
}
} // namespace
#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
#define MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void \
MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
} \
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
MACE_BENCHMARK( \
MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ADDN(INPUTS, N, H, W, C) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU);
#define MACE_BM_ADDN(INPUTS, N, H, W, C) \
MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \
MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU);
BM_ADDN(2, 1, 256, 256, 32);
BM_ADDN(2, 1, 128, 128, 32);
BM_ADDN(4, 1, 128, 128, 3);
BM_ADDN(2, 1, 256, 256, 3);
BM_ADDN(2, 1, 512, 512, 3);
MACE_BM_ADDN(2, 1, 256, 256, 32);
MACE_BM_ADDN(2, 1, 128, 128, 32);
MACE_BM_ADDN(4, 1, 128, 128, 3);
MACE_BM_ADDN(2, 1, 256, 256, 3);
MACE_BM_ADDN(2, 1, 512, 512, 3);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_BatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BatchNormOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BatchNormOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -60,8 +60,8 @@ class BatchNormOp : public Operator<D, T> {
kernels::BatchNormFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -95,34 +95,34 @@ void BatchNorm(
}
} // namespace
#define BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define MACE_BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
MACE_BENCHMARK(MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_BATCH_NORM(N, C, H, W) \
BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU);
#define MACE_BM_BATCH_NORM(N, C, H, W) \
MACE_BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
MACE_BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \
MACE_BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU);
BM_BATCH_NORM(1, 1, 512, 512);
BM_BATCH_NORM(1, 3, 128, 128);
BM_BATCH_NORM(1, 3, 512, 512);
BM_BATCH_NORM(1, 32, 112, 112);
BM_BATCH_NORM(1, 64, 256, 256);
BM_BATCH_NORM(1, 64, 512, 512);
BM_BATCH_NORM(1, 128, 56, 56);
BM_BATCH_NORM(1, 128, 256, 256);
BM_BATCH_NORM(1, 256, 14, 14);
BM_BATCH_NORM(1, 512, 14, 14);
BM_BATCH_NORM(1, 1024, 7, 7);
BM_BATCH_NORM(32, 1, 256, 256);
BM_BATCH_NORM(32, 3, 256, 256);
MACE_BM_BATCH_NORM(1, 1, 512, 512);
MACE_BM_BATCH_NORM(1, 3, 128, 128);
MACE_BM_BATCH_NORM(1, 3, 512, 512);
MACE_BM_BATCH_NORM(1, 32, 112, 112);
MACE_BM_BATCH_NORM(1, 64, 256, 256);
MACE_BM_BATCH_NORM(1, 64, 512, 512);
MACE_BM_BATCH_NORM(1, 128, 56, 56);
MACE_BM_BATCH_NORM(1, 128, 256, 256);
MACE_BM_BATCH_NORM(1, 256, 14, 14);
MACE_BM_BATCH_NORM(1, 512, 14, 14);
MACE_BM_BATCH_NORM(1, 1024, 7, 7);
MACE_BM_BATCH_NORM(32, 1, 256, 256);
MACE_BM_BATCH_NORM(32, 3, 256, 256);
} // namespace test
} // namespace ops
......
......@@ -18,22 +18,22 @@ namespace mace {
namespace ops {
void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -44,8 +44,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
kernels::SpaceToBatchFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -64,25 +64,26 @@ void BMBatchToSpace(
}
} // namespace
#define BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE) \
static void \
BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
} \
BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
#define MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE) \
static void \
MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
} \
MACE_BENCHMARK( \
MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU);
#define MACE_BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \
MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU);
BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
BM_BATCH_TO_SPACE(64, 32, 32, 32, 8);
MACE_BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
MACE_BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
MACE_BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
MACE_BM_BATCH_TO_SPACE(64, 32, 32, 32, 8);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_BiasAdd(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
BiasAddOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
BiasAddOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BiasAddOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BiasAddOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BiasAddOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BiasAddOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -46,8 +46,8 @@ class BiasAddOp : public Operator<D, T> {
kernels::BiasAddFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, BIAS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -72,34 +72,34 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
}
} // namespace
#define BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define MACE_BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
} \
MACE_BENCHMARK(MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_BIAS_ADD(N, C, H, W) \
BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);
#define MACE_BM_BIAS_ADD(N, C, H, W) \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);
BM_BIAS_ADD(1, 1, 512, 512);
BM_BIAS_ADD(1, 3, 128, 128);
BM_BIAS_ADD(1, 3, 512, 512);
BM_BIAS_ADD(1, 32, 112, 112);
BM_BIAS_ADD(1, 64, 256, 256);
BM_BIAS_ADD(1, 64, 512, 512);
BM_BIAS_ADD(1, 128, 56, 56);
BM_BIAS_ADD(1, 128, 256, 256);
BM_BIAS_ADD(1, 256, 14, 14);
BM_BIAS_ADD(1, 512, 14, 14);
BM_BIAS_ADD(1, 1024, 7, 7);
BM_BIAS_ADD(32, 1, 256, 256);
BM_BIAS_ADD(32, 3, 256, 256);
MACE_BM_BIAS_ADD(1, 1, 512, 512);
MACE_BM_BIAS_ADD(1, 3, 128, 128);
MACE_BM_BIAS_ADD(1, 3, 512, 512);
MACE_BM_BIAS_ADD(1, 32, 112, 112);
MACE_BM_BIAS_ADD(1, 64, 256, 256);
MACE_BM_BIAS_ADD(1, 64, 512, 512);
MACE_BM_BIAS_ADD(1, 128, 56, 56);
MACE_BM_BIAS_ADD(1, 128, 256, 256);
MACE_BM_BIAS_ADD(1, 256, 14, 14);
MACE_BM_BIAS_ADD(1, 512, 14, 14);
MACE_BM_BIAS_ADD(1, 1024, 7, 7);
MACE_BM_BIAS_ADD(32, 1, 256, 256);
MACE_BM_BIAS_ADD(32, 3, 256, 256);
} // namespace test
} // namespace ops
......
......@@ -18,17 +18,17 @@ namespace mace {
namespace ops {
void Register_BufferToImage(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BufferToImageOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
BufferToImageOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BufferToImageOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
BufferToImageOp<DeviceType::GPU, half>);
}
} // namespace ops
......
......@@ -42,8 +42,8 @@ class BufferToImageOp : public Operator<D, T> {
kernels::BufferToImageFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -54,36 +54,36 @@ void FilterBufferToImage(int iters,
}
} // namespace
#define BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \
static void BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \
#define MACE_BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \
static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W); \
} \
BENCHMARK(BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE)
MACE_BENCHMARK(MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_B2I(O, I, H, W) \
BM_B2I_MACRO(O, I, H, W, float, GPU); \
BM_B2I_MACRO(O, I, H, W, half, GPU);
#define MACE_BM_B2I(O, I, H, W) \
MACE_BM_B2I_MACRO(O, I, H, W, float, GPU); \
MACE_BM_B2I_MACRO(O, I, H, W, half, GPU);
BM_B2I(5, 3, 3, 3);
BM_B2I(5, 3, 7, 7);
BM_B2I(32, 16, 1, 1);
BM_B2I(32, 16, 3, 3);
BM_B2I(32, 16, 5, 5);
BM_B2I(32, 16, 7, 7);
BM_B2I(64, 32, 1, 1);
BM_B2I(64, 32, 3, 3);
BM_B2I(64, 32, 5, 5);
BM_B2I(64, 32, 7, 7);
BM_B2I(128, 64, 1, 1);
BM_B2I(128, 64, 3, 3);
BM_B2I(128, 32, 1, 1);
BM_B2I(128, 32, 3, 3);
BM_B2I(256, 32, 1, 1);
BM_B2I(256, 32, 3, 3);
MACE_BM_B2I(5, 3, 3, 3);
MACE_BM_B2I(5, 3, 7, 7);
MACE_BM_B2I(32, 16, 1, 1);
MACE_BM_B2I(32, 16, 3, 3);
MACE_BM_B2I(32, 16, 5, 5);
MACE_BM_B2I(32, 16, 7, 7);
MACE_BM_B2I(64, 32, 1, 1);
MACE_BM_B2I(64, 32, 3, 3);
MACE_BM_B2I(64, 32, 5, 5);
MACE_BM_B2I(64, 32, 7, 7);
MACE_BM_B2I(128, 64, 1, 1);
MACE_BM_B2I(128, 64, 3, 3);
MACE_BM_B2I(128, 32, 1, 1);
MACE_BM_B2I(128, 32, 3, 3);
MACE_BM_B2I(256, 32, 1, 1);
MACE_BM_B2I(256, 32, 3, 3);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_ChannelShuffle(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ChannelShuffleOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ChannelShuffleOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -50,8 +50,8 @@ class ChannelShuffleOp : public Operator<D, T> {
protected:
const int group_;
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
private:
kernels::ChannelShuffleFunctor<D, T> functor_;
......
......@@ -69,25 +69,26 @@ void ChannelShuffle(
}
} // namespace
#define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void \
BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
BENCHMARK(BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
#define MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void \
MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
MACE_BENCHMARK( \
MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU);
#define MACE_BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \
MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU);
BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8);
MACE_BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
MACE_BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
MACE_BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_Concat(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ConcatOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ConcatOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -51,7 +51,7 @@ class ConcatOp : public Operator<D, T> {
kernels::ConcatFunctor<D, T> functor_;
private:
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -52,16 +52,16 @@ void ConcatHelper(int iters, int concat_dim, int dim1) {
}
} // namespace
#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \
ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \
} \
BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1)
#define MACE_BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
static void MACE_BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \
ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \
} \
MACE_BENCHMARK(MACE_BM_CONCAT_CPU_##DIM0##_##DIM1)
BM_CONCAT_CPU_MACRO(0, 1000);
BM_CONCAT_CPU_MACRO(0, 100000);
BM_CONCAT_CPU_MACRO(1, 1000);
BM_CONCAT_CPU_MACRO(1, 100000);
MACE_BM_CONCAT_CPU_MACRO(0, 1000);
MACE_BM_CONCAT_CPU_MACRO(0, 100000);
MACE_BM_CONCAT_CPU_MACRO(1, 1000);
MACE_BM_CONCAT_CPU_MACRO(1, 100000);
namespace {
template <typename T>
......@@ -106,22 +106,22 @@ void OpenclConcatHelper(int iters,
}
} // namespace
#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
BENCHMARK(BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, float);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, float);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, float);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, half);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half);
#define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\
std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float);
MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, float);
MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, float);
MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, float);
MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, half);
MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half);
MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half);
MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_Conv2D(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -54,8 +54,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
kernels::Conv2dFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -105,11 +105,11 @@ void Conv2d(int iters,
// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
// in-order.
#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, \
DEVICE) \
#define MACE_BM_CONV_2D_MACRO( \
N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, DEVICE) \
static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\
##_##P##_##OC##_##TYPE##_##DEVICE( \
MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##\
DILATION##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
......@@ -128,54 +128,53 @@ void Conv2d(int iters,
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \
mace::Padding::P, OC); \
} \
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\
##_##P##_##OC##_##TYPE##_##DEVICE)
MACE_BENCHMARK( \
MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##\
DILATION##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU);
#define MACE_BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU);
// Filter sizes and data alignments
BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128);
MACE_BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128);
MACE_BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128);
MACE_BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128);
MACE_BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, 1, SAME, 128);
MACE_BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
MACE_BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
MACE_BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128);
MACE_BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128);
MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128);
MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128);
MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128);
// 3 channels input
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3);
BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32);
BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32);
MACE_BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3);
MACE_BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32);
MACE_BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32);
// Dilations
BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32);
BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
MACE_BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32);
MACE_BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
// MobileNet
BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128);
BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024);
MACE_BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128);
MACE_BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024);
BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
MACE_BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
MACE_BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
BM_CONV_2D(1, 192, 17, 17, 1, 7, 1, 1, SAME, 192);
BM_CONV_2D(1, 192, 17, 17, 7, 1, 1, 1, SAME, 192);
BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192);
BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2);
BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2);
BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 192, 17, 17, 1, 7, 1, 1, SAME, 192);
MACE_BM_CONV_2D(1, 192, 17, 17, 7, 1, 1, 1, SAME, 192);
MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192);
MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_Deconv2D(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
Deconv2dOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
Deconv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
Deconv2dOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
Deconv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
Deconv2dOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
Deconv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -49,8 +49,8 @@ class Deconv2dOp : public ConvPool2dOpBase<D, T> {
kernels::Deconv2dFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -94,11 +94,11 @@ static void Deconv2d(int iters,
// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
// in-order.
#define BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, OH, OW, P, OC, TYPE, \
DEVICE) \
#define MACE_BM_DECONV_2D_MACRO( \
N, C, H, W, KH, KW, STRIDE, OH, OW, P, OC, TYPE, DEVICE) \
static void \
BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_##OW\
##_##P##_##OC##_##TYPE##_##DEVICE( \
MACE_BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_\
##OW##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t oh = OH; \
......@@ -110,30 +110,30 @@ static void Deconv2d(int iters,
Deconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \
mace::Padding::P, OC); \
} \
BENCHMARK( \
BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_##OW##\
_##P##_##OC##_##TYPE##_##DEVICE)
MACE_BENCHMARK( \
MACE_BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_\
##OW##_##P##_##OC##_##TYPE##_##DEVICE)
// TODO(liutuo): add cpu benchmark when optimized.
#define BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \
BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);
#define MACE_BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \
MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);
BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256);
BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128);
MACE_BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256);
MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128);
BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128);
BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32);
BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32);
BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32);
BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32);
MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128);
MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32);
MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32);
MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32);
MACE_BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32);
BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3);
MACE_BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3);
BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128);
BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128);
BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32);
BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32);
MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128);
MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128);
MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32);
MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_DepthToSpace(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
DepthToSpaceOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
DepthToSpaceOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
DepthToSpaceOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
DepthToSpaceOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
DepthToSpaceOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
DepthToSpaceOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -55,8 +55,8 @@ class DepthToSpaceOp : public Operator<D, T> {
protected:
const int block_size_;
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
private:
kernels::DepthToSpaceOpFunctor<D, T> functor_;
......
......@@ -69,25 +69,26 @@ void DepthToSpace(
}
} // namespace
#define BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void \
BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
BENCHMARK(BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
#define MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void \
MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
MACE_BENCHMARK( \
MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
#define BM_DEPTH_TO_SPACE(N, C, H, W, G) \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);
#define MACE_BM_DEPTH_TO_SPACE(N, C, H, W, G) \
MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \
MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \
MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);
BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4);
MACE_BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
MACE_BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
MACE_BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -55,8 +55,8 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
kernels::DepthwiseConv2dFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -101,61 +101,61 @@ void DepthwiseConv2d(int iters,
}
} // namespace
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \
DEVICE) \
static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\
##P##_##M##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t dilation = 1; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
} \
BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\
##P##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_DEPTHWISE_CONV_2D_MACRO( \
N, C, H, W, KH, KW, STRIDE, P, M, TYPE, DEVICE) \
static void \
MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE\
##_##P##_##M##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t dilation = 1; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
} \
MACE_BENCHMARK( \
MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE\
##_##P##_##M##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU);
#define MACE_BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1);
} // namespace test
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_Eltwise(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
EltwiseOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
EltwiseOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
EltwiseOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
EltwiseOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
EltwiseOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
EltwiseOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -43,7 +43,7 @@ class EltwiseOp : public Operator<D, T> {
kernels::EltwiseFunctor<D, T> functor_;
private:
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -76,30 +76,31 @@ void EltwiseBenchmark(
}
} // namespace
#define BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \
static void \
BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
} \
BENCHMARK(BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \
static void \
MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
} \
MACE_BENCHMARK( \
MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ELTWISE(ELT_TYPE, N, H, W, C) \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU);
#define MACE_BM_ELTWISE(ELT_TYPE, N, H, W, C) \
MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \
MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \
MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU);
BM_ELTWISE(2, 1, 128, 128, 32);
BM_ELTWISE(2, 1, 240, 240, 256);
BM_ELTWISE(2, 1, 256, 256, 32);
BM_ELTWISE(0, 1, 128, 128, 32);
BM_ELTWISE(0, 1, 240, 240, 256);
BM_ELTWISE(5, 1, 128, 128, 32);
BM_ELTWISE(5, 1, 240, 240, 256);
MACE_BM_ELTWISE(2, 1, 128, 128, 32);
MACE_BM_ELTWISE(2, 1, 240, 240, 256);
MACE_BM_ELTWISE(2, 1, 256, 256, 32);
MACE_BM_ELTWISE(0, 1, 128, 128, 32);
MACE_BM_ELTWISE(0, 1, 240, 240, 256);
MACE_BM_ELTWISE(5, 1, 128, 128, 32);
MACE_BM_ELTWISE(5, 1, 240, 240, 256);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
FoldedBatchNormOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
FoldedBatchNormOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -56,8 +56,8 @@ class FoldedBatchNormOp : public Operator<D, T> {
kernels::BatchNormFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_FullyConnected(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
FullyConnectedOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
FullyConnectedOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
FullyConnectedOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
FullyConnectedOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
FullyConnectedOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
FullyConnectedOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -72,8 +72,8 @@ class FullyConnectedOp : public Operator<D, T> {
kernels::FullyConnectedFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -82,28 +82,28 @@ void FCBenchmark(
}
} // namespace
#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
const int64_t tot = \
static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
} \
BENCHMARK(BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE)
#define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
const int64_t tot = \
static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
} \
MACE_BENCHMARK(MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE)
#define BM_FC(N, H, W, C, OC) \
BM_FC_MACRO(N, H, W, C, OC, float, CPU); \
BM_FC_MACRO(N, H, W, C, OC, float, GPU); \
BM_FC_MACRO(N, H, W, C, OC, half, GPU);
#define MACE_BM_FC(N, H, W, C, OC) \
MACE_BM_FC_MACRO(N, H, W, C, OC, float, CPU); \
MACE_BM_FC_MACRO(N, H, W, C, OC, float, GPU); \
MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU);
BM_FC(1, 16, 16, 32, 32);
BM_FC(1, 8, 8, 32, 1000);
BM_FC(1, 2, 2, 512, 2);
BM_FC(1, 7, 7, 512, 2048);
MACE_BM_FC(1, 16, 16, 32, 32);
MACE_BM_FC(1, 8, 8, 32, 1000);
MACE_BM_FC(1, 2, 2, 512, 2);
MACE_BM_FC(1, 7, 7, 512, 2048);
} // namespace test
} // namespace ops
......
......@@ -18,17 +18,17 @@ namespace mace {
namespace ops {
void Register_ImageToBuffer(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ImageToBufferOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ImageToBufferOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ImageToBufferOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ImageToBufferOp<DeviceType::GPU, half>);
}
} // namespace ops
......
......@@ -41,8 +41,8 @@ class ImageToBufferOp : public Operator<D, T> {
kernels::ImageToBufferFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -18,11 +18,11 @@ namespace mace {
namespace ops {
void Register_LocalResponseNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
LocalResponseNormOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
LocalResponseNormOp<DeviceType::CPU, float>);
}
} // namespace ops
......
......@@ -53,8 +53,8 @@ class LocalResponseNormOp : public Operator<D, T> {
kernels::LocalResponseNormFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -55,32 +55,34 @@ static void LocalResponseNorm(
net.Sync();
}
#define BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
int iters) { \
#define MACE_BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \
static void \
MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
LocalResponseNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
MACE_BENCHMARK( \
MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_LOCAL_RESPONSE_NORM(N, C, H, W) \
BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU);
#define MACE_BM_LOCAL_RESPONSE_NORM(N, C, H, W) \
MACE_BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU);
BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512);
BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128);
BM_LOCAL_RESPONSE_NORM(1, 3, 512, 512);
BM_LOCAL_RESPONSE_NORM(1, 32, 112, 112);
BM_LOCAL_RESPONSE_NORM(1, 64, 256, 256);
BM_LOCAL_RESPONSE_NORM(1, 64, 512, 512);
BM_LOCAL_RESPONSE_NORM(1, 128, 56, 56);
BM_LOCAL_RESPONSE_NORM(1, 128, 256, 256);
BM_LOCAL_RESPONSE_NORM(1, 256, 14, 14);
BM_LOCAL_RESPONSE_NORM(1, 512, 14, 14);
BM_LOCAL_RESPONSE_NORM(1, 1024, 7, 7);
BM_LOCAL_RESPONSE_NORM(32, 1, 256, 256);
BM_LOCAL_RESPONSE_NORM(32, 3, 256, 256);
MACE_BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512);
MACE_BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128);
MACE_BM_LOCAL_RESPONSE_NORM(1, 3, 512, 512);
MACE_BM_LOCAL_RESPONSE_NORM(1, 32, 112, 112);
MACE_BM_LOCAL_RESPONSE_NORM(1, 64, 256, 256);
MACE_BM_LOCAL_RESPONSE_NORM(1, 64, 512, 512);
MACE_BM_LOCAL_RESPONSE_NORM(1, 128, 56, 56);
MACE_BM_LOCAL_RESPONSE_NORM(1, 128, 256, 256);
MACE_BM_LOCAL_RESPONSE_NORM(1, 256, 14, 14);
MACE_BM_LOCAL_RESPONSE_NORM(1, 512, 14, 14);
MACE_BM_LOCAL_RESPONSE_NORM(1, 1024, 7, 7);
MACE_BM_LOCAL_RESPONSE_NORM(32, 1, 256, 256);
MACE_BM_LOCAL_RESPONSE_NORM(32, 3, 256, 256);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_MatMul(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
MatMulOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
MatMulOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
MatMulOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
MatMulOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
MatMulOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
MatMulOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -67,27 +67,28 @@ void MatMulBenchmark(
}
} // namespace
#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
#define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
MACE_BENCHMARK(MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
#define BM_MATMUL(N, H, C, W) \
BM_MATMUL_MACRO(N, H, C, W, float, CPU); \
BM_MATMUL_MACRO(N, H, C, W, float, GPU); \
BM_MATMUL_MACRO(N, H, C, W, half, GPU);
#define MACE_BM_MATMUL(N, H, C, W) \
MACE_BM_MATMUL_MACRO(N, H, C, W, float, CPU); \
MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU); \
MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU);
BM_MATMUL(16, 32, 128, 49);
BM_MATMUL(16, 32, 128, 961);
BM_MATMUL(16, 32, 128, 3969);
BM_MATMUL(16, 128, 128, 49);
BM_MATMUL(16, 128, 128, 961);
BM_MATMUL(16, 128, 128, 3969);
MACE_BM_MATMUL(16, 32, 128, 49);
MACE_BM_MATMUL(16, 32, 128, 961);
MACE_BM_MATMUL(16, 32, 128, 3969);
MACE_BM_MATMUL(16, 128, 128, 49);
MACE_BM_MATMUL(16, 128, 128, 961);
MACE_BM_MATMUL(16, 128, 128, 3969);
} // namespace test
} // namespace ops
......
......@@ -18,23 +18,23 @@ namespace mace {
namespace ops {
void Register_Pad(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
PadOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
PadOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
PadOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
PadOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
PadOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
PadOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -65,25 +65,25 @@ void Pad(int iters, int batch, int height,
}
} // namespace
#define BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE) \
static void BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD); \
} \
BENCHMARK(BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE)
#define MACE_BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE) \
static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD); \
} \
MACE_BENCHMARK(MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE)
#define BM_PAD(N, H, W, C, PAD) \
BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \
BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \
BM_PAD_MACRO(N, H, W, C, PAD, half, GPU);
#define MACE_BM_PAD(N, H, W, C, PAD) \
MACE_BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \
MACE_BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \
MACE_BM_PAD_MACRO(N, H, W, C, PAD, half, GPU);
BM_PAD(1, 512, 512, 1, 2);
BM_PAD(1, 112, 112, 64, 1);
BM_PAD(1, 256, 256, 32, 2);
BM_PAD(1, 512, 512, 16, 2);
MACE_BM_PAD(1, 512, 512, 1, 2);
MACE_BM_PAD(1, 112, 112, 64, 1);
MACE_BM_PAD(1, 256, 256, 32, 2);
MACE_BM_PAD(1, 512, 512, 16, 2);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_Pooling(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -52,8 +52,8 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
PoolingType pooling_type_;
kernels::PoolingFunctor<D, T> functor_;
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -87,29 +87,29 @@ void Pooling(int iters,
}
} // namespace
#define BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE) \
static void \
BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\
##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \
Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
PoolingType::PO); \
} \
BENCHMARK( \
BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\
#define MACE_BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE) \
static void \
MACE_BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\
##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \
Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
PoolingType::PO); \
} \
MACE_BENCHMARK( \
MACE_BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\
##DEVICE)
#define BM_POOLING(N, C, H, W, K, S, PA, PO) \
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU);
#define MACE_BM_POOLING(N, C, H, W, K, S, PA, PO) \
MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \
MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU);
BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX);
BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX);
BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX);
BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX);
MACE_BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX);
MACE_BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX);
MACE_BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX);
MACE_BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX);
} // namespace test
} // namespace ops
......
......@@ -18,11 +18,11 @@ namespace mace {
namespace ops {
void Register_Proposal(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ProposalOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ProposalOp<DeviceType::CPU, float>);
}
} // namespace ops
......
......@@ -49,8 +49,8 @@ class ProposalOp : public Operator<D, T> {
kernels::ProposalFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO);
OP_OUTPUT_TAGS(ROIS);
MACE_OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO);
MACE_OP_OUTPUT_TAGS(ROIS);
};
} // namespace ops
......
......@@ -18,11 +18,11 @@ namespace mace {
namespace ops {
void Register_PSROIAlign(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
PSROIAlignOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
PSROIAlignOp<DeviceType::CPU, float>);
}
} // namespace ops
......
......@@ -43,8 +43,8 @@ class PSROIAlignOp : public Operator<D, T> {
kernels::PSROIAlignFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, ROIS);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, ROIS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -18,27 +18,27 @@ namespace mace {
namespace ops {
void Register_Quantize(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
QuantizeOp<DeviceType::CPU, uint8_t>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
QuantizeOp<DeviceType::CPU, uint8_t>);
}
void Register_Dequantize(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
DequantizeOp<DeviceType::CPU, uint8_t>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
DequantizeOp<DeviceType::CPU, uint8_t>);
}
void Register_Requantize(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
RequantizeOp<DeviceType::CPU, uint8_t>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
RequantizeOp<DeviceType::CPU, uint8_t>);
}
} // namespace ops
......
......@@ -50,8 +50,8 @@ class QuantizeOp : public Operator<D, T> {
kernels::QuantizeFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX);
OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX);
MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
};
template<DeviceType D, class T>
......@@ -79,8 +79,8 @@ class DequantizeOp : public Operator<D, T> {
kernels::DequantizeFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
template<DeviceType D, class T>
......@@ -131,8 +131,8 @@ class RequantizeOp : public Operator<D, T> {
kernels::RequantizeFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX, RERANGE_MIN, RERANGE_MAX);
OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX, RERANGE_MIN, RERANGE_MAX);
MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
};
} // namespace ops
......
......@@ -18,11 +18,11 @@ namespace mace {
namespace ops {
void Register_Reshape(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ReshapeOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ReshapeOp<DeviceType::CPU, float>);
}
} // namespace ops
......
......@@ -69,8 +69,8 @@ class ReshapeOp : public Operator<D, T> {
kernels::ReshapeFunctor<D, T> functor_;
private:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_ResizeBilinear(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ResizeBilinearOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
ResizeBilinearOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -82,34 +82,33 @@ void ResizeBilinearBenchmark(int iters,
}
} // namespace
#define BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE) \
static void \
BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \
BENCHMARK( \
BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
#define MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE) \
static void \
MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \
MACE_BENCHMARK( \
MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE)
#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
#define MACE_BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480);
BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15);
BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30);
BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60);
BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480);
BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480);
BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016);
MACE_BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480);
MACE_BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15);
MACE_BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30);
MACE_BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60);
MACE_BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480);
MACE_BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480);
MACE_BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_Slice(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
SliceOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
SliceOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SliceOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SliceOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SliceOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SliceOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -46,7 +46,7 @@ class SliceOp : public Operator<D, T> {
kernels::SliceFunctor<D, T> functor_;
private:
OP_INPUT_TAGS(INPUT);
MACE_OP_INPUT_TAGS(INPUT);
};
} // namespace ops
......
......@@ -73,26 +73,28 @@ void BMSliceHelper(int iters,
}
} // namespace
#define BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE) \
#define MACE_BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE) \
static void \
BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE(int iters) { \
MACE_BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSliceHelper<DEVICE, TYPE>(iters, {N, H, W, C}, NO); \
} \
BENCHMARK(BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE)
MACE_BENCHMARK( \
MACE_BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE)
#define BM_SLICE(N, H, W, C, NO) \
BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \
BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \
BM_SLICE_MACRO(N, H, W, C, NO, half, GPU);
#define MACE_BM_SLICE(N, H, W, C, NO) \
MACE_BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \
MACE_BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \
MACE_BM_SLICE_MACRO(N, H, W, C, NO, half, GPU);
BM_SLICE(1, 32, 32, 32, 2);
BM_SLICE(1, 32, 32, 128, 2);
BM_SLICE(1, 32, 32, 256, 2);
BM_SLICE(1, 128, 128, 32, 2);
BM_SLICE(1, 128, 128, 128, 2);
MACE_BM_SLICE(1, 32, 32, 32, 2);
MACE_BM_SLICE(1, 32, 32, 128, 2);
MACE_BM_SLICE(1, 32, 32, 256, 2);
MACE_BM_SLICE(1, 128, 128, 32, 2);
MACE_BM_SLICE(1, 128, 128, 128, 2);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_Softmax(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
SoftmaxOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
SoftmaxOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SoftmaxOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SoftmaxOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SoftmaxOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SoftmaxOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -40,8 +40,8 @@ class SoftmaxOp : public Operator<D, T> {
kernels::SoftmaxFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(LOGITS);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(LOGITS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -70,26 +70,26 @@ void SoftmaxBenchmark(
}
} // namespace
#define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define MACE_BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
MACE_BENCHMARK(MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_SOFTMAX(N, C, H, W) \
BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \
BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \
BM_SOFTMAX_MACRO(N, C, H, W, half, GPU);
#define MACE_BM_SOFTMAX(N, C, H, W) \
MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \
MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \
MACE_BM_SOFTMAX_MACRO(N, C, H, W, half, GPU);
BM_SOFTMAX(1, 2, 512, 512);
BM_SOFTMAX(1, 3, 512, 512);
BM_SOFTMAX(1, 4, 512, 512);
BM_SOFTMAX(1, 10, 256, 256);
BM_SOFTMAX(1, 1024, 7, 7);
MACE_BM_SOFTMAX(1, 2, 512, 512);
MACE_BM_SOFTMAX(1, 3, 512, 512);
MACE_BM_SOFTMAX(1, 4, 512, 512);
MACE_BM_SOFTMAX(1, 10, 256, 256);
MACE_BM_SOFTMAX(1, 1024, 7, 7);
} // namespace test
} // namespace ops
......
......@@ -18,23 +18,23 @@ namespace mace {
namespace ops {
void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -45,8 +45,8 @@ class SpaceToBatchNDOp : public Operator<D, T> {
kernels::SpaceToBatchFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -64,27 +64,27 @@ void BMSpaceToBatch(
}
} // namespace
#define BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE) \
static void \
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \
} \
BENCHMARK( \
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
#define MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE) \
static void \
MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \
} \
MACE_BENCHMARK( \
MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
#define MACE_BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
MACE_BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
MACE_BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
} // namespace test
} // namespace ops
......
......@@ -18,24 +18,24 @@ namespace mace {
namespace ops {
void Register_SpaceToDepth(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
SpaceToDepthOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
SpaceToDepthOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SpaceToDepthOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
SpaceToDepthOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SpaceToDepthOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
SpaceToDepthOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -62,8 +62,8 @@ class SpaceToDepthOp : public Operator<D, T> {
}
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
private:
kernels::DepthToSpaceOpFunctor<D, T> functor_;
......
......@@ -69,25 +69,26 @@ void SpaceToDepth(
}
} // namespace
#define BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void \
BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
BENCHMARK(BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
#define MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void \
MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
MACE_BENCHMARK( \
MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
#define BM_SPACE_TO_DEPTH(N, C, H, W, G) \
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU); \
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \
BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);
#define MACE_BM_SPACE_TO_DEPTH(N, C, H, W, G) \
MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU); \
MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \
MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);
BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);
MACE_BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
MACE_BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
MACE_BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);
} // namespace test
} // namespace ops
......
......@@ -18,11 +18,11 @@ namespace mace {
namespace ops {
void Register_Transpose(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
TransposeOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
TransposeOp<DeviceType::CPU, float>);
}
} // namespace ops
......
......@@ -50,8 +50,8 @@ class TransposeOp : public Operator<D, T> {
std::vector<int> dims_;
kernels::TransposeFunctor<D, T> functor_;
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
......
......@@ -55,41 +55,41 @@ void TransposeBenchmark(int iters,
}
} // namespace
#define BM_TRANSPOSE2D_MACRO(H, W, TYPE, DEVICE) \
static void BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \
#define MACE_BM_TRANSPOSE2D_MACRO(H, W, TYPE, DEVICE) \
static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {H, W}, {1, 0}); \
} \
BENCHMARK(BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE)
#define BM_TRANSPOSE2D(H, W) \
BM_TRANSPOSE2D_MACRO(H, W, float, CPU);
#define BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, TYPE, DEVICE) \
static void \
BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3}); \
} \
BENCHMARK( \
BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##DEVICE)
#define BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \
BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU);
BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2);
BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1);
BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2);
BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1);
BM_TRANSPOSE2D(128, 128);
BM_TRANSPOSE2D(512, 512);
MACE_BENCHMARK(MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE)
#define MACE_BM_TRANSPOSE2D(H, W) \
MACE_BM_TRANSPOSE2D_MACRO(H, W, float, CPU);
#define MACE_BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, TYPE, DEVICE) \
static void \
MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3}); \
} \
MACE_BENCHMARK( \
MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
DEVICE)
#define MACE_BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \
MACE_BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU);
MACE_BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2);
MACE_BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1);
MACE_BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2);
MACE_BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1);
MACE_BM_TRANSPOSE2D(128, 128);
MACE_BM_TRANSPOSE2D(512, 512);
} // namespace test
} // namespace ops
......
......@@ -19,17 +19,17 @@ namespace ops {
void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
WinogradInverseTransformOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
WinogradInverseTransformOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
WinogradInverseTransformOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
WinogradInverseTransformOp<DeviceType::GPU, half>);
#else
MACE_UNUSED(op_registry);
#endif // MACE_ENABLE_OPENCL
......
......@@ -49,8 +49,8 @@ class WinogradInverseTransformOp : public Operator<D, T> {
kernels::WinogradInverseTransformFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT, BIAS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -19,17 +19,17 @@ namespace ops {
void Register_WinogradTransform(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
WinogradTransformOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
WinogradTransformOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
WinogradTransformOp<DeviceType::GPU, half>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
WinogradTransformOp<DeviceType::GPU, half>);
#else
MACE_UNUSED(op_registry);
#endif // MACE_ENABLE_OPENCL
......
......@@ -43,8 +43,8 @@ class WinogradTransformOp : public Operator<D, T> {
kernels::WinogradTransformFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
MACE_OP_INPUT_TAGS(INPUT);
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
......
......@@ -51,22 +51,24 @@ void BMWinogradTransform(
}
} // namespace
#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
#define MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \
BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
MACE_BENCHMARK( \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
#define MACE_BM_WINOGRAD_TRANSFORM(N, H, W, C) \
MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
MACE_BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
MACE_BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
MACE_BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
namespace {
template <DeviceType D, typename T>
......@@ -103,24 +105,24 @@ void BMWinogradInverseTransform(
}
} // namespace
#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \
BENCHMARK( \
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
MACE_BENCHMARK( \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
} // namespace test
} // namespace ops
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册