diff --git a/mace/kernels/batch_to_space.h b/mace/kernels/batch_to_space.h
index 8198d5766406afcb8365724057173196b1bc87de..a88959d26fe6643cf43dc6061c9e07bad2354669 100644
--- a/mace/kernels/batch_to_space.h
+++ b/mace/kernels/batch_to_space.h
@@ -35,12 +35,12 @@ struct BatchToSpaceFunctorBase : OpKernel {
   BatchToSpaceFunctorBase(OpKernelContext *context,
                           const std::vector<int> &paddings,
                           const std::vector<int> &block_shape)
-    : OpKernel(context),
-      paddings_(paddings.begin(), paddings.end()),
-      block_shape_(block_shape.begin(), block_shape.end()) {
+      : OpKernel(context),
+        paddings_(paddings.begin(), paddings.end()),
+        block_shape_(block_shape.begin(), block_shape.end()) {
     MACE_CHECK(
-      block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
-      "Block's shape should be 1D, and greater than 1");
+        block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
+        "Block's shape should be 1D, and greater than 1");
     MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
   }
 
@@ -94,11 +94,11 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
   BatchToSpaceFunctor(OpKernelContext *context,
                       const std::vector<int> &paddings,
                       const std::vector<int> &block_shape)
-    : BatchToSpaceFunctorBase(context, paddings, block_shape) {}
+      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}
 
   MaceStatus operator()(Tensor *space_tensor,
-                       Tensor *batch_tensor,
-                       StatsFuture *future) {
+                        Tensor *batch_tensor,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
 
     std::vector<index_t> output_shape(4, 0);
@@ -107,8 +107,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
                                      output_shape.data());
     MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
 
-    Tensor::MappingGuard input_guard(space_tensor);
-    Tensor::MappingGuard output_guard(batch_tensor);
+    Tensor::MappingGuard input_guard(batch_tensor);
+    Tensor::MappingGuard output_guard(space_tensor);
 
     int pad_top = paddings_[0];
     int pad_left = paddings_[2];
@@ -129,8 +129,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
 
     // 32k/sizeof(float)/out_width/block_shape
     index_t
-      block_h_size =
-      std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
+        block_h_size =
+        std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
 
     // make channel outter loop so we can make best use of cache
 #pragma omp parallel for collapse(3)
@@ -144,34 +144,34 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
           const index_t tile_w = tile_index % block_shape_w;
           const index_t valid_h_start = std::max(block_h, (pad_top - tile_h
-            + block_shape_h - 1)
-            / block_shape_h);
+              + block_shape_h - 1)
+              / block_shape_h);
           const index_t valid_h_end = std::min(in_height,
                                                std::min(
-                                                 block_h + block_h_size,
-                                                 (out_height + pad_top
-                                                     - tile_h
-                                                     + block_shape_h - 1)
-                                                     / block_shape_h));
+                                                   block_h + block_h_size,
+                                                   (out_height + pad_top
+                                                       - tile_h
+                                                       + block_shape_h - 1)
+                                                       / block_shape_h));
           const index_t valid_w_start = std::max(static_cast<index_t>(0),
                                                  (pad_left - tile_w
-                                                   + block_shape_w - 1)
-                                                   / block_shape_w);
+                                                     + block_shape_w - 1)
+                                                     / block_shape_w);
           const index_t valid_w_end = std::min(in_width,
                                                (out_width + pad_left - tile_w
-                                                 + block_shape_w - 1)
-                                                 / block_shape_w);
+                                                   + block_shape_w - 1)
+                                                   / block_shape_w);
           const float *input_base =
-            input_data + (in_b * channels + c) * in_height * in_width;
+              input_data + (in_b * channels + c) * in_height * in_width;
           float *output_base =
-            output_data + (b * channels + c) * out_height * out_width;
+              output_data + (b * channels + c) * out_height * out_width;
 
           index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
           for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
             index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
             for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
               output_base[h * out_width + w] =
-                input_base[in_h * in_width + in_w];
+                  input_base[in_h * in_width + in_w];
               w += block_shape_w;
             }  // w
             h += block_shape_h;
@@ -184,6 +184,93 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
   }
 };
 
+template<>
+struct BatchToSpaceFunctor<DeviceType::CPU, uint8_t> : BatchToSpaceFunctorBase {
+  BatchToSpaceFunctor(OpKernelContext *context,
+                      const std::vector<int> &paddings,
+                      const std::vector<int> &block_shape)
+      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}
+
+  MaceStatus operator()(Tensor *space_tensor,
+                        Tensor *batch_tensor,
+                        StatsFuture *future) {
+    MACE_UNUSED(future);
+
+    std::vector<index_t> output_shape(4, 0);
+
+    CalculateBatchToSpaceOutputShape(batch_tensor,
+                                     DataFormat::NHWC,
+                                     output_shape.data());
+    MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
+
+    Tensor::MappingGuard input_guard(batch_tensor);
+    Tensor::MappingGuard output_guard(space_tensor);
+
+    int pad_top = paddings_[0];
+    int pad_left = paddings_[2];
+    int block_shape_h = block_shape_[0];
+    int block_shape_w = block_shape_[1];
+
+    space_tensor->SetScale(batch_tensor->scale());
+    space_tensor->SetZeroPoint(batch_tensor->zero_point());
+    const uint8_t *input_data = batch_tensor->data<uint8_t>();
+    uint8_t *output_data = space_tensor->mutable_data<uint8_t>();
+
+    index_t in_batches = batch_tensor->dim(0);
+    index_t in_height = batch_tensor->dim(1);
+    index_t in_width = batch_tensor->dim(2);
+
+    index_t out_batches = space_tensor->dim(0);
+    index_t out_height = space_tensor->dim(1);
+    index_t out_width = space_tensor->dim(2);
+    index_t channels = space_tensor->dim(3);
+
+#pragma omp parallel for
+    for (index_t in_b = 0; in_b < in_batches; ++in_b) {
+      const index_t b = in_b % out_batches;
+      const index_t tile_index = in_b / out_batches;
+      const index_t tile_h = tile_index / block_shape_w;
+      const index_t tile_w = tile_index % block_shape_w;
+      const index_t valid_h_start = std::max(static_cast<index_t>(0),
+                                             (pad_top - tile_h
+                                                 + block_shape_h - 1)
+                                                 / block_shape_h);
+      const index_t valid_h_end = std::min(in_height,
+                                           (out_height + pad_top
+                                               - tile_h
+                                               + block_shape_h - 1)
+                                               / block_shape_h);
+      const index_t valid_w_start = std::max(static_cast<index_t>(0),
+                                             (pad_left - tile_w
+                                                 + block_shape_w - 1)
+                                                 / block_shape_w);
+      const index_t valid_w_end = std::min(in_width,
+                                           (out_width + pad_left - tile_w
+                                               + block_shape_w - 1)
+                                               / block_shape_w);
+      const uint8_t *input_base =
+          input_data + in_b * in_height * in_width * channels;
+      uint8_t
+          *output_base = output_data + b * out_height * out_width * channels;
+
+      index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
+      for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
+        index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
+        for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
+          memcpy(output_base + (h * out_width + w) * channels,
+                 input_base + (in_h * in_width + in_w) * channels,
+                 channels * sizeof(uint8_t));
+          w += block_shape_w;
+        }  // w
+        h += block_shape_h;
+      }  // h
+    }  // b
+
+    return MACE_SUCCESS;
+  }
+};
+
 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
 struct BatchToSpaceFunctor<DeviceType::GPU, T> : BatchToSpaceFunctorBase {
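Note: the uint8 NHWC path added above inverts the space-to-batch tiling. Each batch index in_b splits into an output batch b = in_b % out_batches plus a spatial tile (tile_h, tile_w), and every valid pixel is copied back onto a block-strided grid, one memcpy per pixel across all channels. A minimal standalone sketch of that index mapping, simplified to a single channel and zero padding (illustrative code, not a MACE API):

#include <cstdint>
#include <cstdio>
#include <vector>

// Batch-to-space index mapping, one channel, no padding.
void BatchToSpaceSketch(const std::vector<uint8_t> &batch,  // [in_b, in_h, in_w]
                        int in_batches, int in_h, int in_w,
                        int block_h, int block_w,
                        std::vector<uint8_t> *space,        // [out_b, out_h, out_w]
                        int out_batches) {
  const int out_h = in_h * block_h;
  const int out_w = in_w * block_w;
  space->assign(out_batches * out_h * out_w, 0);
  for (int in_b = 0; in_b < in_batches; ++in_b) {
    const int b = in_b % out_batches;     // output batch
    const int tile = in_b / out_batches;  // spatial offset this batch carries
    const int tile_h = tile / block_w;
    const int tile_w = tile % block_w;
    for (int h = 0; h < in_h; ++h) {
      for (int w = 0; w < in_w; ++w) {
        // Each batch pixel lands on a block-strided grid in the output.
        (*space)[(b * out_h + h * block_h + tile_h) * out_w
                 + w * block_w + tile_w] =
            batch[(in_b * in_h + h) * in_w + w];
      }
    }
  }
}

int main() {
  // 4 batches of 1x1 -> 1 batch of 2x2 with a 2x2 block.
  std::vector<uint8_t> batch = {1, 2, 3, 4};
  std::vector<uint8_t> space;
  BatchToSpaceSketch(batch, 4, 1, 1, 2, 2, &space, 1);
  for (uint8_t v : space) std::printf("%d ", v);  // prints: 1 2 3 4
  std::printf("\n");
  return 0;
}

The printed order matches the SmallData case in mace/ops/space_to_batch_test.cc ({4, 1, 1, 1} batch back to {1, 2, 2, 1} space).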
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index 92e57b4fde5fa39b0a5ae2801b4077633731eae7..4fab243ab16de40c0e6c13595c1e7c05944667ca 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -61,50 +61,72 @@ inline void ComputeInterpolationWeights(
   }
 }
 
-inline float ComputeLerp(const float top_left,
-                         const float top_right,
-                         const float bottom_left,
-                         const float bottom_right,
-                         const float x_lerp,
-                         const float y_lerp) {
+template <typename T>
+inline T ComputeLerp(const T top_left,
+                     const T top_right,
+                     const T bottom_left,
+                     const T bottom_right,
+                     const float x_lerp,
+                     const float y_lerp);
+
+template <>
+inline float ComputeLerp(const float top_left,
+                         const float top_right,
+                         const float bottom_left,
+                         const float bottom_right,
+                         const float x_lerp,
+                         const float y_lerp) {
   const float top = top_left + (top_right - top_left) * x_lerp;
   const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
   return top + (bottom - top) * y_lerp;
 }
 
-inline void ResizeImage(const float *images,
-                        const index_t batch_size,
-                        const index_t in_height,
-                        const index_t in_width,
-                        const index_t out_height,
-                        const index_t out_width,
-                        const index_t channels,
-                        const std::vector<CachedInterpolation> &xs_vec,
-                        const std::vector<CachedInterpolation> &ys,
-                        float *output) {
+template <>
+inline uint8_t ComputeLerp(const uint8_t top_left,
+                           const uint8_t top_right,
+                           const uint8_t bottom_left,
+                           const uint8_t bottom_right,
+                           const float x_lerp,
+                           const float y_lerp) {
+  const float top = top_left + (top_right - top_left) * x_lerp;
+  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
+  return static_cast<uint8_t>(roundf(top + (bottom - top) * y_lerp));
+}
+
+template <typename T>
+inline void ResizeImageNCHW(const T *images,
+                            const index_t batch_size,
+                            const index_t in_height,
+                            const index_t in_width,
+                            const index_t out_height,
+                            const index_t out_width,
+                            const index_t channels,
+                            const std::vector<CachedInterpolation> &xs_vec,
+                            const std::vector<CachedInterpolation> &ys,
+                            T *output) {
   const CachedInterpolation *xs = xs_vec.data();
 
 #pragma omp parallel for collapse(2)
   for (index_t b = 0; b < batch_size; ++b) {
     for (index_t c = 0; c < channels; ++c) {
-      const float
+      const T
           *channel_input_ptr = images + (b * channels + c) * in_height * in_width;
-      float *channel_output_ptr =
+      T *channel_output_ptr =
           output + (b * channels + c) * out_height * out_width;
       for (index_t y = 0; y < out_height; ++y) {
-        const float *y_lower_input_ptr =
+        const T *y_lower_input_ptr =
             channel_input_ptr + ys[y].lower * in_width;
-        const float *y_upper_input_ptr =
+        const T *y_upper_input_ptr =
             channel_input_ptr + ys[y].upper * in_width;
         const float ys_lerp = ys[y].lerp;
 
         for (index_t x = 0; x < out_width; ++x) {
           const float xs_lerp = xs[x].lerp;
-          const float top_left = y_lower_input_ptr[xs[x].lower];
-          const float top_right = y_lower_input_ptr[xs[x].upper];
-          const float bottom_left = y_upper_input_ptr[xs[x].lower];
-          const float bottom_right = y_upper_input_ptr[xs[x].upper];
+          const T top_left = y_lower_input_ptr[xs[x].lower];
+          const T top_right = y_lower_input_ptr[xs[x].upper];
+          const T bottom_left = y_upper_input_ptr[xs[x].lower];
+          const T bottom_right = y_upper_input_ptr[xs[x].upper];
           channel_output_ptr[y * out_width + x] =
               ComputeLerp(top_left, top_right, bottom_left, bottom_right,
                           xs_lerp, ys_lerp);
@@ -114,6 +136,48 @@ inline void ResizeImage(const float *images,
   }
 }
 
+template <typename T>
+inline void ResizeImageNHWC(const T *images,
+                            const index_t batch_size,
+                            const index_t in_height,
+                            const index_t in_width,
+                            const index_t out_height,
+                            const index_t out_width,
+                            const index_t channels,
+                            const std::vector<CachedInterpolation> &xs_vec,
+                            const std::vector<CachedInterpolation> &ys,
+                            T *output) {
+  const CachedInterpolation *xs = xs_vec.data();
+
+  for (index_t b = 0; b < batch_size; ++b) {
+    const T *input_base = images + b * channels * in_height * in_width;
+    T *output_base = output + b * channels * out_height * out_width;
+#pragma omp parallel for
+    for (index_t y = 0; y < out_height; ++y) {
+      const T
+          *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels;
+      const T
+          *y_upper_input_ptr = input_base + ys[y].upper * in_width * channels;
+      const float ys_lerp = ys[y].lerp;
+
+      for (index_t x = 0; x < out_width; ++x) {
+        const float xs_lerp = xs[x].lerp;
+        const T *top_left = y_lower_input_ptr + xs[x].lower * channels;
+        const T *top_right = y_lower_input_ptr + xs[x].upper * channels;
+        const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels;
+        const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels;
+
+        T *output_ptr = output_base + (y * out_width + x) * channels;
+        for (index_t c = 0; c < channels; ++c) {
+          output_ptr[c] =
+              ComputeLerp(top_left[c], top_right[c], bottom_left[c],
+                          bottom_right[c], xs_lerp, ys_lerp);
+        }
+      }
+    }
+  }
+}
+
 struct ResizeBilinearFunctorBase : OpKernel {
   ResizeBilinearFunctorBase(OpKernelContext *context,
                             const std::vector<index_t> &size,
@@ -132,11 +196,7 @@ struct ResizeBilinearFunctorBase : OpKernel {
 };
 
 template <DeviceType D, typename T>
-struct ResizeBilinearFunctor;
-
-template<>
-struct ResizeBilinearFunctor<DeviceType::CPU, float>
-    : ResizeBilinearFunctorBase {
+struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
   ResizeBilinearFunctor(OpKernelContext *context,
                         const std::vector<index_t> &size,
                         bool align_corners)
@@ -159,8 +219,8 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard output_mapper(output);
 
-    const float *input_data = input->data<float>();
-    float *output_data = output->mutable_data<float>();
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
 
     if (out_height == in_height && out_width == in_width) {
       std::copy(input_data,
@@ -181,8 +241,77 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
     ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data());
     ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data());
 
-    ResizeImage(input_data, batch, in_height, in_width, out_height, out_width,
-                channels, xs, ys, output_data);
+    ResizeImageNCHW(input_data,
+                    batch,
+                    in_height,
+                    in_width,
+                    out_height,
+                    out_width,
+                    channels,
+                    xs,
+                    ys,
+                    output_data);
+
+    return MACE_SUCCESS;
+  }
+};
+
+template <DeviceType D>
+struct ResizeBilinearFunctor<D, uint8_t> : ResizeBilinearFunctorBase {
+  ResizeBilinearFunctor(OpKernelContext *context,
+                        const std::vector<index_t> &size,
+                        bool align_corners)
+      : ResizeBilinearFunctorBase(context, size, align_corners) {}
+
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future) {
+    MACE_UNUSED(future);
+    const index_t batch = input->dim(0);
+    const index_t in_height = input->dim(1);
+    const index_t in_width = input->dim(2);
+    const index_t channels = input->dim(3);
+
+    index_t out_height = out_height_;
+    index_t out_width = out_width_;
+    MACE_CHECK(out_height > 0 && out_width > 0);
+    std::vector<index_t> out_shape{batch, out_height, out_width, channels};
+    MACE_RETURN_IF_ERROR(output->Resize(out_shape));
+
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard output_mapper(output);
+    const uint8_t *input_data = input->data<uint8_t>();
+    uint8_t *output_data = output->mutable_data<uint8_t>();
+
+    if (out_height == in_height && out_width == in_width) {
+      std::copy(input_data,
+                input_data + batch * in_height * in_width * channels,
+                output_data);
+      return MACE_SUCCESS;
+    }
+
+    float height_scale =
+        CalculateResizeScale(in_height, out_height, align_corners_);
+    float width_scale =
+        CalculateResizeScale(in_width, out_width, align_corners_);
+
+    std::vector<CachedInterpolation> ys(out_height + 1);
+    std::vector<CachedInterpolation> xs(out_width + 1);
+
+    // Compute the cached interpolation weights on the x and y dimensions.
+    ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data());
+    ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data());
+
+    ResizeImageNHWC(input_data,
+                    batch,
+                    in_height,
+                    in_width,
+                    out_height,
+                    out_width,
+                    channels,
+                    xs,
+                    ys,
+                    output_data);
 
     return MACE_SUCCESS;
   }
 };
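Note: the uint8 ComputeLerp specialization above deliberately keeps the interpolation arithmetic in float and rounds only once at the end; rounding after each axis would accumulate up to a full quantization step of error. A quick standalone check of the same math (plain C++ mirroring the specialization, not MACE code):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Same structure as the uint8 ComputeLerp above: lerp along x on the
// top and bottom rows, then along y, rounding once at the end.
static uint8_t LerpU8(uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
                      float x_lerp, float y_lerp) {
  const float top = tl + (tr - tl) * x_lerp;  // uint8 promotes to int; exact
  const float bottom = bl + (br - bl) * x_lerp;
  return static_cast<uint8_t>(std::roundf(top + (bottom - top) * y_lerp));
}

int main() {
  // Sample the center of a 2x2 patch: the average of the four corners.
  std::printf("%d\n", LerpU8(10, 20, 30, 40, 0.5f, 0.5f));  // prints 25
  return 0;
}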
diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index 94fdea951a8cd84fd05b5e8712add3bea334ea13..86982963644821f2f980a23d5a270c5a79396636 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -34,12 +34,12 @@ struct SpaceToBatchFunctorBase : OpKernel {
   SpaceToBatchFunctorBase(OpKernelContext *context,
                           const std::vector<int> &paddings,
                           const std::vector<int> &block_shape)
-    : OpKernel(context),
-      paddings_(paddings.begin(), paddings.end()),
-      block_shape_(block_shape.begin(), block_shape.end()) {
+      : OpKernel(context),
+        paddings_(paddings.begin(), paddings.end()),
+        block_shape_(block_shape.begin(), block_shape.end()) {
     MACE_CHECK(
-      block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
-      "Block's shape should be 1D, and greater than 1");
+        block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
+        "Block's shape should be 1D, and greater than 1");
     MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
   }
 
@@ -100,11 +100,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
   SpaceToBatchFunctor(OpKernelContext *context,
                       const std::vector<int> &paddings,
                       const std::vector<int> &block_shape)
-    : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
+      : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
 
   MaceStatus operator()(Tensor *space_tensor,
-                       Tensor *batch_tensor,
-                       StatsFuture *future) {
+                        Tensor *batch_tensor,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
 
     std::vector<index_t> output_shape(4, 0);
@@ -135,7 +135,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
     index_t out_width = batch_tensor->dim(3);
 
     index_t block_h_size =
-      std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
+        std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
 
     // make channel outter loop so we can make best use of cache
 #pragma omp parallel for collapse(3)
@@ -149,27 +149,27 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
           const index_t tile_w = tile_index % block_shape_w;
           const index_t valid_h_start = std::max(block_h, (pad_top - tile_h
-            + block_shape_h - 1)
-            / block_shape_h);
+              + block_shape_h - 1)
+              / block_shape_h);
           const index_t valid_h_end = std::min(out_height,
                                                std::min(
-                                                 block_h + block_h_size,
-                                                 (in_height + pad_top
-                                                     - tile_h
-                                                     + block_shape_h - 1)
-                                                     / block_shape_h));
+                                                   block_h + block_h_size,
+                                                   (in_height + pad_top
+                                                       - tile_h
+                                                       + block_shape_h - 1)
+                                                       / block_shape_h));
           const index_t valid_w_start = std::max(static_cast<index_t>(0),
                                                  (pad_left - tile_w
-                                                   + block_shape_w - 1)
-                                                   / block_shape_w);
+                                                     + block_shape_w - 1)
+                                                     / block_shape_w);
           const index_t valid_w_end = std::min(out_width,
                                                (in_width + pad_left - tile_w
-                                                 + block_shape_w - 1)
-                                                 / block_shape_w);
+                                                   + block_shape_w - 1)
+                                                   / block_shape_w);
           const float *input_base =
-            input_data + (in_b * channels + c) * in_height * in_width;
+              input_data + (in_b * channels + c) * in_height * in_width;
           float *output_base =
-            output_data + (b * channels + c) * out_height * out_width;
+              output_data + (b * channels + c) * out_height * out_width;
 
           memset(output_base + block_h * out_width,
                  0,
@@ -184,7 +184,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
             index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
             for (index_t w = valid_w_start; w < valid_w_end; ++w) {
               output_base[h * out_width + w] =
-                input_base[in_h * in_width + in_w];
+                  input_base[in_h * in_width + in_w];
               in_w += block_shape_w;
             }  // w
             in_h += block_shape_h;
@@ -197,7 +197,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
           memset(output_base + valid_h_end * out_width,
                  0,
                  (std::min(out_height, block_h + block_h_size) - valid_h_end)
-                   * out_width * sizeof(float));
+                     * out_width * sizeof(float));
         }  // b
       }  // block_h
     }  // c
@@ -205,6 +205,111 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
   }
 };
 
+template<>
+struct SpaceToBatchFunctor<DeviceType::CPU, uint8_t> : SpaceToBatchFunctorBase {
+  SpaceToBatchFunctor(OpKernelContext *context,
+                      const std::vector<int> &paddings,
+                      const std::vector<int> &block_shape)
+      : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
+
+  MaceStatus operator()(Tensor *space_tensor,
+                        Tensor *batch_tensor,
+                        StatsFuture *future) {
+    MACE_UNUSED(future);
+
+    std::vector<index_t> output_shape(4, 0);
+
+    CalculateSpaceToBatchOutputShape(space_tensor,
+                                     DataFormat::NHWC,
+                                     output_shape.data());
+    MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
+    int zero_point = space_tensor->zero_point();
+
+    Tensor::MappingGuard input_guard(space_tensor);
+    Tensor::MappingGuard output_guard(batch_tensor);
+
+    int pad_top = paddings_[0];
+    int pad_left = paddings_[2];
+    int block_shape_h = block_shape_[0];
+    int block_shape_w = block_shape_[1];
+
+    batch_tensor->SetScale(space_tensor->scale());
+    batch_tensor->SetZeroPoint(space_tensor->zero_point());
+    const uint8_t *input_data = space_tensor->data<uint8_t>();
+    uint8_t *output_data = batch_tensor->mutable_data<uint8_t>();
+
+    index_t in_batches = space_tensor->dim(0);
+    index_t in_height = space_tensor->dim(1);
+    index_t in_width = space_tensor->dim(2);
+
+    index_t out_batches = batch_tensor->dim(0);
+    index_t out_height = batch_tensor->dim(1);
+    index_t out_width = batch_tensor->dim(2);
+    index_t channels = batch_tensor->dim(3);
+
+#pragma omp parallel for
+    for (index_t b = 0; b < out_batches; ++b) {
+      const index_t in_b = b % in_batches;
+      const index_t tile_index = b / in_batches;
+      const index_t tile_h = tile_index / block_shape_w;
+      const index_t tile_w = tile_index % block_shape_w;
+      const index_t valid_h_start = std::max(static_cast<index_t>(0),
+                                             (pad_top - tile_h
+                                                 + block_shape_h - 1)
+                                                 / block_shape_h);
+      const index_t valid_h_end = std::min(out_height,
+                                           (in_height + pad_top
+                                               - tile_h
+                                               + block_shape_h - 1)
+                                               / block_shape_h);
+      const index_t valid_w_start = std::max(static_cast<index_t>(0),
+                                             (pad_left - tile_w
+                                                 + block_shape_w - 1)
+                                                 / block_shape_w);
+      const index_t valid_w_end = std::min(out_width,
+                                           (in_width + pad_left - tile_w
+                                               + block_shape_w - 1)
+                                               / block_shape_w);
+      const uint8_t *input_base =
+          input_data + in_b * channels * in_height * in_width;
+      uint8_t *output_base =
+          output_data + b * channels * out_height * out_width;
+
+      memset(output_base,
+             zero_point,
+             valid_h_start * out_width * channels * sizeof(uint8_t));
+
+      index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
+      for (index_t h = valid_h_start; h < valid_h_end; ++h) {
+        memset(output_base + h * out_width * channels,
+               zero_point,
+               valid_w_start * channels * sizeof(uint8_t));
+
+        index_t
+            in_w = valid_w_start * block_shape_w + tile_w - pad_left;
+        for (index_t w = valid_w_start; w < valid_w_end; ++w) {
+          memcpy(output_base + (h * out_width + w) * channels,
+                 input_base + (in_h * in_width + in_w) * channels,
+                 sizeof(uint8_t) * channels);
+          in_w += block_shape_w;
+        }  // w
+        in_h += block_shape_h;
+
+        memset(output_base + (h * out_width + valid_w_end) * channels,
+               zero_point,
+               (out_width - valid_w_end) * channels * sizeof(uint8_t));
+      }  // h
+
+      memset(output_base + valid_h_end * out_width * channels,
+             zero_point,
+             (out_height - valid_h_end) * out_width * channels
+                 * sizeof(uint8_t));
+    }  // b
+
+    return MACE_SUCCESS;
+  }
+};
+
 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
 struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
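Note: a detail that is easy to miss in the uint8 SpaceToBatch above is that the padded border is memset to the tensor's zero_point rather than raw 0, because under affine quantization (real = scale * (quantized - zero_point), the scheme MACE's quantized tensors use) the real value 0.0 is encoded as the zero point. A small illustration with made-up values:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Affine quantization: real = scale * (quantized - zero_point).
  const float scale = 0.05f;
  const int zero_point = 128;

  // Real 0.0 encodes to the zero point itself...
  const uint8_t q_zero =
      static_cast<uint8_t>(std::lround(0.0f / scale) + zero_point);
  std::printf("raw byte for real 0.0: %d\n", q_zero);  // 128, not 0

  // ...while a raw 0 byte decodes to a large negative value, which is
  // why memset(..., 0, ...) would corrupt the padding region.
  std::printf("real value of raw 0: %g\n", scale * (0 - zero_point));  // -6.4
  return 0;
}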
diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc
index b0ffd66bdf38fb0f96de601588e67cc32cb1874a..103e12977c01021fc0a3d4008558935da96b9ab5 100644
--- a/mace/ops/batch_to_space.cc
+++ b/mace/ops/batch_to_space.cc
@@ -23,6 +23,11 @@ void Register_BatchToSpaceND(OperatorRegistryBase *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     BatchToSpaceNDOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<uint8_t>("T")
+                             .Build(),
+                         BatchToSpaceNDOp<DeviceType::CPU, uint8_t>);
 #ifdef MACE_ENABLE_OPENCL
   MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
                              .Device(DeviceType::GPU)
diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc
index c281f0cce2f6ce2600b93a769a94f89451d22a95..6a860a42b4c13ab3e37ebd27a287ae2cc5e4dbf8 100644
--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -28,6 +28,11 @@ void Register_Concat(OperatorRegistryBase *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     ConcatOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<uint8_t>("T")
+                             .Build(),
+                         ConcatOp<DeviceType::CPU, uint8_t>);
 #ifdef MACE_ENABLE_OPENCL
   MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
                              .Device(DeviceType::GPU)
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 3a248ac17249adadf14123a7b124eb93fc057594..7e9a2df8d19bc4c3aab07fec12dcb39638fd46e9 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -162,11 +162,13 @@ class OpsTestNet {
     std::fill(input_data, input_data + input->size(), data);
   }
 
-  template <DeviceType D, typename T>
+  template<DeviceType D, typename T>
   void AddRandomInput(const std::string &name,
                       const std::vector<index_t> &shape,
                       bool positive = true,
-                      bool truncate = false) {
+                      bool truncate = false,
+                      const float truncate_min = 0.001f,
+                      const float truncate_max = 100.f) {
     Tensor *input =
         ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(),
                          DataTypeToEnum<T>::v());
@@ -180,21 +182,22 @@ class OpsTestNet {
     if (DataTypeToEnum<T>::value == DT_HALF) {
       std::generate(
           input_data, input_data + input->size(),
-          [&gen, &nd, positive, truncate] {
+          [&gen, &nd, positive, truncate, truncate_min, truncate_max] {
            float d = nd(gen);
            if (truncate) {
-              if (std::abs(d) > 100.f) d = 100.f;
-              if (std::abs(d) < 0.001f) d = 0.001f;
+              if (std::abs(d) > truncate_max) d = truncate_max;
+              if (std::abs(d) < truncate_min) d = truncate_min;
            }
            return half_float::half_cast<half>(positive ? std::abs(d) : d);
          });
     } else {
       std::generate(input_data, input_data + input->size(),
-                    [&gen, &nd, positive, truncate] {
+                    [&gen, &nd, positive, truncate,
+                        truncate_min, truncate_max] {
                       float d = nd(gen);
                       if (truncate) {
-                        if (std::abs(d) > 100.f) d = 100.f;
-                        if (std::abs(d) < 0.001f) d = 0.001f;
+                        if (std::abs(d) > truncate_max) d = truncate_max;
+                        if (std::abs(d) < truncate_min) d = truncate_min;
                       }
                       return (positive ? std::abs(d) : d);
                     });
diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc
index 82bbfd0a3aea8caa88a22821852435dcd9567e62..3106256955383366225fd24b97bbff8b49e9132d 100644
--- a/mace/ops/resize_bilinear.cc
+++ b/mace/ops/resize_bilinear.cc
@@ -24,6 +24,12 @@ void Register_ResizeBilinear(OperatorRegistryBase *op_registry) {
                         .Build(),
                     ResizeBilinearOp<DeviceType::CPU, float>);
 
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<uint8_t>("T")
+                             .Build(),
+                         ResizeBilinearOp<DeviceType::CPU, uint8_t>);
+
 #ifdef MACE_ENABLE_OPENCL
   MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
                              .Device(DeviceType::GPU)
diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc
index 1f21780135a1cca24788630a796c6a6028b31a53..993d7269a44f44de337dd4403f31e085ca557f3e 100644
--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -36,8 +36,15 @@ void ResizeBilinearBenchmark(int iters,
 
   // Add input data
   if (D == DeviceType::CPU) {
-    net.AddRandomInput<D, float>("Input",
-                                 {batch, channels, input_height, input_width});
+    if (DataTypeToEnum<T>::value != DT_UINT8) {
+      net.AddRandomInput<D, float>("Input",
+                                   {batch, channels, input_height,
+                                    input_width});
+    } else {
+      net.AddRandomInput<D, uint8_t>("Input",
+                                     {batch, input_height, input_width,
+                                      channels});
+    }
   } else if (D == DeviceType::GPU) {
     net.AddRandomInput<D, float>("Input",
                                  {batch, input_height, input_width, channels});
@@ -99,6 +106,7 @@ void ResizeBilinearBenchmark(int iters,
 
 #define MACE_BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1)                 \
   MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU);    \
+  MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, uint8_t, CPU);  \
   MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU);    \
   MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc
index 5d284f867a88c4acedbeb9293372dba7b9e1ea9d..c628bd9b1649fc6d6abb440dd70e294921131f62 100644
--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -140,12 +140,89 @@ void TestRandomResizeBilinear() {
                         1e-6);
   }
 }
+
+void TestQuantizedResizeBilinear() {
+  testing::internal::LogToStderr();
+  static unsigned int seed = time(NULL);
+  for (int round = 0; round < 10; ++round) {
+    int batch = 1 + rand_r(&seed) % 5;
+    int channels = 1 + rand_r(&seed) % 100;
+    int height = 1 + rand_r(&seed) % 100;
+    int width = 1 + rand_r(&seed) % 100;
+    int in_height = 1 + rand_r(&seed) % 100;
+    int in_width = 1 + rand_r(&seed) % 100;
+    int align_corners = rand_r(&seed) % 2;
+
+    // Construct graph
+    OpsTestNet net;
+    // Add input data
+    net.AddRandomInput<DeviceType::CPU, float>("Input",
+                                               {batch, in_height, in_width,
+                                                channels},
+                                               false,
+                                               true,
+                                               -1.f,
+                                               1.f);
+    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+                                                    NCHW);
+
+    OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
+        .Input("InputNCHW")
+        .Output("OutputNCHW")
+        .AddIntArg("align_corners", align_corners)
+        .AddIntsArg("size", {height, width})
+        .Finalize(net.NewOperatorDef());
+    // Run on CPU
+    net.RunOp(DeviceType::CPU);
+    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+                                                    "Output", NHWC);
+
+    // run quantize
+    OpDefBuilder("Quantize", "QuantizeInput")
+        .Input("Input")
+        .Output("QuantizedInput")
+        .OutputType({DT_UINT8})
+        .AddIntArg("T", DT_UINT8)
+        .Finalize(net.NewOperatorDef());
+    net.RunOp();
+
+    OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
+        .Input("QuantizedInput")
+        .Output("QuantizedOutput")
+        .AddIntArg("align_corners", align_corners)
+        .AddIntsArg("size", {height, width})
+        .OutputType({DT_UINT8})
+        .AddIntArg("T", DT_UINT8)
+        .Finalize(net.NewOperatorDef());
+    net.RunOp();
+
+    Tensor *eq_output = net.GetTensor("QuantizedInput");
+    Tensor *q_output = net.GetTensor("QuantizedOutput");
+    q_output->SetScale(eq_output->scale());
+    q_output->SetZeroPoint(eq_output->zero_point());
+    OpDefBuilder("Dequantize", "DeQuantizeTest")
+        .Input("QuantizedOutput")
+        .Output("DequantizedOutput")
+        .OutputType({DT_FLOAT})
+        .AddIntArg("T", DT_UINT8)
+        .Finalize(net.NewOperatorDef());
+    net.RunOp();
+
+    // Check
+    ExpectTensorSimilar<float>(*net.GetOutput("Output"),
+                               *net.GetTensor("DequantizedOutput"), 0.01);
+  }
+}
+
 }  // namespace
 
 TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
   TestRandomResizeBilinear<DeviceType::GPU>();
 }
 
+TEST_F(ResizeBilinearTest, QuantizedResizeBilinear) {
+  TestQuantizedResizeBilinear();
+}
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc
index e0291172bc1daccbef28c9662e4d0fc07657c8f6..29dbed979a842bb7601b83d5d0c6c27610bc988f 100644
--- a/mace/ops/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -23,6 +23,11 @@ void Register_SpaceToBatchND(OperatorRegistryBase *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     SpaceToBatchNDOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<uint8_t>("T")
+                             .Build(),
+                         SpaceToBatchNDOp<DeviceType::CPU, uint8_t>);
 #ifdef MACE_ENABLE_OPENCL
   MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
                              .Device(DeviceType::GPU)
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index 272c487c74e74764c24f7d5edc0de35eceac8dd6..faff487a710752c501150588740ece24e7b4e45a 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -28,7 +28,13 @@ void BMSpaceToBatch(
   OpsTestNet net;
 
   if (D == DeviceType::CPU) {
-    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
+    if (DataTypeToEnum<T>::value != DT_UINT8) {
+      net.AddRandomInput<D, float>(
+          "Input", {batch, channels, height, width});
+    } else {
+      net.AddRandomInput<D, uint8_t>(
+          "Input", {batch, height, width, channels});
+    }
   } else if (D == DeviceType::GPU) {
     net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
   }
@@ -39,6 +45,7 @@ void BMSpaceToBatch(
         .Output("Output")
         .AddIntsArg("paddings", {shape, shape, shape, shape})
         .AddIntsArg("block_shape", {shape, shape})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
@@ -78,7 +85,8 @@ void BMSpaceToBatch(
 
 #define MACE_BM_SPACE_TO_BATCH(N, H, W, C, SHAPE)                 \
   MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU);    \
-  MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
+  MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);    \
+  MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, uint8_t, CPU);
 
 MACE_BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
 MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
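Note: the quantized tests in this change (the resize test above and the space/batch tests below) all follow the same round-trip pattern: run the float kernel as the reference, then Quantize -> uint8 kernel -> Dequantize, and compare with ExpectTensorSimilar at a 0.01 tolerance. That tolerance has to absorb at least half a quantization step; a rough sanity check of the budget, assuming the 8-bit affine scheme over the [-1, 1] range the tests draw inputs from:

#include <cstdio>

int main() {
  // 255 steps across [-1, 1] under 8-bit affine quantization.
  const float range_min = -1.f, range_max = 1.f;
  const float scale = (range_max - range_min) / 255.f;  // ~0.0078
  // Worst-case quantize/dequantize error is half a step, which is
  // comfortably inside the 0.01 similarity tolerance used here.
  std::printf("scale = %f, half step = %f\n", scale, scale / 2.f);
  return 0;
}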
diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc
index 8a3c35feff500ccf180b23de814ca5c89569c74b..1d07ecfc03463b72d97fea287e39980ccb8839eb 100644
--- a/mace/ops/space_to_batch_test.cc
+++ b/mace/ops/space_to_batch_test.cc
@@ -147,62 +147,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
   RunBatchToSpace(batch_shape, batch_data, block_data, padding_data,
                   space_tensor.get());
 }
-}  // namespace
-
-TEST(SpaceToBatchTest, SmallData) {
-  TestBidirectionalTransform({1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2},
-                             {0, 0, 0, 0}, {4, 1, 1, 1}, {1, 2, 3, 4});
-}
-
-TEST(SpaceToBatchTest, SmallDataWithOnePadding) {
-  TestBidirectionalTransform({1, 2, 2, 1}, {1, 2, 3, 4}, {3, 3},
-                             {1, 0, 1, 0}, {9, 1, 1, 1},
-                             {0, 0, 0, 0, 1, 2, 0, 3, 4});
-}
-
-TEST(SpaceToBatchTest, SmallDataWithTwoPadding) {
-  TestBidirectionalTransform(
-      {1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2}, {1, 1, 1, 1}, {4, 2, 2, 1},
-      {0, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 0, 0});
-}
-
-TEST(SpaceToBatchTest, SmallDataWithLargeImage) {
-  TestBidirectionalTransform(
-      {1, 2, 10, 1},
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
-      {2, 2}, {0, 0, 0, 0}, {4, 1, 5, 1},
-      {1, 3, 5, 7, 9, 2, 4, 6, 8, 10, 11, 13, 15, 17, 19, 12, 14, 16, 18, 20});
-}
-
-TEST(SpaceToBatchTest, MultiChannelData) {
-  TestBidirectionalTransform(
-      {1, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {2, 2},
-      {0, 0, 0, 0}, {4, 1, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-}
-
-TEST(SpaceToBatchTest, LargerMultiChannelData) {
-  TestBidirectionalTransform(
-      {1, 4, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-      {2, 2}, {0, 0, 0, 0}, {4, 2, 2, 1},
-      {1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
-}
-
-TEST(SpaceToBatchTest, MultiBatchData) {
-  TestBidirectionalTransform(
-      {2, 2, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-      {2, 2}, {0, 0, 0, 0}, {8, 1, 2, 1},
-      {1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
-}
-
-TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
-  TestBidirectionalTransform(
-      {2, 2, 4, 2},
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
-      {2, 2}, {0, 0, 0, 0}, {8, 1, 2, 2},
-      {1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24,
-       9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32});
-}
 
 void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
                                 const std::vector<int> &block_shape_data,
@@ -278,6 +223,173 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
                       *net.GetOutput("OutputGPU"));
 }
 
+void TestSpaceToBatchQuantize(const std::vector<index_t> &input_shape,
+                              const std::vector<int> &block_shape_data,
+                              const std::vector<int> &padding_data) {
+  OpsTestNet net;
+  net.AddRandomInput<DeviceType::CPU, float>("Input", input_shape,
+                                             false, true, -1.f, 1.f);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+                                                  NCHW);
+  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+      .Input("InputNCHW")
+      .Output("OutputNCHW")
+      .AddIntsArg("paddings", padding_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+                                                  "OutputCPU", NHWC);
+
+  // run quantize
+  OpDefBuilder("Quantize", "QuantizeInput")
+      .Input("Input")
+      .Output("QuantizedInput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+      .Input("QuantizedInput")
+      .Output("QuantizedOutput")
+      .AddIntsArg("paddings", padding_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  Tensor *eq_output = net.GetTensor("QuantizedInput");
+  Tensor *q_output = net.GetTensor("QuantizedOutput");
+  q_output->SetScale(eq_output->scale());
+  q_output->SetZeroPoint(eq_output->zero_point());
+  OpDefBuilder("Dequantize", "DeQuantizeTest")
+      .Input("QuantizedOutput")
+      .Output("DequantizedOutput")
+      .OutputType({DT_FLOAT})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
+                             *net.GetTensor("DequantizedOutput"), 0.01);
+}
+
+void TestoBatchToSpaceQuantize(const std::vector<index_t> &input_shape,
+                               const std::vector<int> &block_shape_data,
+                               const std::vector<int> &crops_data) {
+  OpsTestNet net;
+  net.AddRandomInput<DeviceType::CPU, float>("Input", input_shape,
+                                             false, true, -1.f, 1.f);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+                                                  NCHW);
+  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+      .Input("InputNCHW")
+      .Output("OutputNCHW")
+      .AddIntsArg("crops", crops_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+                                                  "OutputCPU", NHWC);
+
+  // run quantize
+  OpDefBuilder("Quantize", "QuantizeInput")
+      .Input("Input")
+      .Output("QuantizedInput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+      .Input("QuantizedInput")
+      .Output("QuantizedOutput")
+      .AddIntsArg("crops", crops_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  Tensor *eq_output = net.GetTensor("QuantizedInput");
+  Tensor *q_output = net.GetTensor("QuantizedOutput");
+  q_output->SetScale(eq_output->scale());
+  q_output->SetZeroPoint(eq_output->zero_point());
+  OpDefBuilder("Dequantize", "DeQuantizeTest")
+      .Input("QuantizedOutput")
+      .Output("DequantizedOutput")
+      .OutputType({DT_FLOAT})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
+                             *net.GetTensor("DequantizedOutput"), 0.01);
+}
+
+}  // namespace
+
+TEST(SpaceToBatchTest, SmallData) {
+  TestBidirectionalTransform({1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2},
+                             {0, 0, 0, 0}, {4, 1, 1, 1}, {1, 2, 3, 4});
+}
+
+TEST(SpaceToBatchTest, SmallDataWithOnePadding) {
+  TestBidirectionalTransform({1, 2, 2, 1}, {1, 2, 3, 4}, {3, 3},
+                             {1, 0, 1, 0}, {9, 1, 1, 1},
+                             {0, 0, 0, 0, 1, 2, 0, 3, 4});
+}
+
+TEST(SpaceToBatchTest, SmallDataWithTwoPadding) {
+  TestBidirectionalTransform(
+      {1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2}, {1, 1, 1, 1}, {4, 2, 2, 1},
+      {0, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 0, 0});
+}
+
+TEST(SpaceToBatchTest, SmallDataWithLargeImage) {
+  TestBidirectionalTransform(
+      {1, 2, 10, 1},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
+      {2, 2}, {0, 0, 0, 0}, {4, 1, 5, 1},
+      {1, 3, 5, 7, 9, 2, 4, 6, 8, 10, 11, 13, 15, 17, 19, 12, 14, 16, 18, 20});
+}
+
+TEST(SpaceToBatchTest, MultiChannelData) {
+  TestBidirectionalTransform(
+      {1, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {2, 2},
+      {0, 0, 0, 0}, {4, 1, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+}
+
+TEST(SpaceToBatchTest, LargerMultiChannelData) {
+  TestBidirectionalTransform(
+      {1, 4, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+      {2, 2}, {0, 0, 0, 0}, {4, 2, 2, 1},
+      {1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
+}
+
+TEST(SpaceToBatchTest, MultiBatchData) {
+  TestBidirectionalTransform(
+      {2, 2, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+      {2, 2}, {0, 0, 0, 0}, {8, 1, 2, 1},
+      {1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
+}
+
+TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
+  TestBidirectionalTransform(
+      {2, 2, 4, 2},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
+      {2, 2}, {0, 0, 0, 0}, {8, 1, 2, 2},
+      {1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24,
+       9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32});
+}
+
 TEST(SpaceToBatchTest, LargeData) {
   TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
   TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
@@ -285,6 +397,13 @@ TEST(SpaceToBatchTest, LargeData) {
   TestoBatchToSpaceLargeInput({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
 }
 
+TEST(SpaceToBatchTest, Quantize) {
+  TestSpaceToBatchQuantize({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
+  TestSpaceToBatchQuantize({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
+  TestoBatchToSpaceQuantize({64, 32, 32, 32}, {8, 8}, {0, 0, 0, 0});
+  TestoBatchToSpaceQuantize({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
+}
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
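Note: a final observation on the kernels in this change. The uint8 space/batch kernels carry the input's quantization parameters across unchanged (SetScale/SetZeroPoint), and the resize test copies them onto the output before dequantizing. That is sound because batch/space rearrangement only moves values, and bilinear interpolation only produces convex combinations of input values, so the output can never leave the range the input's parameters cover. A tiny demonstration of the convex-combination bound (plain C++, illustrative only):

#include <algorithm>
#include <cstdio>

int main() {
  // Bilinear weights are non-negative and sum to 1, so the result is
  // bounded by the min and max of the four corner values.
  const float corners[4] = {0.1f, 0.7f, 0.3f, 0.5f};
  const float x = 0.25f, y = 0.6f;
  const float w[4] = {(1 - x) * (1 - y), x * (1 - y), (1 - x) * y, x * y};
  float out = 0.f;
  for (int i = 0; i < 4; ++i) out += w[i] * corners[i];
  std::printf("out = %f, bounds = [%f, %f]\n", out,
              *std::min_element(corners, corners + 4),
              *std::max_element(corners, corners + 4));
  return 0;
}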