Commit a5c9db3a authored by 李寅

Implement quantized resize_bilinear, b2s, s2b

Parent df23f428
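The quantized resize path hinges on the new uint8_t specialization of ComputeLerp further down: because bilinear interpolation is a convex combination (the weights sum to 1), the affine quantization parameters cancel out, so the lerp can run directly on the raw uint8 values with float weights and be rounded back, while the output tensor simply reuses the input's scale and zero point. A minimal standalone sketch of that idea (illustrative names, not part of this patch):

#include <cmath>
#include <cstdint>

// Bilinear lerp of four quantized neighbors that share one scale/zero_point.
// The integer subtractions promote to int, so negative differences are fine.
inline uint8_t LerpU8(uint8_t top_left, uint8_t top_right,
                      uint8_t bottom_left, uint8_t bottom_right,
                      float x_lerp, float y_lerp) {
  const float top = top_left + (top_right - top_left) * x_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
  return static_cast<uint8_t>(roundf(top + (bottom - top) * y_lerp));
}
// Example: LerpU8(10, 20, 30, 40, 0.5f, 0.5f) == 25.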
@@ -35,12 +35,12 @@ struct BatchToSpaceFunctorBase : OpKernel {
BatchToSpaceFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: OpKernel(context),
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()) {
MACE_CHECK(
block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
"Block's shape should be 1D, and greater than 1");
MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
}
@@ -94,11 +94,11 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
@@ -107,8 +107,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
output_shape.data());
MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(batch_tensor);
Tensor::MappingGuard output_guard(space_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
@@ -129,8 +129,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
// 32k/sizeof(float)/out_width/block_shape
index_t
block_h_size =
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
@@ -144,34 +144,34 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(block_h,
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(in_height,
std::min(
block_h + block_h_size,
(out_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h));
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(in_width,
(out_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
output_data + (b * channels + c) * out_height * out_width;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
output_base[h * out_width + w] =
input_base[in_h * in_width + in_w];
w += block_shape_w;
} // w
h += block_shape_h;
@@ -184,6 +184,93 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
}
};
template<>
struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NHWC,
output_shape.data());
MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(batch_tensor);
Tensor::MappingGuard output_guard(space_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
space_tensor->SetScale(batch_tensor->scale());
space_tensor->SetZeroPoint(batch_tensor->zero_point());
const uint8_t *input_data = batch_tensor->data<uint8_t>();
uint8_t *output_data = space_tensor->mutable_data<uint8_t>();
index_t in_batches = batch_tensor->dim(0);
index_t in_height = batch_tensor->dim(1);
index_t in_width = batch_tensor->dim(2);
index_t out_batches = space_tensor->dim(0);
index_t out_height = space_tensor->dim(1);
index_t out_width = space_tensor->dim(2);
index_t channels = space_tensor->dim(3);
#pragma omp parallel for
for (index_t in_b = 0; in_b < in_batches; ++in_b) {
const index_t b = in_b % out_batches;
const index_t tile_index = in_b / out_batches;
const index_t tile_h = tile_index / block_shape_w;
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(static_cast<index_t>(0),
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(in_height,
(out_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(in_width,
(out_width + pad_left
- tile_w
+ block_shape_w - 1)
/ block_shape_w);
const uint8_t *input_base =
input_data + in_b * in_height * in_width * channels;
uint8_t
*output_base = output_data + b * out_height * out_width * channels;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
memcpy(output_base + (h * out_width + w) * channels,
input_base + (in_h * in_width + in_w) * channels,
channels * sizeof(uint8_t));
w += block_shape_w;
} // w
h += block_shape_h;
} // h
} // b
return MACE_SUCCESS;
}
};
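Unlike the float kernel above, which walks NCHW data and copies one element per (h, w), this quantized kernel works on NHWC data, so all channels of a pixel are contiguous and each pixel can be moved with a single memcpy of `channels` bytes. The batch-to-space index mapping is the same in both kernels; a rough sketch of that mapping (hypothetical helper, not in the patch):

#include <cstdint>

// Each input batch in_b corresponds to output batch in_b % out_batches and a
// block offset tile = in_b / out_batches; input pixel (in_h, in_w) then lands
// at output pixel (in_h * block_h + tile_h - pad_top,
//                  in_w * block_w + tile_w - pad_left).
struct B2SCoord { int64_t b, h, w; };

inline B2SCoord MapBatchToSpace(int64_t in_b, int64_t in_h, int64_t in_w,
                                int64_t out_batches,
                                int64_t block_h, int64_t block_w,
                                int64_t pad_top, int64_t pad_left) {
  const int64_t tile_index = in_b / out_batches;
  const int64_t tile_h = tile_index / block_w;
  const int64_t tile_w = tile_index % block_w;
  return {in_b % out_batches,
          in_h * block_h + tile_h - pad_top,
          in_w * block_w + tile_w - pad_left};
}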
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct BatchToSpaceFunctor<DeviceType::GPU, T> : BatchToSpaceFunctorBase {
...
@@ -61,50 +61,72 @@ inline void ComputeInterpolationWeights(
}
}
template <typename T>
inline T ComputeLerp(const T top_left,
const T top_right,
const T bottom_left,
const T bottom_right,
const float x_lerp,
const float y_lerp);
template <>
inline float ComputeLerp<float>(const float top_left,
const float top_right,
const float bottom_left,
const float bottom_right,
const float x_lerp,
const float y_lerp) {
const float top = top_left + (top_right - top_left) * x_lerp;
const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
return top + (bottom - top) * y_lerp;
}
template <>
inline uint8_t ComputeLerp<uint8_t>(const uint8_t top_left,
const uint8_t top_right,
const uint8_t bottom_left,
const uint8_t bottom_right,
const float x_lerp,
const float y_lerp) {
const float top = top_left + (top_right - top_left) * x_lerp;
const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
return static_cast<uint8_t>(roundf(top + (bottom - top) * y_lerp));
}
template <typename T>
inline void ResizeImageNCHW(const T *images,
const index_t batch_size,
const index_t in_height,
const index_t in_width,
const index_t out_height,
const index_t out_width,
const index_t channels,
const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys,
T *output) {
const CachedInterpolation *xs = xs_vec.data();
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch_size; ++b) {
for (index_t c = 0; c < channels; ++c) {
const T
*channel_input_ptr =
images + (b * channels + c) * in_height * in_width;
T *channel_output_ptr =
output + (b * channels + c) * out_height * out_width;
for (index_t y = 0; y < out_height; ++y) {
const T *y_lower_input_ptr =
channel_input_ptr + ys[y].lower * in_width;
const T *y_upper_input_ptr =
channel_input_ptr + ys[y].upper * in_width;
const float ys_lerp = ys[y].lerp;
for (index_t x = 0; x < out_width; ++x) {
const float xs_lerp = xs[x].lerp;
const T top_left = y_lower_input_ptr[xs[x].lower];
const T top_right = y_lower_input_ptr[xs[x].upper];
const T bottom_left = y_upper_input_ptr[xs[x].lower];
const T bottom_right = y_upper_input_ptr[xs[x].upper];
channel_output_ptr[y * out_width + x] =
ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
@@ -114,6 +136,48 @@ inline void ResizeImage(const float *images,
}
}
template <typename T>
inline void ResizeImageNHWC(const T *images,
const index_t batch_size,
const index_t in_height,
const index_t in_width,
const index_t out_height,
const index_t out_width,
const index_t channels,
const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys,
T *output) {
const CachedInterpolation *xs = xs_vec.data();
for (index_t b = 0; b < batch_size; ++b) {
const T *input_base = images + b * channels * in_height * in_width;
T *output_base = output + b * channels * out_height * out_width;
#pragma omp parallel for
for (index_t y = 0; y < out_height; ++y) {
const T
*y_lower_input_ptr = input_base + ys[y].lower * in_width * channels;
const T
*y_upper_input_ptr = input_base + ys[y].upper * in_width * channels;
const float ys_lerp = ys[y].lerp;
for (index_t x = 0; x < out_width; ++x) {
const float xs_lerp = xs[x].lerp;
const T *top_left = y_lower_input_ptr + xs[x].lower * channels;
const T *top_right = y_lower_input_ptr + xs[x].upper * channels;
const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels;
const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels;
T *output_ptr = output_base + (y * out_width + x) * channels;
for (index_t c = 0; c < channels; ++c) {
output_ptr[c] =
ComputeLerp(top_left[c], top_right[c], bottom_left[c],
bottom_right[c], xs_lerp, ys_lerp);
}
}
}
}
}
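Both resize kernels consume precomputed CachedInterpolation entries (lower/upper source indices plus a float lerp weight) produced by ComputeInterpolationWeights, which this patch leaves unchanged. As a rough sketch of what one such entry holds, assuming the usual `in = out * scale` convention used with CalculateResizeScale (an illustration, not the function's actual body):

#include <algorithm>
#include <cstdint>

struct CachedWeightSketch { int64_t lower, upper; float lerp; };

// Weight for a single output coordinate i, with scale = in_size / out_size
// (align_corners handling omitted for brevity).
inline CachedWeightSketch WeightFor(int64_t i, float scale, int64_t in_size) {
  const float in = i * scale;
  const int64_t lower = static_cast<int64_t>(in);
  return {lower, std::min(lower + 1, in_size - 1), in - lower};
}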
struct ResizeBilinearFunctorBase : OpKernel {
ResizeBilinearFunctorBase(OpKernelContext *context,
const std::vector<index_t> &size,
@@ -132,11 +196,7 @@ struct ResizeBilinearFunctorBase : OpKernel {
};
template<DeviceType D, typename T>
struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners)
@@ -159,8 +219,8 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
if (out_height == in_height && out_width == in_width) {
std::copy(input_data,
@@ -181,8 +241,77 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data());
ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data());
ResizeImageNCHW(input_data,
batch,
in_height,
in_width,
out_height,
out_width,
channels,
xs,
ys,
output_data);
return MACE_SUCCESS;
}
};
template<DeviceType D>
struct ResizeBilinearFunctor<D, uint8_t> : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners)
: ResizeBilinearFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
index_t out_height = out_height_;
index_t out_width = out_width_;
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, out_height, out_width, channels};
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output);
const uint8_t *input_data = input->data<uint8_t>();
uint8_t *output_data = output->mutable_data<uint8_t>();
if (out_height == in_height && out_width == in_width) {
std::copy(input_data,
input_data + batch * in_height * in_width * channels ,
output_data);
return MACE_SUCCESS;
}
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
std::vector<CachedInterpolation> ys(out_height + 1);
std::vector<CachedInterpolation> xs(out_width + 1);
// Compute the cached interpolation weights on the x and y dimensions.
ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data());
ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data());
ResizeImageNHWC(input_data,
batch,
in_height,
in_width,
out_height,
out_width,
channels,
xs,
ys,
output_data);
return MACE_SUCCESS;
}
...
@@ -34,12 +34,12 @@ struct SpaceToBatchFunctorBase : OpKernel {
SpaceToBatchFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: OpKernel(context),
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()) {
MACE_CHECK(
block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
"Block's shape should be 1D, and greater than 1");
MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
}
@@ -100,11 +100,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: SpaceToBatchFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
@@ -135,7 +135,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
index_t out_width = batch_tensor->dim(3);
index_t block_h_size =
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
@@ -149,27 +149,27 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(block_h,
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(out_height,
std::min(
block_h + block_h_size,
(in_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h));
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(out_width,
(in_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
output_data + (b * channels + c) * out_height * out_width;
memset(output_base + block_h * out_width,
0,
@@ -184,7 +184,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t w = valid_w_start; w < valid_w_end; ++w) {
output_base[h * out_width + w] =
input_base[in_h * in_width + in_w];
in_w += block_shape_w;
} // w
in_h += block_shape_h;
@@ -197,7 +197,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
memset(output_base + valid_h_end * out_width,
0,
(std::min(out_height, block_h + block_h_size) - valid_h_end)
* out_width * sizeof(float));
} // b
} // block_h
} // c
@@ -205,6 +205,111 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
}
};
template<>
struct SpaceToBatchFunctor<DeviceType::CPU, uint8_t> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: SpaceToBatchFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
CalculateSpaceToBatchOutputShape(space_tensor,
DataFormat::NHWC,
output_shape.data());
MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
int zero_point = space_tensor->zero_point();
Tensor::MappingGuard input_guard(space_tensor);
Tensor::MappingGuard output_guard(batch_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
batch_tensor->SetScale(space_tensor->scale());
batch_tensor->SetZeroPoint(space_tensor->zero_point());
const uint8_t *input_data = space_tensor->data<uint8_t>();
uint8_t *output_data = batch_tensor->mutable_data<uint8_t>();
index_t in_batches = space_tensor->dim(0);
index_t in_height = space_tensor->dim(1);
index_t in_width = space_tensor->dim(2);
index_t out_batches = batch_tensor->dim(0);
index_t out_height = batch_tensor->dim(1);
index_t out_width = batch_tensor->dim(2);
index_t channels = batch_tensor->dim(3);
#pragma omp parallel for
for (index_t b = 0; b < out_batches; ++b) {
const index_t in_b = b % in_batches;
const index_t tile_index = b / in_batches;
const index_t tile_h = tile_index / block_shape_w;
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(static_cast<index_t>(0),
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(out_height,
(in_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(out_width,
(in_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const uint8_t *input_base =
input_data + in_b * channels * in_height * in_width;
uint8_t *output_base =
output_data + b * channels * out_height * out_width;
memset(output_base,
zero_point,
valid_h_start * out_width * channels * sizeof(uint8_t));
index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t h = valid_h_start; h < valid_h_end; ++h) {
memset(output_base + h * out_width * channels,
zero_point,
valid_w_start * channels * sizeof(uint8_t));
index_t
in_w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t w = valid_w_start; w < valid_w_end; ++w) {
memcpy(output_base + (h * out_width + w) * channels,
input_base + (in_h * in_width + in_w) * channels,
sizeof(uint8_t) * channels);
in_w += block_shape_w;
} // w
in_h += block_shape_h;
memset(output_base + (h * out_width + valid_w_end) * channels,
zero_point,
(out_width - valid_w_end) * channels * sizeof(uint8_t));
} // h
memset(output_base + valid_h_end * out_width * channels,
zero_point,
(out_height - valid_h_end) * out_width * channels
* sizeof(uint8_t));
} // b
return MACE_SUCCESS;
}
};
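One detail worth noting: where the float space-to-batch kernel above memsets its padding to 0, this quantized version fills padding with the input's zero_point. Under the affine quantization scheme real = scale * (q - zero_point), the uint8 encoding of the real value 0.0 is exactly zero_point, so a literal 0 byte would decode to a non-zero (typically negative) real value. A small sketch of that relationship (helper name is illustrative):

#include <cstdint>

inline float Dequantize(uint8_t q, float scale, int32_t zero_point) {
  // real = scale * (q - zero_point); q == zero_point therefore decodes to 0.0f.
  return scale * (static_cast<int32_t>(q) - zero_point);
}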
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
...
@@ -23,6 +23,11 @@ void Register_BatchToSpaceND(OperatorRegistryBase *op_registry) {
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::GPU)
...
@@ -28,6 +28,11 @@ void Register_Concat(OperatorRegistryBase *op_registry) {
.TypeConstraint<int32_t>("T")
.Build(),
ConcatOp<DeviceType::CPU, int32_t>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
ConcatOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::GPU)
...
@@ -162,11 +162,13 @@ class OpsTestNet {
std::fill(input_data, input_data + input->size(), data);
}
template<DeviceType D, typename T>
void AddRandomInput(const std::string &name,
const std::vector<index_t> &shape,
bool positive = true,
bool truncate = false,
const float truncate_min = 0.001f,
const float truncate_max = 100.f) {
Tensor *input =
ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
@@ -180,21 +182,22 @@ class OpsTestNet {
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(
input_data, input_data + input->size(),
[&gen, &nd, positive, truncate, truncate_min, truncate_max] {
float d = nd(gen);
if (truncate) {
if (std::abs(d) > truncate_max) d = truncate_max;
if (std::abs(d) < truncate_min) d = truncate_min;
}
return half_float::half_cast<half>(positive ? std::abs(d) : d);
});
} else {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive, truncate,
truncate_min, truncate_max] {
float d = nd(gen);
if (truncate) {
if (std::abs(d) > truncate_max) d = truncate_max;
if (std::abs(d) < truncate_min) d = truncate_min;
}
return (positive ? std::abs(d) : d);
});
...
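The two new trailing parameters of AddRandomInput default to the bounds that were previously hard-coded (0.001f and 100.f), so existing call sites keep their behavior; the quantization tests added below override them so the random values are clamped to a small magnitude, e.g.:

net.AddRandomInput<CPU, float>("Input", input_shape, false, true, -1.f, 1.f);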
@@ -24,6 +24,12 @@ void Register_ResizeBilinear(OperatorRegistryBase *op_registry) {
.Build(),
ResizeBilinearOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
ResizeBilinearOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::GPU)
...
@@ -36,8 +36,15 @@ void ResizeBilinearBenchmark(int iters,
// Add input data
if (D == DeviceType::CPU) {
if (DataTypeToEnum<T>::value != DT_UINT8) {
net.AddRandomInput<D, float>("Input",
{batch, channels, input_height,
input_width});
} else {
net.AddRandomInput<D, uint8_t>("Input",
{batch, input_height, input_width,
channels});
}
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input",
{batch, input_height, input_width, channels});
@@ -99,6 +106,7 @@ void ResizeBilinearBenchmark(int iters,
#define MACE_BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, uint8_t, CPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
...
@@ -140,12 +140,89 @@ void TestRandomResizeBilinear() {
1e-6);
}
}
void TestQuantizedResizeBilinear() {
testing::internal::LogToStderr();
static unsigned int seed = time(NULL);
for (int round = 0; round < 10; ++round) {
int batch = 1 + rand_r(&seed) % 5;
int channels = 1 + rand_r(&seed) % 100;
int height = 1 + rand_r(&seed) % 100;
int width = 1 + rand_r(&seed) % 100;
int in_height = 1 + rand_r(&seed) % 100;
int in_width = 1 + rand_r(&seed) % 100;
int align_corners = rand_r(&seed) % 1;
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<CPU, float>("Input",
{batch, in_height, in_width, channels},
false,
true,
-1.f,
1.f);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("Output"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
}
} // namespace
TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::GPU>();
}
TEST_F(ResizeBilinearTest, QuantizedResizeBilinear) {
TestQuantizedResizeBilinear();
}
} // namespace test
} // namespace ops
} // namespace mace
@@ -23,6 +23,11 @@ void Register_SpaceToBatchND(OperatorRegistryBase *op_registry) {
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::GPU)
...
@@ -28,7 +28,13 @@ void BMSpaceToBatch(
OpsTestNet net;
if (D == DeviceType::CPU) {
if (DataTypeToEnum<T>::value != DT_UINT8) {
net.AddRandomInput<D, float>(
"Input", {batch, channels, height, width});
} else {
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Input", {batch, height, width, channels});
}
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
@@ -39,6 +45,7 @@ void BMSpaceToBatch(
.Output("Output")
.AddIntsArg("paddings", {shape, shape, shape, shape})
.AddIntsArg("block_shape", {shape, shape})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
@@ -78,7 +85,8 @@ void BMSpaceToBatch(
#define MACE_BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU); \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, uint8_t, CPU);
MACE_BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
...
@@ -147,62 +147,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
RunBatchToSpace<DeviceType::CPU>(batch_shape, batch_data, block_data,
padding_data, space_tensor.get());
}
} // namespace
TEST(SpaceToBatchTest, SmallData) {
TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2},
{0, 0, 0, 0}, {4, 1, 1, 1}, {1, 2, 3, 4});
}
TEST(SpaceToBatchTest, SmallDataWithOnePadding) {
TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {3, 3},
{1, 0, 1, 0}, {9, 1, 1, 1},
{0, 0, 0, 0, 1, 2, 0, 3, 4});
}
TEST(SpaceToBatchTest, SmallDataWithTwoPadding) {
TestBidirectionalTransform<float>(
{1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2}, {1, 1, 1, 1}, {4, 2, 2, 1},
{0, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 0, 0});
}
TEST(SpaceToBatchTest, SmallDataWithLargeImage) {
TestBidirectionalTransform<float>(
{1, 2, 10, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
{2, 2}, {0, 0, 0, 0}, {4, 1, 5, 1},
{1, 3, 5, 7, 9, 2, 4, 6, 8, 10, 11, 13, 15, 17, 19, 12, 14, 16, 18, 20});
}
TEST(SpaceToBatchTest, MultiChannelData) {
TestBidirectionalTransform<float>(
{1, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {2, 2},
{0, 0, 0, 0}, {4, 1, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
}
TEST(SpaceToBatchTest, LargerMultiChannelData) {
TestBidirectionalTransform<float>(
{1, 4, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{2, 2}, {0, 0, 0, 0}, {4, 2, 2, 1},
{1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
}
TEST(SpaceToBatchTest, MultiBatchData) {
TestBidirectionalTransform<float>(
{2, 2, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 1},
{1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
}
TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
TestBidirectionalTransform<float>(
{2, 2, 4, 2},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 2},
{1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24,
9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32});
}
void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
@@ -278,6 +223,173 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
*net.GetOutput("OutputGPU"));
}
void TestSpaceToBatchQuantize(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
const std::vector<int> &padding_data) {
OpsTestNet net;
net.AddRandomInput<CPU, float>("Input", input_shape, false, true, -1.f, 1.f);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
void TestoBatchToSpaceQuantize(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
const std::vector<int> &crops_data) {
OpsTestNet net;
net.AddRandomInput<CPU, float>("Input", input_shape, false, true, -1.f, 1.f);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
} // namespace
TEST(SpaceToBatchTest, SmallData) {
TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2},
{0, 0, 0, 0}, {4, 1, 1, 1}, {1, 2, 3, 4});
}
TEST(SpaceToBatchTest, SmallDataWithOnePadding) {
TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {3, 3},
{1, 0, 1, 0}, {9, 1, 1, 1},
{0, 0, 0, 0, 1, 2, 0, 3, 4});
}
TEST(SpaceToBatchTest, SmallDataWithTwoPadding) {
TestBidirectionalTransform<float>(
{1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2}, {1, 1, 1, 1}, {4, 2, 2, 1},
{0, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 0, 0});
}
TEST(SpaceToBatchTest, SmallDataWithLargeImage) {
TestBidirectionalTransform<float>(
{1, 2, 10, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
{2, 2}, {0, 0, 0, 0}, {4, 1, 5, 1},
{1, 3, 5, 7, 9, 2, 4, 6, 8, 10, 11, 13, 15, 17, 19, 12, 14, 16, 18, 20});
}
TEST(SpaceToBatchTest, MultiChannelData) {
TestBidirectionalTransform<float>(
{1, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {2, 2},
{0, 0, 0, 0}, {4, 1, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
}
TEST(SpaceToBatchTest, LargerMultiChannelData) {
TestBidirectionalTransform<float>(
{1, 4, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{2, 2}, {0, 0, 0, 0}, {4, 2, 2, 1},
{1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
}
TEST(SpaceToBatchTest, MultiBatchData) {
TestBidirectionalTransform<float>(
{2, 2, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 1},
{1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
}
TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
TestBidirectionalTransform<float>(
{2, 2, 4, 2},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 2},
{1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24,
9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32});
}
TEST(SpaceToBatchTest, LargeData) {
TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
@@ -285,6 +397,13 @@ TEST(SpaceToBatchTest, LargeData) {
TestoBatchToSpaceLargeInput({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
}
TEST(SpaceToBatchTest, Quantize) {
TestSpaceToBatchQuantize({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
TestSpaceToBatchQuantize({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
TestoBatchToSpaceQuantize({64, 32, 32, 32}, {8, 8}, {0, 0, 0, 0});
TestoBatchToSpaceQuantize({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
}
} // namespace test
} // namespace ops
} // namespace mace