Commit a5c9db3a authored by 李寅

Implement quantized resize_bilinear, batch_to_space (b2s), and space_to_batch (s2b)

Parent df23f428
......@@ -35,12 +35,12 @@ struct BatchToSpaceFunctorBase : OpKernel {
BatchToSpaceFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: OpKernel(context),
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()) {
MACE_CHECK(
block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
"Block's shape should be 1D, and greater than 1");
MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
}
......@@ -94,11 +94,11 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
......@@ -107,8 +107,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
output_shape.data());
MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
    Tensor::MappingGuard input_guard(batch_tensor);
    Tensor::MappingGuard output_guard(space_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
......@@ -129,8 +129,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
// 32k/sizeof(float)/out_width/block_shape
index_t
block_h_size =
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
    // make channel the outer loop so we can make best use of the cache
#pragma omp parallel for collapse(3)
......@@ -144,34 +144,34 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(block_h,
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(in_height,
std::min(
block_h + block_h_size,
(out_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h));
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(in_width,
(out_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
output_data + (b * channels + c) * out_height * out_width;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
output_base[h * out_width + w] =
input_base[in_h * in_width + in_w];
w += block_shape_w;
} // w
h += block_shape_h;
......@@ -184,6 +184,93 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
}
};
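// Quantized (uint8) batch-to-space. It works on NHWC data, so each spatial
// position is moved with a single memcpy across all channels, and the output
// tensor inherits the input's quantization scale and zero point.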
template<>
struct BatchToSpaceFunctor<DeviceType::CPU, uint8_t>
    : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NHWC,
output_shape.data());
MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(batch_tensor);
Tensor::MappingGuard output_guard(space_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
space_tensor->SetScale(batch_tensor->scale());
space_tensor->SetZeroPoint(batch_tensor->zero_point());
const uint8_t *input_data = batch_tensor->data<uint8_t>();
uint8_t *output_data = space_tensor->mutable_data<uint8_t>();
index_t in_batches = batch_tensor->dim(0);
index_t in_height = batch_tensor->dim(1);
index_t in_width = batch_tensor->dim(2);
index_t out_batches = space_tensor->dim(0);
index_t out_height = space_tensor->dim(1);
index_t out_width = space_tensor->dim(2);
index_t channels = space_tensor->dim(3);
#pragma omp parallel for
for (index_t in_b = 0; in_b < in_batches; ++in_b) {
const index_t b = in_b % out_batches;
const index_t tile_index = in_b / out_batches;
const index_t tile_h = tile_index / block_shape_w;
const index_t tile_w = tile_index % block_shape_w;
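      // Only positions whose space coordinate h = in_h * block_shape_h + tile_h
      // - pad_top (and the analogous w) falls inside the output are copied;
      // the rest was padding added by space-to-batch and is skipped here.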
const index_t valid_h_start = std::max(static_cast<index_t>(0),
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(in_height,
(out_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(in_width,
(out_width + pad_left
- tile_w
+ block_shape_w - 1)
/ block_shape_w);
const uint8_t *input_base =
input_data + in_b * in_height * in_width * channels;
      uint8_t *output_base =
          output_data + b * out_height * out_width * channels;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
memcpy(output_base + (h * out_width + w) * channels,
input_base + (in_h * in_width + in_w) * channels,
channels * sizeof(uint8_t));
w += block_shape_w;
} // w
h += block_shape_h;
} // h
} // b
return MACE_SUCCESS;
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct BatchToSpaceFunctor<DeviceType::GPU, T> : BatchToSpaceFunctorBase {
......
......@@ -61,50 +61,72 @@ inline void ComputeInterpolationWeights(
}
}
template <typename T>
inline T ComputeLerp(const T top_left,
const T top_right,
const T bottom_left,
const T bottom_right,
const float x_lerp,
const float y_lerp);
template <>
inline float ComputeLerp<float>(const float top_left,
const float top_right,
const float bottom_left,
const float bottom_right,
const float x_lerp,
const float y_lerp) {
const float top = top_left + (top_right - top_left) * x_lerp;
const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
return top + (bottom - top) * y_lerp;
}
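// For uint8 the interpolation is done in float and rounded back; because the
// lerp weights sum to 1, the quantization scale and zero point are preserved.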
template <>
inline uint8_t ComputeLerp<uint8_t>(const uint8_t top_left,
const uint8_t top_right,
const uint8_t bottom_left,
const uint8_t bottom_right,
const float x_lerp,
const float y_lerp) {
const float top = top_left + (top_right - top_left) * x_lerp;
const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
return static_cast<uint8_t>(roundf(top + (bottom - top) * y_lerp));
}
template <typename T>
inline void ResizeImageNCHW(const T *images,
const index_t batch_size,
const index_t in_height,
const index_t in_width,
const index_t out_height,
const index_t out_width,
const index_t channels,
const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys,
T *output) {
const CachedInterpolation *xs = xs_vec.data();
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch_size; ++b) {
for (index_t c = 0; c < channels; ++c) {
      const T *channel_input_ptr =
          images + (b * channels + c) * in_height * in_width;
      T *channel_output_ptr =
          output + (b * channels + c) * out_height * out_width;
for (index_t y = 0; y < out_height; ++y) {
const T *y_lower_input_ptr =
channel_input_ptr + ys[y].lower * in_width;
const T *y_upper_input_ptr =
channel_input_ptr + ys[y].upper * in_width;
const float ys_lerp = ys[y].lerp;
for (index_t x = 0; x < out_width; ++x) {
const float xs_lerp = xs[x].lerp;
const T top_left = y_lower_input_ptr[xs[x].lower];
const T top_right = y_lower_input_ptr[xs[x].upper];
const T bottom_left = y_upper_input_ptr[xs[x].lower];
const T bottom_right = y_upper_input_ptr[xs[x].upper];
channel_output_ptr[y * out_width + x] =
ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
......@@ -114,6 +136,48 @@ inline void ResizeImage(const float *images,
}
}
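// NHWC variant used by the quantized path: channels are contiguous per pixel,
// so the innermost loop interpolates every channel of one output pixel before
// moving on, keeping the uint8 reads and writes sequential.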
template <typename T>
inline void ResizeImageNHWC(const T *images,
const index_t batch_size,
const index_t in_height,
const index_t in_width,
const index_t out_height,
const index_t out_width,
const index_t channels,
const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys,
T *output) {
const CachedInterpolation *xs = xs_vec.data();
for (index_t b = 0; b < batch_size; ++b) {
const T *input_base = images + b * channels * in_height * in_width;
T *output_base = output + b * channels * out_height * out_width;
#pragma omp parallel for
for (index_t y = 0; y < out_height; ++y) {
const T
*y_lower_input_ptr = input_base + ys[y].lower * in_width * channels;
const T
*y_upper_input_ptr = input_base + ys[y].upper * in_width * channels;
const float ys_lerp = ys[y].lerp;
for (index_t x = 0; x < out_width; ++x) {
const float xs_lerp = xs[x].lerp;
const T *top_left = y_lower_input_ptr + xs[x].lower * channels;
const T *top_right = y_lower_input_ptr + xs[x].upper * channels;
const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels;
const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels;
T *output_ptr = output_base + (y * out_width + x) * channels;
for (index_t c = 0; c < channels; ++c) {
output_ptr[c] =
ComputeLerp(top_left[c], top_right[c], bottom_left[c],
bottom_right[c], xs_lerp, ys_lerp);
}
}
}
}
}
struct ResizeBilinearFunctorBase : OpKernel {
ResizeBilinearFunctorBase(OpKernelContext *context,
const std::vector<index_t> &size,
......@@ -132,11 +196,7 @@ struct ResizeBilinearFunctorBase : OpKernel {
};
template<DeviceType D, typename T>
struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners)
......@@ -159,8 +219,8 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
if (out_height == in_height && out_width == in_width) {
std::copy(input_data,
......@@ -181,8 +241,77 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data());
ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data());
ResizeImageNCHW(input_data,
batch,
in_height,
in_width,
out_height,
out_width,
channels,
xs,
ys,
output_data);
return MACE_SUCCESS;
}
};
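// Quantized resize operates directly on NHWC uint8 tensors, reusing the cached
// interpolation weights with the NHWC kernel above; no NCHW transform is
// involved.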
template<DeviceType D>
struct ResizeBilinearFunctor<D, uint8_t> : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners)
: ResizeBilinearFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
index_t out_height = out_height_;
index_t out_width = out_width_;
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, out_height, out_width, channels};
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output);
const uint8_t *input_data = input->data<uint8_t>();
uint8_t *output_data = output->mutable_data<uint8_t>();
if (out_height == in_height && out_width == in_width) {
std::copy(input_data,
                input_data + batch * in_height * in_width * channels,
output_data);
return MACE_SUCCESS;
}
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
std::vector<CachedInterpolation> ys(out_height + 1);
std::vector<CachedInterpolation> xs(out_width + 1);
// Compute the cached interpolation weights on the x and y dimensions.
ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data());
ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data());
ResizeImageNHWC(input_data,
batch,
in_height,
in_width,
out_height,
out_width,
channels,
xs,
ys,
output_data);
return MACE_SUCCESS;
}
......
......@@ -34,12 +34,12 @@ struct SpaceToBatchFunctorBase : OpKernel {
SpaceToBatchFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: OpKernel(context),
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()) {
MACE_CHECK(
block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
"Block's shape should be 1D, and greater than 1");
MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
}
......@@ -100,11 +100,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: SpaceToBatchFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
......@@ -135,7 +135,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
index_t out_width = batch_tensor->dim(3);
index_t block_h_size =
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
    // make channel the outer loop so we can make best use of the cache
#pragma omp parallel for collapse(3)
......@@ -149,27 +149,27 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(block_h,
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(out_height,
std::min(
block_h + block_h_size,
(in_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h));
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(out_width,
(in_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
output_data + (b * channels + c) * out_height * out_width;
memset(output_base + block_h * out_width,
0,
......@@ -184,7 +184,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t w = valid_w_start; w < valid_w_end; ++w) {
output_base[h * out_width + w] =
input_base[in_h * in_width + in_w];
in_w += block_shape_w;
} // w
in_h += block_shape_h;
......@@ -197,7 +197,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
memset(output_base + valid_h_end * out_width,
0,
(std::min(out_height, block_h + block_h_size) - valid_h_end)
* out_width * sizeof(float));
} // b
} // block_h
} // c
......@@ -205,6 +205,111 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
}
};
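// Quantized (uint8) space-to-batch on NHWC data. Padding regions are filled
// with the input's zero_point (the quantized encoding of 0.f) rather than raw
// zero, and the quantization scale/zero point are propagated to the output.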
template<>
struct SpaceToBatchFunctor<DeviceType::CPU, uint8_t> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: SpaceToBatchFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
CalculateSpaceToBatchOutputShape(space_tensor,
DataFormat::NHWC,
output_shape.data());
MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
int zero_point = space_tensor->zero_point();
Tensor::MappingGuard input_guard(space_tensor);
Tensor::MappingGuard output_guard(batch_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
batch_tensor->SetScale(space_tensor->scale());
batch_tensor->SetZeroPoint(space_tensor->zero_point());
const uint8_t *input_data = space_tensor->data<uint8_t>();
uint8_t *output_data = batch_tensor->mutable_data<uint8_t>();
index_t in_batches = space_tensor->dim(0);
index_t in_height = space_tensor->dim(1);
index_t in_width = space_tensor->dim(2);
index_t out_batches = batch_tensor->dim(0);
index_t out_height = batch_tensor->dim(1);
index_t out_width = batch_tensor->dim(2);
index_t channels = batch_tensor->dim(3);
#pragma omp parallel for
for (index_t b = 0; b < out_batches; ++b) {
const index_t in_b = b % in_batches;
const index_t tile_index = b / in_batches;
const index_t tile_h = tile_index / block_shape_w;
const index_t tile_w = tile_index % block_shape_w;
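      // Output positions whose source coordinate in_h = h * block_shape_h + tile_h
      // - pad_top (and the analogous in_w) lies outside the input are padding
      // and are memset to zero_point below.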
const index_t valid_h_start = std::max(static_cast<index_t>(0),
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(out_height,
(in_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(out_width,
(in_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const uint8_t *input_base =
input_data + in_b * channels * in_height * in_width;
uint8_t *output_base =
output_data + b * channels * out_height * out_width;
memset(output_base,
zero_point,
valid_h_start * out_width * channels * sizeof(uint8_t));
index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t h = valid_h_start; h < valid_h_end; ++h) {
memset(output_base + h * out_width * channels,
zero_point,
valid_w_start * channels * sizeof(uint8_t));
        index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t w = valid_w_start; w < valid_w_end; ++w) {
memcpy(output_base + (h * out_width + w) * channels,
input_base + (in_h * in_width + in_w) * channels,
sizeof(uint8_t) * channels);
in_w += block_shape_w;
} // w
in_h += block_shape_h;
memset(output_base + (h * out_width + valid_w_end) * channels,
zero_point,
(out_width - valid_w_end) * channels * sizeof(uint8_t));
} // h
memset(output_base + valid_h_end * out_width * channels,
zero_point,
(out_height - valid_h_end) * out_width * channels
* sizeof(uint8_t));
} // b
return MACE_SUCCESS;
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
......
......@@ -23,6 +23,11 @@ void Register_BatchToSpaceND(OperatorRegistryBase *op_registry) {
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::GPU)
......
......@@ -28,6 +28,11 @@ void Register_Concat(OperatorRegistryBase *op_registry) {
.TypeConstraint<int32_t>("T")
.Build(),
ConcatOp<DeviceType::CPU, int32_t>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
ConcatOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::GPU)
......
......@@ -162,11 +162,13 @@ class OpsTestNet {
std::fill(input_data, input_data + input->size(), data);
}
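  // truncate_min/truncate_max bound |value| when truncate is set, so quantized
  // tests can keep random inputs inside a known range such as [-1, 1].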
template<DeviceType D, typename T>
void AddRandomInput(const std::string &name,
const std::vector<index_t> &shape,
bool positive = true,
bool truncate = false,
const float truncate_min = 0.001f,
const float truncate_max = 100.f) {
Tensor *input =
ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
......@@ -180,21 +182,22 @@ class OpsTestNet {
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(
input_data, input_data + input->size(),
[&gen, &nd, positive, truncate, truncate_min, truncate_max] {
float d = nd(gen);
if (truncate) {
if (std::abs(d) > truncate_max) d = truncate_max;
if (std::abs(d) < truncate_min) d = truncate_min;
}
return half_float::half_cast<half>(positive ? std::abs(d) : d);
});
} else {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive, truncate,
truncate_min, truncate_max] {
float d = nd(gen);
if (truncate) {
if (std::abs(d) > truncate_max) d = truncate_max;
if (std::abs(d) < truncate_min) d = truncate_min;
}
return (positive ? std::abs(d) : d);
});
......
......@@ -24,6 +24,12 @@ void Register_ResizeBilinear(OperatorRegistryBase *op_registry) {
.Build(),
ResizeBilinearOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
ResizeBilinearOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::GPU)
......
......@@ -36,8 +36,15 @@ void ResizeBilinearBenchmark(int iters,
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input",
{batch, channels, input_height, input_width});
if (DataTypeToEnum<T>::value != DT_UINT8) {
net.AddRandomInput<D, float>("Input",
{batch, channels, input_height,
input_width});
} else {
net.AddRandomInput<D, uint8_t>("Input",
{batch, input_height, input_width,
channels});
}
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input",
{batch, input_height, input_width, channels});
......@@ -99,6 +106,7 @@ void ResizeBilinearBenchmark(int iters,
#define MACE_BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, uint8_t, CPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
......
......@@ -140,12 +140,89 @@ void TestRandomResizeBilinear() {
1e-6);
}
}
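// Quantized test: run the float ResizeBilinear as the reference, then
// Quantize -> uint8 ResizeBilinear -> Dequantize, and require the dequantized
// result to be similar to the reference within a 0.01 tolerance.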
void TestQuantizedResizeBilinear() {
testing::internal::LogToStderr();
static unsigned int seed = time(NULL);
for (int round = 0; round < 10; ++round) {
int batch = 1 + rand_r(&seed) % 5;
int channels = 1 + rand_r(&seed) % 100;
int height = 1 + rand_r(&seed) % 100;
int width = 1 + rand_r(&seed) % 100;
int in_height = 1 + rand_r(&seed) % 100;
int in_width = 1 + rand_r(&seed) % 100;
    int align_corners = rand_r(&seed) % 2;
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<CPU, float>("Input",
{batch, in_height, in_width, channels},
false,
true,
-1.f,
1.f);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("Output"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
}
} // namespace
TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::GPU>();
}
TEST_F(ResizeBilinearTest, QuantizedResizeBilinear) {
TestQuantizedResizeBilinear();
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -23,6 +23,11 @@ void Register_SpaceToBatchND(OperatorRegistryBase *op_registry) {
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::GPU)
......
......@@ -28,7 +28,13 @@ void BMSpaceToBatch(
OpsTestNet net;
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
if (DataTypeToEnum<T>::value != DT_UINT8) {
net.AddRandomInput<D, float>(
"Input", {batch, channels, height, width});
} else {
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Input", {batch, height, width, channels});
}
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
......@@ -39,6 +45,7 @@ void BMSpaceToBatch(
.Output("Output")
.AddIntsArg("paddings", {shape, shape, shape, shape})
.AddIntsArg("block_shape", {shape, shape})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
......@@ -78,7 +85,8 @@ void BMSpaceToBatch(
#define MACE_BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU); \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, uint8_t, CPU);
MACE_BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
......
......@@ -147,62 +147,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
RunBatchToSpace<DeviceType::CPU>(batch_shape, batch_data, block_data,
padding_data, space_tensor.get());
}
void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
......@@ -278,6 +223,173 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
*net.GetOutput("OutputGPU"));
}
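// The quantized space-to-batch / batch-to-space tests follow the same pattern:
// float CPU op as the reference, then Quantize -> uint8 op -> Dequantize, and
// compare against the reference with a 0.01 similarity tolerance.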
void TestSpaceToBatchQuantize(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
const std::vector<int> &padding_data) {
OpsTestNet net;
net.AddRandomInput<CPU, float>("Input", input_shape, false, true, -1.f, 1.f);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
void TestoBatchToSpaceQuantize(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
const std::vector<int> &crops_data) {
OpsTestNet net;
net.AddRandomInput<CPU, float>("Input", input_shape, false, true, -1.f, 1.f);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
} // namespace
TEST(SpaceToBatchTest, SmallData) {
TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2},
{0, 0, 0, 0}, {4, 1, 1, 1}, {1, 2, 3, 4});
}
TEST(SpaceToBatchTest, SmallDataWithOnePadding) {
TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {3, 3},
{1, 0, 1, 0}, {9, 1, 1, 1},
{0, 0, 0, 0, 1, 2, 0, 3, 4});
}
TEST(SpaceToBatchTest, SmallDataWithTwoPadding) {
TestBidirectionalTransform<float>(
{1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2}, {1, 1, 1, 1}, {4, 2, 2, 1},
{0, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 0, 0});
}
TEST(SpaceToBatchTest, SmallDataWithLargeImage) {
TestBidirectionalTransform<float>(
{1, 2, 10, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
{2, 2}, {0, 0, 0, 0}, {4, 1, 5, 1},
{1, 3, 5, 7, 9, 2, 4, 6, 8, 10, 11, 13, 15, 17, 19, 12, 14, 16, 18, 20});
}
TEST(SpaceToBatchTest, MultiChannelData) {
TestBidirectionalTransform<float>(
{1, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {2, 2},
{0, 0, 0, 0}, {4, 1, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
}
TEST(SpaceToBatchTest, LargerMultiChannelData) {
TestBidirectionalTransform<float>(
{1, 4, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{2, 2}, {0, 0, 0, 0}, {4, 2, 2, 1},
{1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
}
TEST(SpaceToBatchTest, MultiBatchData) {
TestBidirectionalTransform<float>(
{2, 2, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 1},
{1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
}
TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
TestBidirectionalTransform<float>(
{2, 2, 4, 2},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 2},
{1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24,
9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32});
}
TEST(SpaceToBatchTest, LargeData) {
TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
......@@ -285,6 +397,13 @@ TEST(SpaceToBatchTest, LargeData) {
TestoBatchToSpaceLargeInput({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
}
TEST(SpaceToBatchTest, Quantize) {
TestSpaceToBatchQuantize({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
TestSpaceToBatchQuantize({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
TestoBatchToSpaceQuantize({64, 32, 32, 32}, {8, 8}, {0, 0, 0, 0});
TestoBatchToSpaceQuantize({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
}
} // namespace test
} // namespace ops
} // namespace mace