Commit 98c8dab1 authored by 刘琦

Merge branch 'remove-d2s-b2s-b2i' into 'master'

remove d2s b2s for depth_to_space and batch_to_space

See merge request !774
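A quick sketch of what this change means for callers (illustrative only; `context`, `block_size`, `crops`, `paddings` and `block_shape` stand for values a call site already has, as in the ops further below):

// Before this commit, one functor served both directions and took a bool flag:
//   kernels::DepthToSpaceOpFunctor<D, T> f(context, block_size, /*d2s=*/true);
//   kernels::SpaceToBatchFunctor<D, T>   g(context, paddings, block_shape, /*b2s=*/true);
// After this commit, each direction has its own functor and the flag is removed:
kernels::DepthToSpaceOpFunctor<DeviceType::CPU, float> d2s(context, block_size);
kernels::SpaceToDepthOpFunctor<DeviceType::CPU, float> s2d(context, block_size);
kernels::BatchToSpaceFunctor<DeviceType::CPU, float> b2s(context, crops, block_shape);
kernels::SpaceToBatchFunctor<DeviceType::CPU, float> s2b(context, paddings, block_shape);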
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_BATCH_TO_SPACE_H_
#define MACE_KERNELS_BATCH_TO_SPACE_H_
#include <memory>
#include <vector>
#include <algorithm>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
struct BatchToSpaceFunctorBase : OpKernel {
BatchToSpaceFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: OpKernel(context),
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()) {
MACE_CHECK(
block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
"Block's shape should be 1D, and greater than 1");
MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
}
std::vector<int> paddings_;
std::vector<int> block_shape_;
protected:
void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor,
const DataFormat data_format,
index_t *output_shape) {
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
index_t batch = input_tensor->dim(0);
index_t channels = 0;
index_t height = 0;
index_t width = 0;
if (data_format == DataFormat::NHWC) {
height = input_tensor->dim(1);
width = input_tensor->dim(2);
channels = input_tensor->dim(3);
} else if (data_format == DataFormat::NCHW) {
height = input_tensor->dim(2);
width = input_tensor->dim(3);
channels = input_tensor->dim(1);
} else {
MACE_NOT_IMPLEMENTED;
}
index_t new_batch = batch / block_shape_[0] / block_shape_[1];
index_t new_height = height * block_shape_[0] - paddings_[0] - paddings_[1];
index_t new_width = width * block_shape_[1] - paddings_[2] - paddings_[3];
if (data_format == DataFormat::NHWC) {
output_shape[0] = new_batch;
output_shape[1] = new_height;
output_shape[2] = new_width;
output_shape[3] = channels;
} else {
output_shape[0] = new_batch;
output_shape[1] = channels;
output_shape[2] = new_height;
output_shape[3] = new_width;
}
}
};
template<DeviceType D, typename T>
struct BatchToSpaceFunctor;
template<>
struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NCHW,
output_shape.data());
MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(space_tensor);
Tensor::MappingGuard output_guard(batch_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
const float *input_data = batch_tensor->data<float>();
float *output_data = space_tensor->mutable_data<float>();
index_t in_batches = batch_tensor->dim(0);
index_t in_height = batch_tensor->dim(2);
index_t in_width = batch_tensor->dim(3);
index_t out_batches = space_tensor->dim(0);
index_t channels = space_tensor->dim(1);
index_t out_height = space_tensor->dim(2);
index_t out_width = space_tensor->dim(3);
// Tile the rows so one tile of output stays within roughly 32KB of cache:
// block_h_size = 32k / sizeof(float) / out_width / block_shape_w.
index_t block_h_size =
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
// make channel the outer loop so we can make the best use of the cache
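// For example (illustrative sizes, not taken from this file): with
// out_width = 256 and block_shape_w = 2, block_h_size = 8 * 1024 / 2 / 256
// = 16 rows per tile (8 * 1024 floats is 32KB).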
#pragma omp parallel for collapse(3)
for (index_t c = 0; c < channels; ++c) {
for (index_t block_h = 0; block_h < in_height;
block_h += block_h_size) {
for (index_t in_b = 0; in_b < in_batches; ++in_b) {
const index_t b = in_b % out_batches;
const index_t tile_index = in_b / out_batches;
const index_t tile_h = tile_index / block_shape_w;
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(block_h,
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(in_height,
std::min(
block_h + block_h_size,
(out_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h));
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(in_width,
(out_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
output_data + (b * channels + c) * out_height * out_width;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
output_base[h * out_width + w] =
input_base[in_h * in_width + in_w];
w += block_shape_w;
} // w
h += block_shape_h;
} // h
} // b
} // block_h
} // c
return MACE_SUCCESS;
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct BatchToSpaceFunctor<DeviceType::GPU, T> : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> space_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BATCH_TO_SPACE_H_
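For orientation, a minimal and hypothetical call site for the CPU functor declared above; the OpKernelContext, the NCHW batch_tensor input and the space_tensor output are assumed to be created elsewhere:

// Paddings (crops) are {height_begin, height_end, width_begin, width_end} and
// block_shape is {block_h, block_w}; both are validated in the constructor above.
kernels::BatchToSpaceFunctor<DeviceType::CPU, float> b2s(
    context, /*paddings=*/{0, 0, 0, 0}, /*block_shape=*/{2, 2});
// Reads batch_tensor (NCHW), resizes and fills space_tensor; future is unused on CPU.
MaceStatus status = b2s(space_tensor, batch_tensor, /*future=*/nullptr);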
@@ -32,9 +32,8 @@ namespace kernels {
 template<DeviceType D, typename T>
 struct DepthToSpaceOpFunctor : OpKernel {
   DepthToSpaceOpFunctor(OpKernelContext *context,
-                        const int block_size,
-                        bool d2s)
-      : OpKernel(context), block_size_(block_size), d2s_(d2s) {}
+                        const int block_size)
+      : OpKernel(context), block_size_(block_size) {}
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future) {
@@ -44,17 +43,13 @@ struct DepthToSpaceOpFunctor : OpKernel {
     const index_t input_height = input->dim(2);
     const index_t input_width = input->dim(3);
-    index_t output_depth, output_width, output_height;
-    if (d2s_) {
-      output_depth = input_depth / (block_size_ * block_size_);
-      output_width = input_width * block_size_;
-      output_height = input_height * block_size_;
-    } else {
-      output_depth = input_depth * block_size_ * block_size_;
-      output_width = input_width / block_size_;
-      output_height = input_height / block_size_;
-    }
+    MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
+               "input depth should be dividable by block_size * block_size",
+               input_depth);
+    const index_t output_depth = input_depth / (block_size_ * block_size_);
+    const index_t output_width = input_width * block_size_;
+    const index_t output_height = input_height * block_size_;
     std::vector<index_t> output_shape = {batch_size, output_depth,
                                          output_height, output_width};
@@ -65,78 +60,49 @@ struct DepthToSpaceOpFunctor : OpKernel {
     const T *input_ptr = input->data<T>();
     T *output_ptr = output->mutable_data<T>();
-    if (d2s_) {
 #pragma omp parallel for
     for (index_t b = 0; b < batch_size; ++b) {
       for (index_t d = 0; d < output_depth; ++d) {
         for (index_t h = 0; h < output_height; ++h) {
           const index_t in_h = h / block_size_;
           const index_t offset_h = (h % block_size_);
           for (int w = 0; w < output_width; ++w) {
             const index_t in_w = w / block_size_;
             const index_t offset_w = w % block_size_;
             const index_t offset_d =
                 (offset_h * block_size_ + offset_w) * output_depth;
             const index_t in_d = d + offset_d;
             const index_t o_index =
                 ((b * output_depth + d) * output_height + h) * output_width
                     + w;
             const index_t i_index =
                 ((b * input_depth + in_d) * input_height + in_h) * input_width
                     + in_w;
             output_ptr[o_index] = input_ptr[i_index];
           }
         }
       }
     }
-    } else {
-#pragma omp parallel for
-      for (index_t b = 0; b < batch_size; ++b) {
-        for (index_t d = 0; d < input_depth; ++d) {
-          for (index_t h = 0; h < input_height; ++h) {
-            const index_t out_h = h / block_size_;
-            const index_t offset_h = (h % block_size_);
-            for (index_t w = 0; w < input_width; ++w) {
-              const index_t out_w = w / block_size_;
-              const index_t offset_w = (w % block_size_);
-              const index_t offset_d =
-                  (offset_h * block_size_ + offset_w) * input_depth;
-              const index_t out_d = d + offset_d;
-              const index_t o_index =
-                  ((b * output_depth + out_d) * output_height + out_h)
-                      * output_width + out_w;
-              const index_t i_index =
-                  ((b * input_depth + d) * input_height + h) * input_width
-                      + w;
-              output_ptr[o_index] = input_ptr[i_index];
-            }
-          }
-        }
-      }
-    }
     return MACE_SUCCESS;
   }
   const int block_size_;
-  bool d2s_;
 };
 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
 struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel {
   DepthToSpaceOpFunctor(OpKernelContext *context,
-                        const int block_size,
-                        bool d2s)
-      : OpKernel(context), block_size_(block_size), d2s_(d2s) {}
+                        const int block_size)
+      : OpKernel(context), block_size_(block_size) {}
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future);
   const int block_size_;
-  bool d2s_;
   cl::Kernel kernel_;
   uint32_t kwg_size_;
   std::unique_ptr<BufferBase> kernel_error_;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#include "mace/kernels/batch_to_space.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template <typename T>
MaceStatus BatchToSpaceFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
output_shape.data());
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape_[0]);
kernel_.setArg(idx++, block_shape_[1]);
kernel_.setArg(idx++, paddings_[0]);
kernel_.setArg(idx++, paddings_[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
space_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
template struct BatchToSpaceFunctor<DeviceType::GPU, float>;
template struct BatchToSpaceFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#include <common.h>
__kernel void batch_to_space(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t batch_data,
__write_only image2d_t space_data,
__private const int block_height,
__private const int block_width,
__private const int padding_height,
__private const int padding_width,
__private const int batch_size,
__private const int space_height,
__private const int space_width,
__private const int batch_height,
__private const int batch_width) {
const int chan_idx = get_global_id(0);
const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|| batch_hb_idx >= global_size_dim2) {
return;
}
#endif
const int batch_b_idx = batch_hb_idx / batch_height;
const int batch_h_idx = batch_hb_idx % batch_height;
const int block_size = mul24(block_height, block_width);
const int space_b_idx = batch_b_idx % batch_size;
const int remaining_batch_idx = batch_b_idx / batch_size;
const int space_h_idx = (remaining_batch_idx / block_width) +
mul24(batch_h_idx, block_height) - padding_height;
const int space_w_idx = (remaining_batch_idx % block_width) +
mul24(batch_w_idx, block_width) - padding_width;
if (0 <= space_w_idx && space_w_idx < space_width &&
0 <= space_h_idx && space_h_idx < space_height) {
int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, batch_coord);
int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx,
space_b_idx * space_height + space_h_idx);
WRITE_IMAGET(space_data, space_coord, value);
}
}
@@ -39,43 +39,3 @@ __kernel void depth_to_space(KERNEL_ERROR_PARAMS
   WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
 }
-
-__kernel void space_to_depth(KERNEL_ERROR_PARAMS
-                             GLOBAL_WORK_GROUP_SIZE_DIM3
-                             __read_only image2d_t input,
-                             __private const int block_size,
-                             __private const int input_width,
-                             __private const int input_depth_blocks,
-                             __private const int output_hb,
-                             __private const int output_width,
-                             __private const int output_depth_blocks,
-                             __write_only image2d_t output) {
-  const int d = get_global_id(0);
-  const int w = get_global_id(1);
-  const int hb = get_global_id(2);
-
-#ifndef NON_UNIFORM_WORK_GROUP
-  if (d >= global_size_dim0 || w >= global_size_dim1
-      || hb >= global_size_dim2) {
-    return;
-  }
-#endif
-
-  const int in_pos = mad24(d, input_width, w);
-
-  const int out_hb = hb / block_size;
-  const int offset_h = hb % block_size;
-  const int out_w = w / block_size;
-  const int offset_w = w % block_size;
-  const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
-  const int out_d = d + offset_d;
-
-  if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) {
-    return;
-  }
-
-  const int out_pos = mad24(out_d, output_width, out_w);
-  DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb));
-  WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
-}
@@ -49,50 +49,3 @@ __kernel void space_to_batch(KERNEL_ERROR_PARAMS
   WRITE_IMAGET(batch_data, batch_coord, value);
 }
-
-__kernel void batch_to_space(KERNEL_ERROR_PARAMS
-                             GLOBAL_WORK_GROUP_SIZE_DIM3
-                             __read_only image2d_t batch_data,
-                             __write_only image2d_t space_data,
-                             __private const int block_height,
-                             __private const int block_width,
-                             __private const int padding_height,
-                             __private const int padding_width,
-                             __private const int batch_size,
-                             __private const int space_height,
-                             __private const int space_width,
-                             __private const int batch_height,
-                             __private const int batch_width) {
-  const int chan_idx = get_global_id(0);
-  const int batch_w_idx = get_global_id(1);
-  const int batch_hb_idx = get_global_id(2);
-
-#ifndef NON_UNIFORM_WORK_GROUP
-  if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
-      || batch_hb_idx >= global_size_dim2) {
-    return;
-  }
-#endif
-
-  const int batch_b_idx = batch_hb_idx / batch_height;
-  const int batch_h_idx = batch_hb_idx % batch_height;
-
-  const int block_size = mul24(block_height, block_width);
-  const int space_b_idx = batch_b_idx % batch_size;
-  const int remaining_batch_idx = batch_b_idx / batch_size;
-  const int space_h_idx = (remaining_batch_idx / block_width) +
-      mul24(batch_h_idx, block_height) - padding_height;
-  const int space_w_idx = (remaining_batch_idx % block_width) +
-      mul24(batch_w_idx, block_width) - padding_width;
-
-  if (0 <= space_w_idx && space_w_idx < space_width &&
-      0 <= space_h_idx && space_h_idx < space_height) {
-    int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
-    DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, batch_coord);
-    int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx,
-                              space_b_idx * space_height + space_h_idx);
-    WRITE_IMAGET(space_data, space_coord, value);
-  }
-}
#include <common.h>
__kernel void space_to_depth(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int block_size,
__private const int input_width,
__private const int input_depth_blocks,
__private const int output_hb,
__private const int output_width,
__private const int output_depth_blocks,
__write_only image2d_t output) {
const int d = get_global_id(0);
const int w = get_global_id(1);
const int hb = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
}
#endif
const int in_pos = mad24(d, input_width, w);
const int out_hb = hb / block_size;
const int offset_h = hb % block_size;
const int out_w = w / block_size;
const int offset_w = w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
const int out_d = d + offset_d;
if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) {
return;
}
const int out_pos = mad24(out_d, output_width, out_w);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb));
WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
}
@@ -30,54 +30,41 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
   const index_t input_width = input->dim(2);
   const index_t input_depth = input->dim(3);
-  const char *kernel_name = nullptr;
-  uint32_t gws[3];
-  std::string tuning_key;
-  index_t output_height, output_width, output_depth;
-  if (d2s_) {
-    output_height = input_height * block_size_;
-    output_width = input_width * block_size_;
-    output_depth = input_depth / (block_size_ * block_size_);
-    MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
-        << output_depth;
-    kernel_name = "depth_to_space";
-    gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
-    gws[1] = static_cast<uint32_t>(output_width);
-    gws[2] = static_cast<uint32_t>(output_height * batch);
-    tuning_key = Concat("depth_to_space_opencl_kernel", batch, output_height,
-                        output_width, output_depth);
-  } else {
-    output_height = input_height / block_size_;
-    output_width = input_width / block_size_;
-    output_depth = input_depth * block_size_ * block_size_;
-    MACE_CHECK(input_depth % 4 == 0, "input channel not support:")
-        << input_depth;
-    kernel_name = "space_to_depth";
-    gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
-    gws[1] = static_cast<uint32_t>(input_width);
-    gws[2] = static_cast<uint32_t>(input_height * batch);
-    tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
-                        input->dim(1), input->dim(2), input->dim(3));
-  }
+  MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
+             "input depth should be dividable by block_size * block_size",
+             input_depth);
+  MACE_CHECK((input_depth % 4) == 0,
+             "input channel should be dividable by 4");
+  const index_t output_height = input_height * block_size_;
+  const index_t output_width = input_width * block_size_;
+  const index_t output_depth = input_depth / (block_size_ * block_size_);
+  MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
+      << output_depth;
   const index_t input_depth_blocks = RoundUpDiv4(input_depth);
   const index_t output_depth_blocks = RoundUpDiv4(output_depth);
-  std::vector<index_t> output_shape = {batch, output_height, output_width,
+  std::vector<index_t> output_shape = {batch,
+                                       output_height,
+                                       output_width,
                                        output_depth};
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(RoundUpDiv4(output_depth)),
+      static_cast<uint32_t>(output_width),
+      static_cast<uint32_t>(output_height * batch)
+  };
   auto runtime = context_->device()->opencl_runtime();
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
+    const char *kernel_name = "depth_to_space";
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
     std::stringstream kernel_name_ss;
     kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
@@ -89,7 +76,6 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
                                               obfuscated_kernel_name,
                                               built_options,
                                               &kernel_));
-
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
@@ -99,26 +85,20 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     OUT_OF_RANGE_SET_ARG;
     SET_3D_GWS_ARGS(kernel_);
     kernel_.setArg(idx++, *(input->opencl_image()));
-    if (d2s_) {
-      kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
-      kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
-      kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-      kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
-      kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-      kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
-    } else {
-      kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
-      kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-      kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
-      kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
-      kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-      kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
-    }
+    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
     kernel_.setArg(idx++, *(output->opencl_image()));
     input_shape_ = input->shape();
   }
+  std::string tuning_key = Concat("depth_to_space_opencl_kernel",
+                                  batch, output_height,
+                                  output_width, output_depth);
   const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
   MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));
......
@@ -28,27 +28,14 @@ template <typename T>
 MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
   std::vector<index_t> output_shape(4, 0);
-  if (b2s_) {
-    CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
-                                     output_shape.data());
-  } else {
-    CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC,
-                                     output_shape.data());
-  }
+  CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC,
+                                   output_shape.data());
-  const char *kernel_name = nullptr;
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  if (b2s_) {
-    MACE_RETURN_IF_ERROR(
-        space_tensor->ResizeImage(output_shape, output_image_shape));
-    kernel_name = "batch_to_space";
-  } else {
-    MACE_RETURN_IF_ERROR(
-        batch_tensor->ResizeImage(output_shape, output_image_shape));
-    kernel_name = "space_to_batch";
-  }
+  MACE_RETURN_IF_ERROR(
+      batch_tensor->ResizeImage(output_shape, output_image_shape));
+  const char *kernel_name = "space_to_batch";
   const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
   const uint32_t gws[3] = {
       chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
@@ -79,13 +66,9 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     uint32_t idx = 0;
     OUT_OF_RANGE_SET_ARG;
     SET_3D_GWS_ARGS(kernel_);
-    if (b2s_) {
-      kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
-      kernel_.setArg(idx++, *(space_tensor->opencl_image()));
-    } else {
-      kernel_.setArg(idx++, *(space_tensor->opencl_image()));
-      kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
-    }
+    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
+    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
     kernel_.setArg(idx++, block_shape_[0]);
     kernel_.setArg(idx++, block_shape_[1]);
     kernel_.setArg(idx++, paddings_[0]);
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/space_to_depth.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template <typename T>
MaceStatus SpaceToDepthOpFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
MACE_CHECK((input_depth % 4) == 0,
"input channel should be dividable by 4");
MACE_CHECK(
(input_width % block_size_ == 0) && (input_height % block_size_ == 0),
"input width and height should be dividable by block_size");
const index_t output_height = input_height / block_size_;
const index_t output_width = input_width / block_size_;
const index_t output_depth = input_depth * block_size_ * block_size_;
const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
std::vector<index_t> output_shape = {batch, output_height, output_width,
output_depth};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "space_to_depth";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(input_depth_blocks),
static_cast<uint32_t>(input_width),
static_cast<uint32_t>(input_height * batch)};
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
input->dim(1), input->dim(2), input->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
template struct SpaceToDepthOpFunctor<DeviceType::GPU, float>;
template struct SpaceToDepthOpFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
@@ -33,12 +33,10 @@ namespace kernels {
 struct SpaceToBatchFunctorBase : OpKernel {
   SpaceToBatchFunctorBase(OpKernelContext *context,
                           const std::vector<int> &paddings,
-                          const std::vector<int> &block_shape,
-                          bool b2s)
+                          const std::vector<int> &block_shape)
       : OpKernel(context),
         paddings_(paddings.begin(), paddings.end()),
-        block_shape_(block_shape.begin(), block_shape.end()),
-        b2s_(b2s) {
+        block_shape_(block_shape.begin(), block_shape.end()) {
     MACE_CHECK(
         block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
         "Block's shape should be 1D, and greater than 1");
@@ -47,7 +45,6 @@ struct SpaceToBatchFunctorBase : OpKernel {
   std::vector<int> paddings_;
   std::vector<int> block_shape_;
-  bool b2s_;
 
  protected:
   void CalculateSpaceToBatchOutputShape(const Tensor *input_tensor,
@@ -93,43 +90,6 @@ struct SpaceToBatchFunctorBase : OpKernel {
       output_shape[3] = new_width;
     }
   }
-
-  void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor,
-                                        const DataFormat data_format,
-                                        index_t *output_shape) {
-    MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
-    index_t batch = input_tensor->dim(0);
-    index_t channels = 0;
-    index_t height = 0;
-    index_t width = 0;
-    if (data_format == DataFormat::NHWC) {
-      height = input_tensor->dim(1);
-      width = input_tensor->dim(2);
-      channels = input_tensor->dim(3);
-    } else if (data_format == DataFormat::NCHW) {
-      height = input_tensor->dim(2);
-      width = input_tensor->dim(3);
-      channels = input_tensor->dim(1);
-    } else {
-      MACE_NOT_IMPLEMENTED;
-    }
-    index_t new_batch = batch / block_shape_[0] / block_shape_[1];
-    index_t new_height = height * block_shape_[0] - paddings_[0] - paddings_[1];
-    index_t new_width = width * block_shape_[1] - paddings_[2] - paddings_[3];
-    if (data_format == DataFormat::NHWC) {
-      output_shape[0] = new_batch;
-      output_shape[1] = new_height;
-      output_shape[2] = new_width;
-      output_shape[3] = channels;
-    } else {
-      output_shape[0] = new_batch;
-      output_shape[1] = channels;
-      output_shape[2] = new_height;
-      output_shape[3] = new_width;
-    }
-  }
 };
 
 template<DeviceType D, typename T>
@@ -139,9 +99,8 @@ template<>
 struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
   SpaceToBatchFunctor(OpKernelContext *context,
                       const std::vector<int> &paddings,
-                      const std::vector<int> &block_shape,
-                      bool b2s)
-      : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {}
+                      const std::vector<int> &block_shape)
+      : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
 
   MaceStatus operator()(Tensor *space_tensor,
                         Tensor *batch_tensor,
@@ -149,17 +108,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
     MACE_UNUSED(future);
     std::vector<index_t> output_shape(4, 0);
-    if (b2s_) {
-      CalculateBatchToSpaceOutputShape(batch_tensor,
-                                       DataFormat::NCHW,
-                                       output_shape.data());
-      MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
-    } else {
-      CalculateSpaceToBatchOutputShape(space_tensor,
-                                       DataFormat::NCHW,
-                                       output_shape.data());
-      MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
-    }
+    CalculateSpaceToBatchOutputShape(space_tensor,
+                                     DataFormat::NCHW,
+                                     output_shape.data());
+    MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
 
     Tensor::MappingGuard input_guard(space_tensor);
     Tensor::MappingGuard output_guard(batch_tensor);
@@ -169,152 +122,85 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
     int block_shape_h = block_shape_[0];
     int block_shape_w = block_shape_[1];
-    if (b2s_) {
-      const float *input_data = batch_tensor->data<float>();
-      float *output_data = space_tensor->mutable_data<float>();
-      index_t in_batches = batch_tensor->dim(0);
-      index_t in_height = batch_tensor->dim(2);
-      index_t in_width = batch_tensor->dim(3);
-      index_t out_batches = space_tensor->dim(0);
-      index_t channels = space_tensor->dim(1);
-      index_t out_height = space_tensor->dim(2);
-      index_t out_width = space_tensor->dim(3);
-      // 32k/sizeof(float)/out_width/block_shape
-      index_t
-          block_h_size =
-          std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
-      // make channel outter loop so we can make best use of cache
-#pragma omp parallel for collapse(3)
-      for (index_t c = 0; c < channels; ++c) {
-        for (index_t block_h = 0; block_h < in_height;
-             block_h += block_h_size) {
-          for (index_t in_b = 0; in_b < in_batches; ++in_b) {
-            const index_t b = in_b % out_batches;
-            const index_t tile_index = in_b / out_batches;
-            const index_t tile_h = tile_index / block_shape_w;
-            const index_t tile_w = tile_index % block_shape_w;
-            const index_t valid_h_start = std::max(block_h,
-                                                   (pad_top - tile_h
-                                                       + block_shape_h - 1)
-                                                       / block_shape_h);
-            const index_t valid_h_end = std::min(in_height,
-                                                 std::min(
-                                                     block_h + block_h_size,
-                                                     (out_height + pad_top
-                                                         - tile_h
-                                                         + block_shape_h - 1)
-                                                         / block_shape_h));
-            const index_t valid_w_start = std::max(static_cast<index_t>(0),
-                                                   (pad_left - tile_w
-                                                       + block_shape_w - 1)
-                                                       / block_shape_w);
-            const index_t valid_w_end = std::min(in_width,
-                                                 (out_width + pad_left - tile_w
-                                                     + block_shape_w - 1)
-                                                     / block_shape_w);
-            const float *input_base =
-                input_data + (in_b * channels + c) * in_height * in_width;
-            float *output_base =
-                output_data + (b * channels + c) * out_height * out_width;
-            index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
-            for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
-              index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
-              for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
-                output_base[h * out_width + w] =
-                    input_base[in_h * in_width + in_w];
-                w += block_shape_w;
-              }  // w
-              h += block_shape_h;
-            }  // h
-          }  // b
-        }  // block_h
-      }  // c
-    } else {
     const float *input_data = space_tensor->data<float>();
     float *output_data = batch_tensor->mutable_data<float>();
 
     index_t in_batches = space_tensor->dim(0);
     index_t in_height = space_tensor->dim(2);
     index_t in_width = space_tensor->dim(3);
 
     index_t out_batches = batch_tensor->dim(0);
     index_t channels = batch_tensor->dim(1);
     index_t out_height = batch_tensor->dim(2);
     index_t out_width = batch_tensor->dim(3);
 
     index_t block_h_size =
         std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
 
     // make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
     for (index_t c = 0; c < channels; ++c) {
       for (index_t block_h = 0; block_h < out_height;
            block_h += block_h_size) {
         for (index_t b = 0; b < out_batches; ++b) {
           const index_t in_b = b % in_batches;
           const index_t tile_index = b / in_batches;
           const index_t tile_h = tile_index / block_shape_w;
           const index_t tile_w = tile_index % block_shape_w;
           const index_t valid_h_start = std::max(block_h,
                                                  (pad_top - tile_h
                                                      + block_shape_h - 1)
                                                      / block_shape_h);
           const index_t valid_h_end = std::min(out_height,
                                                std::min(
                                                    block_h + block_h_size,
                                                    (in_height + pad_top
                                                        - tile_h
                                                        + block_shape_h - 1)
                                                        / block_shape_h));
           const index_t valid_w_start = std::max(static_cast<index_t>(0),
                                                  (pad_left - tile_w
                                                      + block_shape_w - 1)
                                                      / block_shape_w);
           const index_t valid_w_end = std::min(out_width,
                                                (in_width + pad_left - tile_w
                                                    + block_shape_w - 1)
                                                    / block_shape_w);
           const float *input_base =
               input_data + (in_b * channels + c) * in_height * in_width;
           float *output_base =
               output_data + (b * channels + c) * out_height * out_width;
 
           memset(output_base + block_h * out_width,
                  0,
                  (valid_h_start - block_h) * out_width * sizeof(float));
 
           index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
           for (index_t h = valid_h_start; h < valid_h_end; ++h) {
             memset(output_base + h * out_width,
                    0,
                    valid_w_start * sizeof(float));
 
             index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
             for (index_t w = valid_w_start; w < valid_w_end; ++w) {
               output_base[h * out_width + w] =
                   input_base[in_h * in_width + in_w];
               in_w += block_shape_w;
             }  // w
             in_h += block_shape_h;
 
             memset(output_base + h * out_width + valid_w_end,
                    0,
                    (out_width - valid_w_end) * sizeof(float));
           }  // h
 
           memset(output_base + valid_h_end * out_width,
                  0,
                  (std::min(out_height, block_h + block_h_size) - valid_h_end)
                      * out_width * sizeof(float));
         }  // b
       }  // block_h
     }  // c
-    }
 
     return MACE_SUCCESS;
   }
 };
@@ -324,9 +210,8 @@ template <typename T>
 struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
   SpaceToBatchFunctor(OpKernelContext *context,
                       const std::vector<int> &paddings,
-                      const std::vector<int> &block_shape,
-                      bool b2s)
-      : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {}
+                      const std::vector<int> &block_shape)
+      : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
 
   MaceStatus operator()(Tensor *space_tensor,
                         Tensor *batch_tensor,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_SPACE_TO_DEPTH_H_
#define MACE_KERNELS_SPACE_TO_DEPTH_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template<DeviceType D, typename T>
struct SpaceToDepthOpFunctor : OpKernel {
SpaceToDepthOpFunctor(OpKernelContext *context,
const int block_size)
: OpKernel(context), block_size_(block_size) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch_size = input->dim(0);
const index_t input_depth = input->dim(1);
const index_t input_height = input->dim(2);
const index_t input_width = input->dim(3);
MACE_CHECK(
(input_width % block_size_ == 0) && (input_height % block_size_ == 0),
"input width and height should be dividable by block_size");
const index_t output_depth = input_depth * block_size_ * block_size_;
const index_t output_width = input_width / block_size_;
const index_t output_height = input_height / block_size_;
std::vector<index_t> output_shape = {batch_size, output_depth,
output_height, output_width};
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
#pragma omp parallel for
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < input_depth; ++d) {
for (index_t h = 0; h < input_height; ++h) {
const index_t out_h = h / block_size_;
const index_t offset_h = (h % block_size_);
for (index_t w = 0; w < input_width; ++w) {
const index_t out_w = w / block_size_;
const index_t offset_w = (w % block_size_);
const index_t offset_d =
(offset_h * block_size_ + offset_w) * input_depth;
const index_t out_d = d + offset_d;
const index_t o_index =
((b * output_depth + out_d) * output_height + out_h)
* output_width + out_w;
const index_t i_index =
((b * input_depth + d) * input_height + h) * input_width + w;
output_ptr[o_index] = input_ptr[i_index];
}
}
}
}
return MACE_SUCCESS;
}
const int block_size_;
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct SpaceToDepthOpFunctor<DeviceType::GPU, T> : OpKernel {
explicit SpaceToDepthOpFunctor(OpKernelContext *context,
const int block_size)
: OpKernel(context), block_size_(block_size) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future);
const int block_size_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_SPACE_TO_DEPTH_H_
@@ -19,7 +19,7 @@
 #include <vector>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/space_to_batch.h"
+#include "mace/kernels/batch_to_space.h"
 
 namespace mace {
 namespace ops {
@@ -31,8 +31,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
       : Operator<D, T>(op_def, context),
         functor_(context,
                  OperatorBase::GetRepeatedArgs<int>("crops", {0, 0, 0, 0}),
-                 OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}),
-                 true) {}
+                 OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1})) {}
 
   MaceStatus Run(StatsFuture *future) override {
     const Tensor *batch_tensor = this->Input(INPUT);
@@ -41,7 +40,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
   }
 
  private:
-  kernels::SpaceToBatchFunctor<D, T> functor_;
+  kernels::BatchToSpaceFunctor<D, T> functor_;
 
  protected:
   MACE_OP_INPUT_TAGS(INPUT);
......
@@ -30,26 +30,13 @@ class DepthToSpaceOp : public Operator<D, T> {
   DepthToSpaceOp(const OperatorDef &op_def, OpKernelContext *context)
       : Operator<D, T>(op_def, context),
         block_size_(OperatorBase::GetOptionalArg<int>("block_size", 1)),
-        functor_(context, this->block_size_, true) {}
+        functor_(context, this->block_size_) {}
 
   MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     Tensor *output = this->Output(OUTPUT);
     MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
-
-    int input_depth;
-    if (D == CPU) {
-      input_depth = input->dim(1);
-    } else if (D == GPU) {
-      input_depth = input->dim(3);
-    } else {
-      MACE_NOT_IMPLEMENTED;
-    }
-    MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
-               "input depth should be dividable by block_size * block_size",
-               input_depth);
-    MACE_CHECK((input_depth % 4) == 0,
-               "input channel should be dividable by 4");
 
     return functor_(input, output, future);
   }
......
...@@ -24,21 +24,18 @@ namespace test { ...@@ -24,21 +24,18 @@ namespace test {
namespace { namespace {
template <DeviceType D> template <DeviceType D>
void RunDepthToSpace(const bool d2s, void RunDepthToSpace(const std::vector<index_t> &input_shape,
const std::vector<index_t> &input_shape,
const std::vector<float> &input_data, const std::vector<float> &input_data,
const int block_size, const int block_size,
const std::vector<index_t> &expected_shape, const std::vector<index_t> &expected_shape,
const std::vector<float> &expected_data) { const std::vector<float> &expected_data) {
OpsTestNet net; OpsTestNet net;
net.AddInputFromArray<D, float>("Input", input_shape, input_data); net.AddInputFromArray<D, float>("Input", input_shape, input_data);
const char *ops_name = (d2s) ? "DepthToSpace" : "SpaceToDepth";
const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest";
// Construct graph // Construct graph
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
OpDefBuilder(ops_name, ops_test_name) OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntArg("block_size", block_size) .AddIntArg("block_size", block_size)
...@@ -51,7 +48,7 @@ void RunDepthToSpace(const bool d2s, ...@@ -51,7 +48,7 @@ void RunDepthToSpace(const bool d2s,
} else { } else {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder(ops_name, ops_test_name) OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("block_size", block_size) .AddIntArg("block_size", block_size)
...@@ -69,47 +66,11 @@ void RunDepthToSpace(const bool d2s, ...@@ -69,47 +66,11 @@ void RunDepthToSpace(const bool d2s,
} }
} // namespace } // namespace
class SpaceToDepthOpTest : public OpsTestBase {};
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(
false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(
false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(
false, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(
false, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
class DepthToSpaceOpTest : public OpsTestBase {};
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
  RunDepthToSpace<DeviceType::CPU>(
      {1, 1, 2, 16},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      2, {1, 2, 4, 4},
@@ -119,7 +80,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
  RunDepthToSpace<DeviceType::GPU>(
      {1, 1, 2, 16},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      2, {1, 2, 4, 4},
@@ -129,14 +90,14 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
  RunDepthToSpace<DeviceType::CPU>(
      {1, 1, 1, 16},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
  RunDepthToSpace<DeviceType::GPU>(
      {1, 1, 1, 16},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
@@ -144,14 +105,13 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
  const std::vector<float> in = std::vector<float>(192 * 192 * 128, 1.0);
  RunDepthToSpace<DeviceType::GPU>({1, 192, 192, 128}, in, 2,
                                   {1, 384, 384, 32}, in);
}
namespace {
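// Cross-checks DepthToSpace on random data: the op is run on the CPU (NCHW)
// as a reference and again on device D via OpenCL image buffers, and the two
// outputs are compared within a type-dependent tolerance.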
template <DeviceType D, typename T>
void RandomTest(const int block_size,
                const std::vector<index_t> &shape) {
  testing::internal::LogToStderr();
  srand(time(NULL));
@@ -159,14 +119,11 @@ void RandomTest(const bool d2s,
  // Construct graph
  OpsTestNet net;
  // Add input data
  net.AddRandomInput<D, float>("Input", shape);
  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
                                                  NCHW);
  OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
      .Input("InputNCHW")
      .AddIntArg("block_size", block_size)
      .Output("OutputNCHW")
@@ -181,7 +138,7 @@ void RandomTest(const bool d2s,
  BufferToImage<D, T>(&net, "Input", "InputImg",
                      kernels::BufferType::IN_OUT_CHANNEL);
  OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
      .Input("InputImg")
      .AddIntArg("block_size", block_size)
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
@@ -205,19 +162,11 @@ void RandomTest(const bool d2s,
} // namespace
TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
  RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 128});
}
TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
  RandomTest<DeviceType::GPU, half>(2, {1, 192, 192, 128});
}
} // namespace test
...
@@ -31,8 +31,7 @@ class SpaceToBatchNDOp : public Operator<D, T> {
      : Operator<D, T>(op_def, context),
        functor_(context,
                 OperatorBase::GetRepeatedArgs<int>("paddings", {0, 0, 0, 0}),
                 OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1})) {}
  MaceStatus Run(StatsFuture *future) override {
    const Tensor *space_tensor = this->Input(INPUT);
...
@@ -19,7 +19,7 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/space_to_depth.h"
namespace mace {
namespace ops {
@@ -30,34 +30,12 @@ class SpaceToDepthOp : public Operator<D, T> {
  SpaceToDepthOp(const OperatorDef &op_def, OpKernelContext *context)
      : Operator<D, T>(op_def, context),
        functor_(context,
                 OperatorBase::GetOptionalArg<int>("block_size", 1)) {}
  MaceStatus Run(StatsFuture *future) override {
    const Tensor *input = this->Input(INPUT);
    Tensor *output = this->Output(OUTPUT);
    MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
    return functor_(input, output, future);
  }
@@ -66,7 +44,7 @@ class SpaceToDepthOp : public Operator<D, T> {
  MACE_OP_OUTPUT_TAGS(OUTPUT);
 private:
  kernels::SpaceToDepthOpFunctor<D, T> functor_;
};
} // namespace ops
...
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <vector>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
namespace {
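// Builds a SpaceToDepth graph for device D (CPU uses NCHW, GPU uses an OpenCL
// image), runs it on input_data, and checks the output against
// expected_shape/expected_data.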
template <DeviceType D>
void RunSpaceToDepth(const std::vector<index_t> &input_shape,
const std::vector<float> &input_data,
const int block_size,
const std::vector<index_t> &expected_shape,
const std::vector<float> &expected_data) {
OpsTestNet net;
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
// Construct graph
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
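// SpaceToDepth is the inverse of DepthToSpace: each block_size x block_size
// spatial block is folded into the channel dimension, so an NHWC tensor
// [N, H, W, C] becomes [N, H/b, W/b, C*b*b], as the shapes in the tests below
// show ({1, 2, 4, 4} -> {1, 1, 2, 16} with b = 2).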
class SpaceToDepthOpTest : public OpsTestBase {};
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
RunSpaceToDepth<DeviceType::CPU>(
{1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
RunSpaceToDepth<DeviceType::GPU>(
{1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
RunSpaceToDepth<DeviceType::CPU>(
{1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
RunSpaceToDepth<DeviceType::GPU>(
{1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
namespace {
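// Cross-checks SpaceToDepth on random data against the CPU reference, with a
// tolerance of 1e-5 for float outputs and 1e-3/1e-4 for half outputs.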
template <DeviceType D, typename T>
void RandomTest(const int block_size,
const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
srand(time(NULL));
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputNCHW")
.AddIntArg("block_size", block_size)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputImg")
.AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("OutputImg")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-5);
} else {
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-3, 1e-4);
}
}
} // namespace
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 32});
}
TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::GPU, half>(2, {1, 384, 384, 32});
}
} // namespace test
} // namespace ops
} // namespace mace