提交 894096bd 编写于 作者: L liuqi

Feature: S2D and D2S support depth < 4.

上级 52a379a2
...@@ -164,6 +164,7 @@ MaceStatus NetDefAdapter::AdaptNetDef( ...@@ -164,6 +164,7 @@ MaceStatus NetDefAdapter::AdaptNetDef(
input_info->set_dims(j, input_shape[j]); input_info->set_dims(j, input_shape[j]);
} }
} }
tensor_shape_map.emplace(input_info->name(), input_shape);
output_map.emplace(input_info->name(), InternalOutputInfo( output_map.emplace(input_info->name(), InternalOutputInfo(
mem_type, input_info->data_type(), mem_type, input_info->data_type(),
input_data_format, input_shape, -1)); input_data_format, input_shape, -1));
......
#include <common.h> #include <common.h>
__kernel void depth_to_space_d1_d2(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int input_height,
__private const int input_width,
__private const int block_size,
__private const int output_height,
__private const int output_width,
__private const int output_depth,
__write_only image2d_t output) {
const int in_depth_blk_idx = get_global_id(0);
const int in_width_idx = get_global_id(1);
const int in_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (in_depth_blk_idx >= global_size_dim0 || in_width_idx >= global_size_dim1
|| in_hb_idx >= global_size_dim2) {
return;
}
#endif
const int batch_idx = in_hb_idx / input_height;
const int in_height_idx = in_hb_idx - mul24(batch_idx, input_height);
int in_depth_idx = in_depth_blk_idx << 2;
int hw_block_size = in_depth_idx / output_depth;
int out_depth_idx = in_depth_idx - mul24(hw_block_size, output_depth);
int bottom_width_idx = mul24(in_width_idx, block_size);
int out_width_idx = bottom_width_idx + (hw_block_size % block_size);
int out_height_idx = mad24(in_height_idx, block_size,
hw_block_size / block_size);
const int in_x = mad24(in_depth_blk_idx, input_width, in_width_idx);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_hb_idx));
int out_x_base = mul24((out_depth_idx >> 2), output_width);
int out_x = out_x_base + out_width_idx;
int out_y = mad24(batch_idx, output_height, out_height_idx);
#if defined(DEPTH1)
int top_width = mul24(in_width_idx + 1, block_size);
int top_height = mul24(in_height_idx + 1, block_size);
DATA_TYPE4 t_out_data = 0;
int t_out_x = out_x;
int t_width_idx = out_width_idx;
int t_height_idx = out_height_idx;
DATA_TYPE *in_data_ptr = (DATA_TYPE*)(&in_data);
for (int i = 0; i < 4; ++i) {
t_out_data.x = in_data_ptr[i];
WRITE_IMAGET(output, (int2)(t_out_x, out_y), t_out_data);
if (t_width_idx + 1 >= top_width) {
if (t_height_idx + 1 >= top_height) {
break;
}
t_width_idx = bottom_width_idx;
t_out_x = out_x_base + t_width_idx;
t_height_idx += 1;
out_y += 1;
} else {
t_width_idx += 1;
t_out_x += 1;
}
}
#elif defined(DEPTH2)
int top_width = mul24(in_width_idx + 1, block_size);
int top_height = mul24(in_height_idx + 1, block_size);
DATA_TYPE4 t_out_data = 0;
t_out_data.x = in_data.x;
t_out_data.y = in_data.y;
WRITE_IMAGET(output, (int2)(out_x, out_y), t_out_data);
t_out_data.x = in_data.z;
t_out_data.y = in_data.w;
if (out_width_idx + 1 >= top_width) {
if (out_height_idx + 1 < top_height) {
int t_out_x = out_x_base + bottom_width_idx;
WRITE_IMAGET(output, (int2)(t_out_x, out_y + 1), t_out_data);
}
} else {
WRITE_IMAGET(output, (int2)(out_x + 1, out_y), t_out_data);
}
#endif
}
__kernel void depth_to_space(OUT_OF_RANGE_PARAMS __kernel void depth_to_space(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3 GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__private const int block_size, __private const int input_height,
__private const int input_hb,
__private const int input_width, __private const int input_width,
__private const int input_depth_blocks, __private const int block_size,
__private const int output_height,
__private const int output_width, __private const int output_width,
__private const int output_depth_blocks, __private const int output_depth,
__write_only image2d_t output) { __write_only image2d_t output) {
const int out_d = get_global_id(0); const int out_depth_blk_idx = get_global_id(0);
const int out_w = get_global_id(1); const int out_width_idx = get_global_id(1);
const int out_hb = get_global_id(2); const int out_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP #ifndef NON_UNIFORM_WORK_GROUP
if (out_d >= global_size_dim0 || out_w >= global_size_dim1 if (out_depth_blk_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
|| out_hb >= global_size_dim2) { || out_hb_idx >= global_size_dim2) {
return; return;
} }
#endif #endif
const int batch_idx = out_hb_idx / output_height;
const int out_height_idx = out_hb_idx - mul24(batch_idx, output_height);
const int out_pos = mad24(out_d, output_width, out_w); int in_height_idx = out_height_idx / block_size;
int height_idx_in_blk = out_height_idx - mul24(in_height_idx, block_size);
int in_width_idx = out_width_idx / block_size;
int width_idx_in_blk = out_width_idx - mul24(in_width_idx, block_size);
int in_depth_idx = mad24(
mad24(height_idx_in_blk, block_size, width_idx_in_blk),
output_depth, out_depth_blk_idx << 2);
const int in_hb = out_hb / block_size; int in_depth_blk_idx = in_depth_idx >> 2;
const int offset_h = out_hb - mul24(in_hb, block_size); int in_x = mad24(in_depth_blk_idx, input_width, in_width_idx);
const int in_w = out_w / block_size; int in_y = mad24(batch_idx, input_height, in_height_idx);
const int offset_w = out_w - mul24(in_w, block_size); DATA_TYPE4 out_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y));
const int offset_d = mul24(mad24(offset_h, block_size, offset_w), output_depth_blocks);
const int in_d = out_d + offset_d;
if (in_hb >= input_hb || in_w >= input_width || in_d >= input_depth_blocks) { #ifdef DEPTH3
return; DATA_TYPE4 t_out_data = out_data;
int left_part_size = 4 - (in_depth_idx & 0x3);
switch(left_part_size) {
case 1:
out_data.x = t_out_data.w;
break;
case 2:
out_data.x = t_out_data.z;
out_data.y = t_out_data.w;
break;
case 3:
out_data.x = t_out_data.y;
out_data.y = t_out_data.z;
out_data.z = t_out_data.w;
break;
case 4:
out_data.x = t_out_data.x;
out_data.y = t_out_data.y;
out_data.z = t_out_data.z;
break;
default:
out_data = 0;
} }
int right_part_size = 3 - left_part_size;
if (right_part_size > 0) {
int in_depth_blks = mul24(mul24(block_size, block_size), 3) >> 2;
in_x = select(-1, in_x + input_width, in_depth_blk_idx + 1 < in_depth_blks);
t_out_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y));
switch (right_part_size) {
case 2:
out_data.y = t_out_data.x;
out_data.z = t_out_data.y;
break;
case 1:
out_data.z = t_out_data.x;
break;
default:
out_data = 0;
}
}
out_data.w = 0;
#endif
const int in_pos = mad24(in_d, input_width, in_w); int out_x = mad24(out_depth_blk_idx, output_width, out_width_idx);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_hb)); WRITE_IMAGET(output, (int2)(out_x, out_hb_idx), out_data);
WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
} }
...@@ -3,39 +3,130 @@ ...@@ -3,39 +3,130 @@
__kernel void space_to_depth(OUT_OF_RANGE_PARAMS __kernel void space_to_depth(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3 GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__private const int block_size, __private const int input_height,
__private const int input_width, __private const int input_width,
__private const int input_depth_blocks, __private const int input_depth,
__private const int output_hb, __private const int block_size,
__private const int output_height,
__private const int output_width, __private const int output_width,
__private const int output_depth_blocks,
__write_only image2d_t output) { __write_only image2d_t output) {
const int d = get_global_id(0); const int out_depth_blk_idx = get_global_id(0);
const int w = get_global_id(1); const int out_width_idx = get_global_id(1);
const int hb = get_global_id(2); const int out_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP #ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1 if (out_depth_blk_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
|| hb >= global_size_dim2) { || out_hb_idx >= global_size_dim2) {
return; return;
} }
#endif #endif
const int in_pos = mad24(d, input_width, w); const int batch_idx = out_hb_idx / output_height;
const int out_height_idx = out_hb_idx - mul24(batch_idx, output_height);
const int out_hb = hb / block_size; int out_depth_idx = out_depth_blk_idx << 2;
const int offset_h = hb - mul24(out_hb, block_size); int in_depth_idx = out_depth_idx % input_depth;
const int out_w = w / block_size; int hw_block_size = out_depth_idx / input_depth;
const int offset_w = w - mul24(out_w, block_size); int bottom_width_idx = mul24(out_width_idx, block_size);
const int offset_d = mul24(input_depth_blocks, mad24(offset_h, block_size, offset_w)); int in_width_idx = bottom_width_idx + (hw_block_size % block_size);
const int out_d = d + offset_d; int in_height_idx = mad24(out_height_idx, block_size,
hw_block_size / block_size);
if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) { DATA_TYPE4 in_data = 0;
return; int in_x = mad24((in_depth_idx >> 2), input_width, in_width_idx);
int in_y = mad24(batch_idx, input_height, in_height_idx);
#if defined(DEPTH1)
int top_width = mul24(out_width_idx + 1, block_size);
int top_height = mul24(out_height_idx + 1, block_size);
DATA_TYPE4 t_in_data = 0;
int t_in_x = in_x;
int t_width_idx = in_width_idx;
int t_height_idx = in_height_idx;
DATA_TYPE *in_data_ptr = (DATA_TYPE*)(&in_data);
for (int i = 0; i < 4; ++i) {
t_in_data = READ_IMAGET(input, SAMPLER, (int2)(t_in_x, in_y));
in_data_ptr[i] = t_in_data.x;
if (t_width_idx + 1 >= top_width) {
if (t_height_idx + 1 >= top_height) {
break;
}
t_width_idx = bottom_width_idx;
t_in_x = mad24((in_depth_idx >> 2), input_width, t_width_idx);
t_height_idx += 1;
in_y += 1;
} else {
t_width_idx += 1;
t_in_x += 1;
}
} }
#elif defined(DEPTH2)
int top_width = mul24(out_width_idx + 1, block_size);
int top_height = mul24(out_height_idx + 1, block_size);
DATA_TYPE4 t_in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y));
in_data.x = t_in_data.x;
in_data.y = t_in_data.y;
t_in_data = 0;
if (in_width_idx + 1 >= top_width) {
if (in_height_idx + 1 < top_height) {
int t_in_x = mad24((in_depth_idx >> 2), input_width, bottom_width_idx);
t_in_data = READ_IMAGET(input, SAMPLER, (int2)(t_in_x, in_y + 1));
}
} else {
t_in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x + 1, in_y));
}
in_data.z = t_in_data.x;
in_data.w = t_in_data.y;
#elif defined(DEPTH3)
int top_width = mul24(out_width_idx + 1, block_size);
int top_height = mul24(out_height_idx + 1, block_size);
DATA_TYPE4 in_data0 = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y));
DATA_TYPE4 in_data1 = 0;
if (in_width_idx + 1 >= top_width) {
if (in_height_idx + 1 < top_height) {
int t_in_x = mad24((in_depth_idx >> 2), input_width, bottom_width_idx);
in_data1 = READ_IMAGET(input, SAMPLER, (int2)(t_in_x, in_y + 1));
}
} else {
in_data1 = READ_IMAGET(input, SAMPLER, (int2)(in_x + 1, in_y));
}
int left_part_size = 3 - in_depth_idx;
int right_part_size = 4 - left_part_size;
switch(left_part_size) {
case 3:
in_data.z = in_data0.z;
in_data.y = in_data0.y;
in_data.x = in_data0.x;
break;
case 2:
in_data.y = in_data0.z;
in_data.x = in_data0.y;
break;
case 1:
in_data.x = in_data0.z;
break;
default:
in_data = 0;
}
switch(right_part_size) {
case 3:
in_data.y = in_data1.x;
in_data.z = in_data1.y;
in_data.w = in_data1.z;
break;
case 2:
in_data.z = in_data1.x;
in_data.w = in_data1.y;
break;
case 1:
in_data.w = in_data1.x;
break;
default:
in_data = 0;
}
#else
in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y));
#endif
const int out_pos = mad24(out_d, output_width, out_w); const int out_x = mad24(out_depth_blk_idx, output_width, out_width_idx);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb)); WRITE_IMAGET(output, (int2)(out_x, out_hb_idx), in_data);
WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
} }
...@@ -58,19 +58,14 @@ MaceStatus DepthToSpaceKernel<T>::Compute( ...@@ -58,19 +58,14 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
const index_t input_depth = input->dim(3); const index_t input_depth = input->dim(3);
MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
"input depth should be dividable by block_size * block_size", "input depth should be dividable by block_size * block_size ",
input_depth); input_depth);
MACE_CHECK((input_depth % 4) == 0,
"input channel should be dividable by 4");
const index_t output_height = input_height * block_size_; const index_t output_height = input_height * block_size_;
const index_t output_width = input_width * block_size_; const index_t output_width = input_width * block_size_;
const index_t output_depth = input_depth / (block_size_ * block_size_); const index_t output_depth = input_depth / (block_size_ * block_size_);
MACE_CHECK(output_depth % 4 == 0, "output channel not support:") MACE_CHECK(output_depth % 4 == 0 || output_depth < 4,
<< output_depth; "output channel not support:") << output_depth;
const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
std::vector<index_t> output_shape = {batch, std::vector<index_t> output_shape = {batch,
output_height, output_height,
...@@ -82,11 +77,16 @@ MaceStatus DepthToSpaceKernel<T>::Compute( ...@@ -82,11 +77,16 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
&image_shape); &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const uint32_t gws[3] = { uint32_t gws[3];
static_cast<uint32_t>(RoundUpDiv4(output_depth)), if (output_depth < 3) {
static_cast<uint32_t>(output_width), gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
static_cast<uint32_t>(output_height * batch) gws[1] = static_cast<uint32_t>(input_width);
}; gws[2] = static_cast<uint32_t>(input_height * batch);
} else {
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
gws[1] = static_cast<uint32_t>(output_width);
gws[2] = static_cast<uint32_t>(output_height * batch);
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime(); auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION; MACE_OUT_OF_RANGE_DEFINITION;
...@@ -95,6 +95,10 @@ MaceStatus DepthToSpaceKernel<T>::Compute( ...@@ -95,6 +95,10 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
MACE_OUT_OF_RANGE_CONFIG; MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "depth_to_space"; const char *kernel_name = "depth_to_space";
if (output_depth < 4) {
built_options.emplace(MakeString("-DDEPTH", output_depth));
if (output_depth != 3) kernel_name = "depth_to_space_d1_d2";
}
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
...@@ -116,20 +120,20 @@ MaceStatus DepthToSpaceKernel<T>::Compute( ...@@ -116,20 +120,20 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
MACE_OUT_OF_RANGE_SET_ARGS(kernel_); MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws); MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_)); kernel_.setArg(idx++, static_cast<int32_t>(input_height));
kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
kernel_.setArg(idx++, static_cast<int32_t>(input_width)); kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks)); kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(output_height));
kernel_.setArg(idx++, static_cast<int32_t>(output_width)); kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks)); kernel_.setArg(idx++, static_cast<int32_t>(output_depth));
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
std::string tuning_key = Concat("depth_to_space_opencl_kernel", std::string tuning_key = Concat("depth_to_space",
batch, output_height, batch, output_height,
output_width, output_depth); output_width, output_depth);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future())); gws, lws, context->future()));
......
...@@ -57,7 +57,7 @@ MaceStatus SpaceToDepthKernel<T>::Compute( ...@@ -57,7 +57,7 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
const index_t input_width = input->dim(2); const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3); const index_t input_depth = input->dim(3);
MACE_CHECK((input_depth % 4) == 0, MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0,
"input channel should be dividable by 4"); "input channel should be dividable by 4");
MACE_CHECK( MACE_CHECK(
(input_width % block_size_ == 0) && (input_height % block_size_ == 0), (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
...@@ -67,7 +67,6 @@ MaceStatus SpaceToDepthKernel<T>::Compute( ...@@ -67,7 +67,6 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
const index_t output_width = input_width / block_size_; const index_t output_width = input_width / block_size_;
const index_t output_depth = input_depth * block_size_ * block_size_; const index_t output_depth = input_depth * block_size_ * block_size_;
const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth); const index_t output_depth_blocks = RoundUpDiv4(output_depth);
std::vector<index_t> output_shape = {batch, output_height, output_width, std::vector<index_t> output_shape = {batch, output_height, output_width,
...@@ -90,6 +89,9 @@ MaceStatus SpaceToDepthKernel<T>::Compute( ...@@ -90,6 +89,9 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
if (input_depth < 4) {
built_options.emplace(MakeString("-DDEPTH", input_depth));
}
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
...@@ -102,28 +104,28 @@ MaceStatus SpaceToDepthKernel<T>::Compute( ...@@ -102,28 +104,28 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const uint32_t gws[3] = {static_cast<uint32_t>(input_depth_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(output_depth_blocks),
static_cast<uint32_t>(input_width), static_cast<uint32_t>(output_width),
static_cast<uint32_t>(input_height * batch)}; static_cast<uint32_t>(output_height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_); MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_); MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws); MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_)); kernel_.setArg(idx++, static_cast<int32_t>(input_height));
kernel_.setArg(idx++, static_cast<int32_t>(input_width)); kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks)); kernel_.setArg(idx++, static_cast<int32_t>(input_depth));
kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch)); kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(output_height));
kernel_.setArg(idx++, static_cast<int32_t>(output_width)); kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0), std::string tuning_key = Concat("space_to_depth", input->dim(0),
input->dim(1), input->dim(2), input->dim(3)); input->dim(1), input->dim(2), input->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future())); gws, lws, context->future()));
......
...@@ -28,10 +28,7 @@ cc_binary( ...@@ -28,10 +28,7 @@ cc_binary(
"//mace/codegen:generated_models", "//mace/codegen:generated_models",
"//mace/libmace", "//mace/libmace",
"//mace/utils", "//mace/utils",
] + if_opencl_enabled([ ],
"//mace/codegen:generated_opencl_binary",
"//mace/codegen:generated_opencl_parameter",
]),
) )
cc_binary( cc_binary(
...@@ -60,10 +57,7 @@ cc_binary( ...@@ -60,10 +57,7 @@ cc_binary(
"//mace/codegen:generated_mace_engine_factory", "//mace/codegen:generated_mace_engine_factory",
"//mace/codegen:generated_models", "//mace/codegen:generated_models",
"//mace/libmace:libmace_dynamic", "//mace/libmace:libmace_dynamic",
] + if_opencl_enabled([ ],
"//mace/codegen:generated_opencl_binary",
"//mace/codegen:generated_opencl_parameter",
]),
) )
cc_library( cc_library(
......
...@@ -78,9 +78,15 @@ void DepthToSpace( ...@@ -78,9 +78,15 @@ void DepthToSpace(
MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU) MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU)
#endif #endif
MACE_BM_DEPTH_TO_SPACE(1, 4, 512, 512, 2);
MACE_BM_DEPTH_TO_SPACE(1, 8, 256, 256, 2);
MACE_BM_DEPTH_TO_SPACE(1, 12, 512, 512, 2);
MACE_BM_DEPTH_TO_SPACE(1, 9, 512, 512, 3);
MACE_BM_DEPTH_TO_SPACE(1, 18, 256, 256, 3);
MACE_BM_DEPTH_TO_SPACE(1, 27, 512, 512, 3);
MACE_BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4); MACE_BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
MACE_BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4); MACE_BM_DEPTH_TO_SPACE(1, 128, 128, 128, 4);
MACE_BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4); MACE_BM_DEPTH_TO_SPACE(1, 256, 256, 256, 4);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -29,7 +29,7 @@ void SpaceToDepth( ...@@ -29,7 +29,7 @@ void SpaceToDepth(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, height, channels, width}); net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else { } else {
...@@ -78,6 +78,10 @@ void SpaceToDepth( ...@@ -78,6 +78,10 @@ void SpaceToDepth(
MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU) MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU)
#endif #endif
MACE_BM_SPACE_TO_DEPTH(1, 1, 513, 513, 3);
MACE_BM_SPACE_TO_DEPTH(1, 2, 256, 256, 2);
MACE_BM_SPACE_TO_DEPTH(1, 3, 512, 512, 2);
MACE_BM_SPACE_TO_DEPTH(1, 3, 513, 513, 3);
MACE_BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4); MACE_BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
MACE_BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4); MACE_BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
MACE_BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4); MACE_BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);
......
...@@ -61,17 +61,103 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape, ...@@ -61,17 +61,103 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
class DepthToSpaceOpTest : public OpsTestBase {}; class DepthToSpaceOpTest : public OpsTestBase {};
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) { TEST_F(DepthToSpaceOpTest, CPUInputDepthLess4) {
RunDepthToSpace<DeviceType::CPU>(
{1, 1, 2, 9},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17},
3,
{1, 3, 6, 1},
{0, 1, 2, 9, 10, 11,
3, 4, 5, 12, 13, 14,
6, 7, 8, 15, 16, 17});
RunDepthToSpace<DeviceType::CPU>(
{1, 1, 2, 18},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
3,
{1, 3, 6, 2},
{0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23,
6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29,
12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35});
RunDepthToSpace<DeviceType::CPU>(
{1, 1, 2, 12},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
2,
{1, 2, 4, 3},
{0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17,
6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23});
RunDepthToSpace<DeviceType::CPU>(
{1, 1, 1, 27},
{0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26},
3,
{1, 3, 3, 3},
{0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26});
}
TEST_F(DepthToSpaceOpTest, CPUInputDepth4) {
RunDepthToSpace<DeviceType::CPU>( RunDepthToSpace<DeviceType::CPU>(
{1, 1, 2, 16}, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 2, 4, 4}, 2, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}); 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31});
RunDepthToSpace<DeviceType::CPU>(
{1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(DepthToSpaceOpTest, OPENCLInputDepth1) {
RunDepthToSpace<DeviceType::GPU>(
{1, 1, 2, 9},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17},
3,
{1, 3, 6, 1},
{0, 1, 2, 9, 10, 11,
3, 4, 5, 12, 13, 14,
6, 7, 8, 15, 16, 17});
}
TEST_F(DepthToSpaceOpTest, OPENCLInputDepth2) {
RunDepthToSpace<DeviceType::GPU>(
{1, 1, 2, 18},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
3,
{1, 3, 6, 2},
{0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23,
6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29,
12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35});
}
TEST_F(DepthToSpaceOpTest, OPENCLInputDepth3) {
RunDepthToSpace<DeviceType::GPU>(
{1, 1, 2, 12},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
2,
{1, 2, 4, 3},
{0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17,
6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23});
RunDepthToSpace<DeviceType::GPU>(
{1, 1, 1, 27},
{0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26},
3,
{1, 3, 3, 3},
{0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26});
} }
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, OPENCLInputDepth4) {
RunDepthToSpace<DeviceType::GPU>( RunDepthToSpace<DeviceType::GPU>(
{1, 1, 2, 16}, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
...@@ -79,29 +165,12 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { ...@@ -79,29 +165,12 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
2, {1, 2, 4, 4}, 2, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}); 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(
{1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>( RunDepthToSpace<DeviceType::GPU>(
{1, 1, 1, 16}, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
} }
TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
const std::vector<float> in = std::vector<float>(192 * 192 * 128, 1.0);
RunDepthToSpace<DeviceType::GPU>({1, 192, 192, 128}, in, 2,
{1, 384, 384, 32}, in);
}
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void RandomTest(const int block_size, void RandomTest(const int block_size,
...@@ -149,14 +218,51 @@ void RandomTest(const int block_size, ...@@ -149,14 +218,51 @@ void RandomTest(const int block_size,
} }
} // namespace } // namespace
TEST_F(DepthToSpaceOpTest, OPENCLRandomFloatDepth1) {
RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 4});
RandomTest<DeviceType::GPU, float>(3, {1, 111, 111, 9});
RandomTest<DeviceType::GPU, float>(5, {1, 20, 20, 25});
RandomTest<DeviceType::GPU, float>(7, {1, 14, 14, 49});
}
TEST_F(DepthToSpaceOpTest, OPENCLRandomFloatDepth2) {
RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 8});
RandomTest<DeviceType::GPU, float>(3, {1, 111, 111, 18});
RandomTest<DeviceType::GPU, float>(5, {1, 20, 20, 50});
RandomTest<DeviceType::GPU, float>(7, {1, 14, 14, 98});
}
TEST_F(DepthToSpaceOpTest, OPENCLRandomFloatDepth3) {
RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 12});
RandomTest<DeviceType::GPU, float>(3, {1, 111, 111, 27});
RandomTest<DeviceType::GPU, float>(5, {1, 20, 20, 75});
RandomTest<DeviceType::GPU, float>(7, {1, 14, 14, 147});
}
TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) { TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 128}); RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 16});
RandomTest<DeviceType::GPU, float>(3, {1, 222, 222, 144});
RandomTest<DeviceType::GPU, float>(5, {1, 100, 100, 200});
RandomTest<DeviceType::GPU, float>(7, {1, 98, 98, 196});
} }
TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) { TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::GPU, half>(2, {1, 192, 192, 128}); RandomTest<DeviceType::GPU, half>(2, {1, 192, 192, 4});
RandomTest<DeviceType::GPU, half>(3, {1, 111, 111, 18});
RandomTest<DeviceType::GPU, half>(5, {1, 20, 20, 75});
RandomTest<DeviceType::GPU, half>(7, {1, 14, 14, 147});
RandomTest<DeviceType::GPU, half>(2, {1, 384, 384, 8});
} }
TEST_F(DepthToSpaceOpTest, OPENCLRandomBatchHalf) {
RandomTest<DeviceType::GPU, half>(2, {2, 192, 192, 4});
RandomTest<DeviceType::GPU, half>(3, {3, 111, 111, 18});
RandomTest<DeviceType::GPU, half>(5, {2, 20, 20, 75});
RandomTest<DeviceType::GPU, half>(7, {3, 14, 14, 147});
RandomTest<DeviceType::GPU, half>(2, {2, 384, 384, 8});
}
} // namespace test } // namespace test
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -61,38 +61,106 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape, ...@@ -61,38 +61,106 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape,
class SpaceToDepthOpTest : public OpsTestBase {}; class SpaceToDepthOpTest : public OpsTestBase {};
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) { TEST_F(SpaceToDepthOpTest, CPUInputDepthLess4) {
RunSpaceToDepth<DeviceType::CPU>( RunSpaceToDepth<DeviceType::CPU>(
{1, 2, 4, 4}, {1, 3, 6, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 9, 10, 11,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, 3, 4, 5, 12, 13, 14,
2, {1, 1, 2, 16}, 6, 7, 8, 15, 16, 17},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 3,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); {1, 1, 2, 9},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
RunSpaceToDepth<DeviceType::CPU>(
{1, 3, 6, 2},
{0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23,
6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29,
12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35},
3,
{1, 1, 2, 18},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35});
RunSpaceToDepth<DeviceType::CPU>(
{1, 2, 4, 3},
{0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17,
6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23},
2,
{1, 1, 2, 12},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
RunSpaceToDepth<DeviceType::CPU>(
{1, 3, 3, 3},
{0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26},
3,
{1, 1, 1, 27},
{0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26});
} }
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) { TEST_F(SpaceToDepthOpTest, CPUInputDepth4) {
RunSpaceToDepth<DeviceType::GPU>( RunSpaceToDepth<DeviceType::CPU>(
{1, 2, 4, 4}, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16}, 2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
RunSpaceToDepth<DeviceType::CPU>( RunSpaceToDepth<DeviceType::CPU>(
{1, 2, 2, 4}, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
} }
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) { TEST_F(SpaceToDepthOpTest, OPENCLInputDepth1) {
RunSpaceToDepth<DeviceType::GPU>( RunSpaceToDepth<DeviceType::GPU>(
{1, 2, 2, 4}, {1, 3, 6, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16}, {0, 1, 2, 9, 10, 11,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); 3, 4, 5, 12, 13, 14,
6, 7, 8, 15, 16, 17},
3, {1, 1, 2, 9},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
}
TEST_F(SpaceToDepthOpTest, OPENCLInputDepth2) {
RunSpaceToDepth<DeviceType::GPU>(
{1, 3, 6, 2},
{0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23,
6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29,
12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35},
3, {1, 1, 2, 18},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35});
}
TEST_F(SpaceToDepthOpTest, OPENCLInputDepth3) {
RunSpaceToDepth<DeviceType::GPU>(
{1, 2, 4, 3},
{0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17,
6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23},
2, {1, 1, 2, 12},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
RunSpaceToDepth<DeviceType::GPU>(
{1, 3, 3, 3},
{0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26},
3, {1, 1, 1, 27},
{0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26});
}
TEST_F(SpaceToDepthOpTest, OPENCLInputDepth4) {
RunSpaceToDepth<DeviceType::GPU>(
{1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
} }
namespace { namespace {
...@@ -141,14 +209,50 @@ void RandomTest(const int block_size, ...@@ -141,14 +209,50 @@ void RandomTest(const int block_size,
} }
} // namespace } // namespace
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloatDepth1) {
RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 1});
RandomTest<DeviceType::GPU, float>(3, {1, 333, 333, 1});
RandomTest<DeviceType::GPU, float>(5, {1, 100, 100, 1});
RandomTest<DeviceType::GPU, float>(7, {1, 98, 98, 1});
}
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloatDepth2) {
RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 2});
RandomTest<DeviceType::GPU, float>(3, {1, 333, 333, 2});
RandomTest<DeviceType::GPU, float>(5, {1, 100, 100, 2});
RandomTest<DeviceType::GPU, float>(7, {1, 98, 98, 2});
}
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloatDepth3) {
RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 3});
RandomTest<DeviceType::GPU, float>(3, {1, 333, 333, 3});
RandomTest<DeviceType::GPU, float>(5, {1, 100, 100, 3});
RandomTest<DeviceType::GPU, float>(7, {1, 98, 98, 3});
}
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) { TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 32}); RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 4});
RandomTest<DeviceType::GPU, float>(3, {1, 333, 333, 16});
RandomTest<DeviceType::GPU, float>(5, {1, 100, 100, 32});
RandomTest<DeviceType::GPU, float>(7, {1, 98, 98, 64});
} }
TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) { TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::GPU, half>(2, {1, 384, 384, 1});
RandomTest<DeviceType::GPU, half>(3, {1, 333, 333, 2});
RandomTest<DeviceType::GPU, half>(5, {1, 100, 100, 3});
RandomTest<DeviceType::GPU, half>(7, {1, 98, 98, 4});
RandomTest<DeviceType::GPU, half>(2, {1, 384, 384, 32}); RandomTest<DeviceType::GPU, half>(2, {1, 384, 384, 32});
} }
TEST_F(SpaceToDepthOpTest, OPENCLBatchRandomHalf) {
RandomTest<DeviceType::GPU, half>(2, {2, 384, 384, 1});
RandomTest<DeviceType::GPU, half>(3, {3, 333, 333, 2});
RandomTest<DeviceType::GPU, half>(5, {2, 100, 100, 3});
RandomTest<DeviceType::GPU, half>(7, {3, 98, 98, 4});
RandomTest<DeviceType::GPU, half>(2, {2, 384, 384, 32});
}
} // namespace test } // namespace test
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册