diff --git a/mace/core/net_def_adapter.cc b/mace/core/net_def_adapter.cc index 5d3915b4618c5030308ab25e82df2b3c1fc0e444..d67a9c544e2346ad004497903c5d721fcf506fbb 100644 --- a/mace/core/net_def_adapter.cc +++ b/mace/core/net_def_adapter.cc @@ -164,6 +164,7 @@ MaceStatus NetDefAdapter::AdaptNetDef( input_info->set_dims(j, input_shape[j]); } } + tensor_shape_map.emplace(input_info->name(), input_shape); output_map.emplace(input_info->name(), InternalOutputInfo( mem_type, input_info->data_type(), input_data_format, input_shape, -1)); diff --git a/mace/ops/opencl/cl/depth_to_space.cl b/mace/ops/opencl/cl/depth_to_space.cl index c1dc806a231e8d632b8a60ef885dba96d24fd9dc..965489ec11dceb37034539f3d534e1c99c486fc4 100644 --- a/mace/ops/opencl/cl/depth_to_space.cl +++ b/mace/ops/opencl/cl/depth_to_space.cl @@ -1,41 +1,166 @@ #include +__kernel void depth_to_space_d1_d2(OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input, + __private const int input_height, + __private const int input_width, + __private const int block_size, + __private const int output_height, + __private const int output_width, + __private const int output_depth, + __write_only image2d_t output) { + const int in_depth_blk_idx = get_global_id(0); + const int in_width_idx = get_global_id(1); + const int in_hb_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (in_depth_blk_idx >= global_size_dim0 || in_width_idx >= global_size_dim1 + || in_hb_idx >= global_size_dim2) { + return; + } +#endif + const int batch_idx = in_hb_idx / input_height; + const int in_height_idx = in_hb_idx - mul24(batch_idx, input_height); + + int in_depth_idx = in_depth_blk_idx << 2; + int hw_block_size = in_depth_idx / output_depth; + int out_depth_idx = in_depth_idx - mul24(hw_block_size, output_depth); + int bottom_width_idx = mul24(in_width_idx, block_size); + int out_width_idx = bottom_width_idx + (hw_block_size % block_size); + int out_height_idx = mad24(in_height_idx, block_size, + hw_block_size / block_size); + + const int in_x = mad24(in_depth_blk_idx, input_width, in_width_idx); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_hb_idx)); + + int out_x_base = mul24((out_depth_idx >> 2), output_width); + int out_x = out_x_base + out_width_idx; + int out_y = mad24(batch_idx, output_height, out_height_idx); +#if defined(DEPTH1) + int top_width = mul24(in_width_idx + 1, block_size); + int top_height = mul24(in_height_idx + 1, block_size); + DATA_TYPE4 t_out_data = 0; + int t_out_x = out_x; + int t_width_idx = out_width_idx; + int t_height_idx = out_height_idx; + DATA_TYPE *in_data_ptr = (DATA_TYPE*)(&in_data); + for (int i = 0; i < 4; ++i) { + t_out_data.x = in_data_ptr[i]; + WRITE_IMAGET(output, (int2)(t_out_x, out_y), t_out_data); + if (t_width_idx + 1 >= top_width) { + if (t_height_idx + 1 >= top_height) { + break; + } + t_width_idx = bottom_width_idx; + t_out_x = out_x_base + t_width_idx; + t_height_idx += 1; + out_y += 1; + } else { + t_width_idx += 1; + t_out_x += 1; + } + } +#elif defined(DEPTH2) + int top_width = mul24(in_width_idx + 1, block_size); + int top_height = mul24(in_height_idx + 1, block_size); + DATA_TYPE4 t_out_data = 0; + t_out_data.x = in_data.x; + t_out_data.y = in_data.y; + WRITE_IMAGET(output, (int2)(out_x, out_y), t_out_data); + t_out_data.x = in_data.z; + t_out_data.y = in_data.w; + if (out_width_idx + 1 >= top_width) { + if (out_height_idx + 1 < top_height) { + int t_out_x = out_x_base + bottom_width_idx; + WRITE_IMAGET(output, (int2)(t_out_x, out_y + 1), t_out_data); + } + } else { + WRITE_IMAGET(output, (int2)(out_x + 1, out_y), t_out_data); + } +#endif +} + __kernel void depth_to_space(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, - __private const int block_size, - __private const int input_hb, + __private const int input_height, __private const int input_width, - __private const int input_depth_blocks, + __private const int block_size, + __private const int output_height, __private const int output_width, - __private const int output_depth_blocks, + __private const int output_depth, __write_only image2d_t output) { - const int out_d = get_global_id(0); - const int out_w = get_global_id(1); - const int out_hb = get_global_id(2); + const int out_depth_blk_idx = get_global_id(0); + const int out_width_idx = get_global_id(1); + const int out_hb_idx = get_global_id(2); #ifndef NON_UNIFORM_WORK_GROUP - if (out_d >= global_size_dim0 || out_w >= global_size_dim1 - || out_hb >= global_size_dim2) { + if (out_depth_blk_idx >= global_size_dim0 || out_width_idx >= global_size_dim1 + || out_hb_idx >= global_size_dim2) { return; } #endif + const int batch_idx = out_hb_idx / output_height; + const int out_height_idx = out_hb_idx - mul24(batch_idx, output_height); - const int out_pos = mad24(out_d, output_width, out_w); + int in_height_idx = out_height_idx / block_size; + int height_idx_in_blk = out_height_idx - mul24(in_height_idx, block_size); + int in_width_idx = out_width_idx / block_size; + int width_idx_in_blk = out_width_idx - mul24(in_width_idx, block_size); + int in_depth_idx = mad24( + mad24(height_idx_in_blk, block_size, width_idx_in_blk), + output_depth, out_depth_blk_idx << 2); - const int in_hb = out_hb / block_size; - const int offset_h = out_hb - mul24(in_hb, block_size); - const int in_w = out_w / block_size; - const int offset_w = out_w - mul24(in_w, block_size); - const int offset_d = mul24(mad24(offset_h, block_size, offset_w), output_depth_blocks); - const int in_d = out_d + offset_d; + int in_depth_blk_idx = in_depth_idx >> 2; + int in_x = mad24(in_depth_blk_idx, input_width, in_width_idx); + int in_y = mad24(batch_idx, input_height, in_height_idx); + DATA_TYPE4 out_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y)); - if (in_hb >= input_hb || in_w >= input_width || in_d >= input_depth_blocks) { - return; +#ifdef DEPTH3 + DATA_TYPE4 t_out_data = out_data; + int left_part_size = 4 - (in_depth_idx & 0x3); + switch(left_part_size) { + case 1: + out_data.x = t_out_data.w; + break; + case 2: + out_data.x = t_out_data.z; + out_data.y = t_out_data.w; + break; + case 3: + out_data.x = t_out_data.y; + out_data.y = t_out_data.z; + out_data.z = t_out_data.w; + break; + case 4: + out_data.x = t_out_data.x; + out_data.y = t_out_data.y; + out_data.z = t_out_data.z; + break; + default: + out_data = 0; } + int right_part_size = 3 - left_part_size; + if (right_part_size > 0) { + int in_depth_blks = mul24(mul24(block_size, block_size), 3) >> 2; + in_x = select(-1, in_x + input_width, in_depth_blk_idx + 1 < in_depth_blks); + t_out_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y)); + switch (right_part_size) { + case 2: + out_data.y = t_out_data.x; + out_data.z = t_out_data.y; + break; + case 1: + out_data.z = t_out_data.x; + break; + default: + out_data = 0; + } + } + out_data.w = 0; +#endif - const int in_pos = mad24(in_d, input_width, in_w); - DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_hb)); - - WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data); + int out_x = mad24(out_depth_blk_idx, output_width, out_width_idx); + WRITE_IMAGET(output, (int2)(out_x, out_hb_idx), out_data); } diff --git a/mace/ops/opencl/cl/space_to_depth.cl b/mace/ops/opencl/cl/space_to_depth.cl index 6dc821d7b28b3f6ae6c2d60206ebc69a6cf8ca79..4d1d6ae74b2e19df6d479044ab0ab1916e504237 100644 --- a/mace/ops/opencl/cl/space_to_depth.cl +++ b/mace/ops/opencl/cl/space_to_depth.cl @@ -3,39 +3,130 @@ __kernel void space_to_depth(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, - __private const int block_size, + __private const int input_height, __private const int input_width, - __private const int input_depth_blocks, - __private const int output_hb, + __private const int input_depth, + __private const int block_size, + __private const int output_height, __private const int output_width, - __private const int output_depth_blocks, __write_only image2d_t output) { - const int d = get_global_id(0); - const int w = get_global_id(1); - const int hb = get_global_id(2); + const int out_depth_blk_idx = get_global_id(0); + const int out_width_idx = get_global_id(1); + const int out_hb_idx = get_global_id(2); #ifndef NON_UNIFORM_WORK_GROUP - if (d >= global_size_dim0 || w >= global_size_dim1 - || hb >= global_size_dim2) { + if (out_depth_blk_idx >= global_size_dim0 || out_width_idx >= global_size_dim1 + || out_hb_idx >= global_size_dim2) { return; } #endif - const int in_pos = mad24(d, input_width, w); + const int batch_idx = out_hb_idx / output_height; + const int out_height_idx = out_hb_idx - mul24(batch_idx, output_height); - const int out_hb = hb / block_size; - const int offset_h = hb - mul24(out_hb, block_size); - const int out_w = w / block_size; - const int offset_w = w - mul24(out_w, block_size); - const int offset_d = mul24(input_depth_blocks, mad24(offset_h, block_size, offset_w)); - const int out_d = d + offset_d; + int out_depth_idx = out_depth_blk_idx << 2; + int in_depth_idx = out_depth_idx % input_depth; + int hw_block_size = out_depth_idx / input_depth; + int bottom_width_idx = mul24(out_width_idx, block_size); + int in_width_idx = bottom_width_idx + (hw_block_size % block_size); + int in_height_idx = mad24(out_height_idx, block_size, + hw_block_size / block_size); - if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) { - return; + DATA_TYPE4 in_data = 0; + int in_x = mad24((in_depth_idx >> 2), input_width, in_width_idx); + int in_y = mad24(batch_idx, input_height, in_height_idx); +#if defined(DEPTH1) + int top_width = mul24(out_width_idx + 1, block_size); + int top_height = mul24(out_height_idx + 1, block_size); + DATA_TYPE4 t_in_data = 0; + int t_in_x = in_x; + int t_width_idx = in_width_idx; + int t_height_idx = in_height_idx; + DATA_TYPE *in_data_ptr = (DATA_TYPE*)(&in_data); + for (int i = 0; i < 4; ++i) { + t_in_data = READ_IMAGET(input, SAMPLER, (int2)(t_in_x, in_y)); + in_data_ptr[i] = t_in_data.x; + if (t_width_idx + 1 >= top_width) { + if (t_height_idx + 1 >= top_height) { + break; + } + t_width_idx = bottom_width_idx; + t_in_x = mad24((in_depth_idx >> 2), input_width, t_width_idx); + t_height_idx += 1; + in_y += 1; + } else { + t_width_idx += 1; + t_in_x += 1; + } } +#elif defined(DEPTH2) + int top_width = mul24(out_width_idx + 1, block_size); + int top_height = mul24(out_height_idx + 1, block_size); + DATA_TYPE4 t_in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y)); + in_data.x = t_in_data.x; + in_data.y = t_in_data.y; + t_in_data = 0; + if (in_width_idx + 1 >= top_width) { + if (in_height_idx + 1 < top_height) { + int t_in_x = mad24((in_depth_idx >> 2), input_width, bottom_width_idx); + t_in_data = READ_IMAGET(input, SAMPLER, (int2)(t_in_x, in_y + 1)); + } + } else { + t_in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x + 1, in_y)); + } + in_data.z = t_in_data.x; + in_data.w = t_in_data.y; +#elif defined(DEPTH3) + int top_width = mul24(out_width_idx + 1, block_size); + int top_height = mul24(out_height_idx + 1, block_size); + DATA_TYPE4 in_data0 = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y)); + DATA_TYPE4 in_data1 = 0; + if (in_width_idx + 1 >= top_width) { + if (in_height_idx + 1 < top_height) { + int t_in_x = mad24((in_depth_idx >> 2), input_width, bottom_width_idx); + in_data1 = READ_IMAGET(input, SAMPLER, (int2)(t_in_x, in_y + 1)); + } + } else { + in_data1 = READ_IMAGET(input, SAMPLER, (int2)(in_x + 1, in_y)); + } + int left_part_size = 3 - in_depth_idx; + int right_part_size = 4 - left_part_size; + switch(left_part_size) { + case 3: + in_data.z = in_data0.z; + in_data.y = in_data0.y; + in_data.x = in_data0.x; + break; + case 2: + in_data.y = in_data0.z; + in_data.x = in_data0.y; + break; + case 1: + in_data.x = in_data0.z; + break; + default: + in_data = 0; + } + switch(right_part_size) { + case 3: + in_data.y = in_data1.x; + in_data.z = in_data1.y; + in_data.w = in_data1.z; + break; + case 2: + in_data.z = in_data1.x; + in_data.w = in_data1.y; + break; + case 1: + in_data.w = in_data1.x; + break; + default: + in_data = 0; + } +#else + in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y)); +#endif - const int out_pos = mad24(out_d, output_width, out_w); - DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb)); - - WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data); + const int out_x = mad24(out_depth_blk_idx, output_width, out_width_idx); + WRITE_IMAGET(output, (int2)(out_x, out_hb_idx), in_data); } diff --git a/mace/ops/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h index b05636087ed181d30fe28653b8bc2bd76ee57e04..990e06ccef6771b2d7ab8a4e8bb31446e7feeb40 100644 --- a/mace/ops/opencl/image/depth_to_space.h +++ b/mace/ops/opencl/image/depth_to_space.h @@ -58,19 +58,14 @@ MaceStatus DepthToSpaceKernel::Compute( const index_t input_depth = input->dim(3); MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, - "input depth should be dividable by block_size * block_size", + "input depth should be dividable by block_size * block_size ", input_depth); - MACE_CHECK((input_depth % 4) == 0, - "input channel should be dividable by 4"); const index_t output_height = input_height * block_size_; const index_t output_width = input_width * block_size_; const index_t output_depth = input_depth / (block_size_ * block_size_); - MACE_CHECK(output_depth % 4 == 0, "output channel not support:") - << output_depth; - - const index_t input_depth_blocks = RoundUpDiv4(input_depth); - const index_t output_depth_blocks = RoundUpDiv4(output_depth); + MACE_CHECK(output_depth % 4 == 0 || output_depth < 4, + "output channel not support:") << output_depth; std::vector output_shape = {batch, output_height, @@ -82,11 +77,16 @@ MaceStatus DepthToSpaceKernel::Compute( &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - const uint32_t gws[3] = { - static_cast(RoundUpDiv4(output_depth)), - static_cast(output_width), - static_cast(output_height * batch) - }; + uint32_t gws[3]; + if (output_depth < 3) { + gws[0] = static_cast(RoundUpDiv4(input_depth)); + gws[1] = static_cast(input_width); + gws[2] = static_cast(input_height * batch); + } else { + gws[0] = static_cast(RoundUpDiv4(output_depth)); + gws[1] = static_cast(output_width); + gws[2] = static_cast(output_height * batch); + } auto runtime = context->device()->gpu_runtime()->opencl_runtime(); MACE_OUT_OF_RANGE_DEFINITION; @@ -95,6 +95,10 @@ MaceStatus DepthToSpaceKernel::Compute( MACE_OUT_OF_RANGE_CONFIG; MACE_NON_UNIFORM_WG_CONFIG; const char *kernel_name = "depth_to_space"; + if (output_depth < 4) { + built_options.emplace(MakeString("-DDEPTH", output_depth)); + if (output_depth != 3) kernel_name = "depth_to_space_d1_d2"; + } std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; @@ -116,20 +120,20 @@ MaceStatus DepthToSpaceKernel::Compute( MACE_OUT_OF_RANGE_SET_ARGS(kernel_); MACE_SET_3D_GWS_ARGS(kernel_, gws); kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(block_size_)); - kernel_.setArg(idx++, static_cast(input_height * batch)); + kernel_.setArg(idx++, static_cast(input_height)); kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(input_depth_blocks)); + kernel_.setArg(idx++, static_cast(block_size_)); + kernel_.setArg(idx++, static_cast(output_height)); kernel_.setArg(idx++, static_cast(output_width)); - kernel_.setArg(idx++, static_cast(output_depth_blocks)); + kernel_.setArg(idx++, static_cast(output_depth)); kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); } - std::string tuning_key = Concat("depth_to_space_opencl_kernel", - batch, output_height, - output_width, output_depth); + std::string tuning_key = Concat("depth_to_space", + batch, output_height, + output_width, output_depth); const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, context->future())); diff --git a/mace/ops/opencl/image/space_to_depth.h b/mace/ops/opencl/image/space_to_depth.h index 514bd759e7916ad7f3f706c204653fb7800903e3..e58b7b8d0660cc6c91d965557a17cb1c206f072e 100644 --- a/mace/ops/opencl/image/space_to_depth.h +++ b/mace/ops/opencl/image/space_to_depth.h @@ -57,7 +57,7 @@ MaceStatus SpaceToDepthKernel::Compute( const index_t input_width = input->dim(2); const index_t input_depth = input->dim(3); - MACE_CHECK((input_depth % 4) == 0, + MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0, "input channel should be dividable by 4"); MACE_CHECK( (input_width % block_size_ == 0) && (input_height % block_size_ == 0), @@ -67,7 +67,6 @@ MaceStatus SpaceToDepthKernel::Compute( const index_t output_width = input_width / block_size_; const index_t output_depth = input_depth * block_size_ * block_size_; - const index_t input_depth_blocks = RoundUpDiv4(input_depth); const index_t output_depth_blocks = RoundUpDiv4(output_depth); std::vector output_shape = {batch, output_height, output_width, @@ -90,6 +89,9 @@ MaceStatus SpaceToDepthKernel::Compute( std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + if (input_depth < 4) { + built_options.emplace(MakeString("-DDEPTH", input_depth)); + } built_options.emplace(kernel_name_ss.str()); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); @@ -102,28 +104,28 @@ MaceStatus SpaceToDepthKernel::Compute( static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t gws[3] = {static_cast(input_depth_blocks), - static_cast(input_width), - static_cast(input_height * batch)}; + const uint32_t gws[3] = {static_cast(output_depth_blocks), + static_cast(output_width), + static_cast(output_height * batch)}; MACE_OUT_OF_RANGE_INIT(kernel_); if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; MACE_OUT_OF_RANGE_SET_ARGS(kernel_); MACE_SET_3D_GWS_ARGS(kernel_, gws); kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(block_size_)); + kernel_.setArg(idx++, static_cast(input_height)); kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(input_depth_blocks)); - kernel_.setArg(idx++, static_cast(output_height * batch)); + kernel_.setArg(idx++, static_cast(input_depth)); + kernel_.setArg(idx++, static_cast(block_size_)); + kernel_.setArg(idx++, static_cast(output_height)); kernel_.setArg(idx++, static_cast(output_width)); - kernel_.setArg(idx++, static_cast(output_depth_blocks)); kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); } const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0), + std::string tuning_key = Concat("space_to_depth", input->dim(0), input->dim(1), input->dim(2), input->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, context->future())); diff --git a/mace/tools/benchmark/BUILD.bazel b/mace/tools/benchmark/BUILD.bazel index 425d256697a5c4108c6626e17599d86c6055e23f..b1528e62b2589a5b282646040b56feeea4d8fe0c 100644 --- a/mace/tools/benchmark/BUILD.bazel +++ b/mace/tools/benchmark/BUILD.bazel @@ -28,10 +28,7 @@ cc_binary( "//mace/codegen:generated_models", "//mace/libmace", "//mace/utils", - ] + if_opencl_enabled([ - "//mace/codegen:generated_opencl_binary", - "//mace/codegen:generated_opencl_parameter", - ]), + ], ) cc_binary( @@ -60,10 +57,7 @@ cc_binary( "//mace/codegen:generated_mace_engine_factory", "//mace/codegen:generated_models", "//mace/libmace:libmace_dynamic", - ] + if_opencl_enabled([ - "//mace/codegen:generated_opencl_binary", - "//mace/codegen:generated_opencl_parameter", - ]), + ], ) cc_library( diff --git a/test/ccbenchmark/mace/ops/depth_to_space_benchmark.cc b/test/ccbenchmark/mace/ops/depth_to_space_benchmark.cc index 825fda2d44f381b3574e84a8462b5d64268615b2..dbc8f84899db2d12fd5eb95568b72a944a3ae937 100644 --- a/test/ccbenchmark/mace/ops/depth_to_space_benchmark.cc +++ b/test/ccbenchmark/mace/ops/depth_to_space_benchmark.cc @@ -78,9 +78,15 @@ void DepthToSpace( MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU) #endif +MACE_BM_DEPTH_TO_SPACE(1, 4, 512, 512, 2); +MACE_BM_DEPTH_TO_SPACE(1, 8, 256, 256, 2); +MACE_BM_DEPTH_TO_SPACE(1, 12, 512, 512, 2); +MACE_BM_DEPTH_TO_SPACE(1, 9, 512, 512, 3); +MACE_BM_DEPTH_TO_SPACE(1, 18, 256, 256, 3); +MACE_BM_DEPTH_TO_SPACE(1, 27, 512, 512, 3); MACE_BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4); -MACE_BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4); -MACE_BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4); +MACE_BM_DEPTH_TO_SPACE(1, 128, 128, 128, 4); +MACE_BM_DEPTH_TO_SPACE(1, 256, 256, 256, 4); } // namespace test } // namespace ops diff --git a/test/ccbenchmark/mace/ops/space_to_depth_benchmark.cc b/test/ccbenchmark/mace/ops/space_to_depth_benchmark.cc index e49214e2a5e01292a7944de4ed8a26a40ca73f88..5a8e6c9dc9724944e5ec9f0b34bbcefa0ed09201 100644 --- a/test/ccbenchmark/mace/ops/space_to_depth_benchmark.cc +++ b/test/ccbenchmark/mace/ops/space_to_depth_benchmark.cc @@ -29,7 +29,7 @@ void SpaceToDepth( // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, height, channels, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { @@ -78,6 +78,10 @@ void SpaceToDepth( MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU) #endif +MACE_BM_SPACE_TO_DEPTH(1, 1, 513, 513, 3); +MACE_BM_SPACE_TO_DEPTH(1, 2, 256, 256, 2); +MACE_BM_SPACE_TO_DEPTH(1, 3, 512, 512, 2); +MACE_BM_SPACE_TO_DEPTH(1, 3, 513, 513, 3); MACE_BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4); MACE_BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4); MACE_BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4); diff --git a/test/ccunit/mace/ops/depth_to_space_test.cc b/test/ccunit/mace/ops/depth_to_space_test.cc index 65fb7d39e3f3ace225db18969648e64959a71455..3bf32efa3c849f3c25a872fe1c989c18c872d037 100644 --- a/test/ccunit/mace/ops/depth_to_space_test.cc +++ b/test/ccunit/mace/ops/depth_to_space_test.cc @@ -61,17 +61,103 @@ void RunDepthToSpace(const std::vector &input_shape, class DepthToSpaceOpTest : public OpsTestBase {}; -TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) { +TEST_F(DepthToSpaceOpTest, CPUInputDepthLess4) { + RunDepthToSpace( + {1, 1, 2, 9}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}, + 3, + {1, 3, 6, 1}, + {0, 1, 2, 9, 10, 11, + 3, 4, 5, 12, 13, 14, + 6, 7, 8, 15, 16, 17}); + RunDepthToSpace( + {1, 1, 2, 18}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}, + 3, + {1, 3, 6, 2}, + {0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23, + 6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29, + 12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35}); + RunDepthToSpace( + {1, 1, 2, 12}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, + 2, + {1, 2, 4, 3}, + {0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17, + 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23}); + RunDepthToSpace( + {1, 1, 1, 27}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26}, + 3, + {1, 3, 3, 3}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26}); +} + +TEST_F(DepthToSpaceOpTest, CPUInputDepth4) { RunDepthToSpace( {1, 1, 2, 16}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 2, {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, + {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}); + RunDepthToSpace( + {1, 1, 1, 16}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); +} + +TEST_F(DepthToSpaceOpTest, OPENCLInputDepth1) { + RunDepthToSpace( + {1, 1, 2, 9}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}, + 3, + {1, 3, 6, 1}, + {0, 1, 2, 9, 10, 11, + 3, 4, 5, 12, 13, 14, + 6, 7, 8, 15, 16, 17}); +} + +TEST_F(DepthToSpaceOpTest, OPENCLInputDepth2) { + RunDepthToSpace( + {1, 1, 2, 18}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}, + 3, + {1, 3, 6, 2}, + {0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23, + 6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29, + 12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35}); +} + +TEST_F(DepthToSpaceOpTest, OPENCLInputDepth3) { + RunDepthToSpace( + {1, 1, 2, 12}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, + 2, + {1, 2, 4, 3}, + {0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17, + 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23}); + RunDepthToSpace( + {1, 1, 1, 27}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26}, + 3, + {1, 3, 3, 3}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26}); } -TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { +TEST_F(DepthToSpaceOpTest, OPENCLInputDepth4) { RunDepthToSpace( {1, 1, 2, 16}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -79,29 +165,12 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { 2, {1, 2, 4, 4}, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}); -} - -TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) { - RunDepthToSpace( - {1, 1, 1, 16}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); -} - -TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { RunDepthToSpace( {1, 1, 1, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); } -TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) { - const std::vector in = std::vector(192 * 192 * 128, 1.0); - - RunDepthToSpace({1, 192, 192, 128}, in, 2, - {1, 384, 384, 32}, in); -} - namespace { template void RandomTest(const int block_size, @@ -149,14 +218,51 @@ void RandomTest(const int block_size, } } // namespace +TEST_F(DepthToSpaceOpTest, OPENCLRandomFloatDepth1) { + RandomTest(2, {1, 192, 192, 4}); + RandomTest(3, {1, 111, 111, 9}); + RandomTest(5, {1, 20, 20, 25}); + RandomTest(7, {1, 14, 14, 49}); +} + +TEST_F(DepthToSpaceOpTest, OPENCLRandomFloatDepth2) { + RandomTest(2, {1, 192, 192, 8}); + RandomTest(3, {1, 111, 111, 18}); + RandomTest(5, {1, 20, 20, 50}); + RandomTest(7, {1, 14, 14, 98}); +} + +TEST_F(DepthToSpaceOpTest, OPENCLRandomFloatDepth3) { + RandomTest(2, {1, 192, 192, 12}); + RandomTest(3, {1, 111, 111, 27}); + RandomTest(5, {1, 20, 20, 75}); + RandomTest(7, {1, 14, 14, 147}); +} + TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) { - RandomTest(2, {1, 192, 192, 128}); + RandomTest(2, {1, 192, 192, 16}); + RandomTest(3, {1, 222, 222, 144}); + RandomTest(5, {1, 100, 100, 200}); + RandomTest(7, {1, 98, 98, 196}); } TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) { - RandomTest(2, {1, 192, 192, 128}); + RandomTest(2, {1, 192, 192, 4}); + RandomTest(3, {1, 111, 111, 18}); + RandomTest(5, {1, 20, 20, 75}); + RandomTest(7, {1, 14, 14, 147}); + RandomTest(2, {1, 384, 384, 8}); } +TEST_F(DepthToSpaceOpTest, OPENCLRandomBatchHalf) { + RandomTest(2, {2, 192, 192, 4}); + RandomTest(3, {3, 111, 111, 18}); + RandomTest(5, {2, 20, 20, 75}); + RandomTest(7, {3, 14, 14, 147}); + RandomTest(2, {2, 384, 384, 8}); +} + + } // namespace test } // namespace ops } // namespace mace diff --git a/test/ccunit/mace/ops/space_to_depth_test.cc b/test/ccunit/mace/ops/space_to_depth_test.cc index 6d023b88c9873d5e0d9b63cf54eebf1695594209..226083b71344ffdbe22266b30e53f333cfc2d8fc 100644 --- a/test/ccunit/mace/ops/space_to_depth_test.cc +++ b/test/ccunit/mace/ops/space_to_depth_test.cc @@ -61,38 +61,106 @@ void RunSpaceToDepth(const std::vector &input_shape, class SpaceToDepthOpTest : public OpsTestBase {}; -TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) { +TEST_F(SpaceToDepthOpTest, CPUInputDepthLess4) { RunSpaceToDepth( - {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, - 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, - 2, {1, 1, 2, 16}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); + {1, 3, 6, 1}, + {0, 1, 2, 9, 10, 11, + 3, 4, 5, 12, 13, 14, + 6, 7, 8, 15, 16, 17}, + 3, + {1, 1, 2, 9}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); + RunSpaceToDepth( + {1, 3, 6, 2}, + {0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23, + 6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29, + 12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35}, + 3, + {1, 1, 2, 18}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}); + RunSpaceToDepth( + {1, 2, 4, 3}, + {0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17, + 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23}, + 2, + {1, 1, 2, 12}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); + RunSpaceToDepth( + {1, 3, 3, 3}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26}, + 3, + {1, 1, 1, 27}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26}); } -TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) { - RunSpaceToDepth( +TEST_F(SpaceToDepthOpTest, CPUInputDepth4) { + RunSpaceToDepth( {1, 2, 4, 4}, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, 2, {1, 1, 2, 16}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); -} - -TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) { RunSpaceToDepth( {1, 2, 2, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); } -TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) { +TEST_F(SpaceToDepthOpTest, OPENCLInputDepth1) { RunSpaceToDepth( - {1, 2, 2, 4}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + {1, 3, 6, 1}, + {0, 1, 2, 9, 10, 11, + 3, 4, 5, 12, 13, 14, + 6, 7, 8, 15, 16, 17}, + 3, {1, 1, 2, 9}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); +} + +TEST_F(SpaceToDepthOpTest, OPENCLInputDepth2) { + RunSpaceToDepth( + {1, 3, 6, 2}, + {0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23, + 6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29, + 12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35}, + 3, {1, 1, 2, 18}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}); +} + +TEST_F(SpaceToDepthOpTest, OPENCLInputDepth3) { + RunSpaceToDepth( + {1, 2, 4, 3}, + {0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17, + 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23}, + 2, {1, 1, 2, 12}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); + RunSpaceToDepth( + {1, 3, 3, 3}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26}, + 3, {1, 1, 1, 27}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26}); +} + +TEST_F(SpaceToDepthOpTest, OPENCLInputDepth4) { + RunSpaceToDepth( + {1, 2, 4, 4}, + {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, + 2, {1, 1, 2, 16}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); } namespace { @@ -141,14 +209,50 @@ void RandomTest(const int block_size, } } // namespace +TEST_F(SpaceToDepthOpTest, OPENCLRandomFloatDepth1) { + RandomTest(2, {1, 384, 384, 1}); + RandomTest(3, {1, 333, 333, 1}); + RandomTest(5, {1, 100, 100, 1}); + RandomTest(7, {1, 98, 98, 1}); +} + +TEST_F(SpaceToDepthOpTest, OPENCLRandomFloatDepth2) { + RandomTest(2, {1, 384, 384, 2}); + RandomTest(3, {1, 333, 333, 2}); + RandomTest(5, {1, 100, 100, 2}); + RandomTest(7, {1, 98, 98, 2}); +} + +TEST_F(SpaceToDepthOpTest, OPENCLRandomFloatDepth3) { + RandomTest(2, {1, 384, 384, 3}); + RandomTest(3, {1, 333, 333, 3}); + RandomTest(5, {1, 100, 100, 3}); + RandomTest(7, {1, 98, 98, 3}); +} + TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) { - RandomTest(2, {1, 384, 384, 32}); + RandomTest(2, {1, 384, 384, 4}); + RandomTest(3, {1, 333, 333, 16}); + RandomTest(5, {1, 100, 100, 32}); + RandomTest(7, {1, 98, 98, 64}); } TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) { + RandomTest(2, {1, 384, 384, 1}); + RandomTest(3, {1, 333, 333, 2}); + RandomTest(5, {1, 100, 100, 3}); + RandomTest(7, {1, 98, 98, 4}); RandomTest(2, {1, 384, 384, 32}); } +TEST_F(SpaceToDepthOpTest, OPENCLBatchRandomHalf) { + RandomTest(2, {2, 384, 384, 1}); + RandomTest(3, {3, 333, 333, 2}); + RandomTest(5, {2, 100, 100, 3}); + RandomTest(7, {3, 98, 98, 4}); + RandomTest(2, {2, 384, 384, 32}); +} + } // namespace test } // namespace ops } // namespace mace