diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 12d2fd0cd3ff1908dc0b35e32dd8ca48b6db8c5c..8dfdce0b3dcd89bfe0640a51c8b1533c42ca5b11 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -13,7 +13,7 @@ namespace kernels { template struct DepthToSpaceOpFunctor { - DepthToSpaceOpFunctor(const int block_size) : block_size_(block_size) {} + explicit DepthToSpaceOpFunctor(const int block_size) : block_size_(block_size) {} void operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -22,6 +22,13 @@ struct DepthToSpaceOpFunctor { const int input_height = input->dim(1); const int input_width = input->dim(2); const int input_depth = input->dim(3); + + std::cout << "input shape: {" << batch_size <<", "; + std::cout << input_height << ", "; + std::cout << input_width << ", "; + std::cout << input_depth << ", "; + + std::cout << "block size= " << block_size_<Resize(output_shape); - Tensor::MappingGuard logits_guard(input); - Tensor::MappingGuard output_guard(output); + // Tensor::MappingGuard logits_guard(input); + // Tensor::MappingGuard output_guard(output); const T *input_ptr = input->data(); T *output_ptr = output->mutable_data(); @@ -52,8 +64,8 @@ struct DepthToSpaceOpFunctor { for (int d = 0; d < output_depth; ++d) { const int in_d = d + offset_d; const int o_index = ((b * output_height + h) * output_width + w) * output_depth + d; - const int i_index = ((b * input_height + in_h) * input_width + in_w) * input_depth + in_d; - output_ptr[o_index] = input[i_index]; + const int i_index = ((b * input_height + in_h) * input_width + in_w) * input_depth + in_d; + output_ptr[o_index] = input_ptr[i_index]; } } } @@ -62,7 +74,12 @@ struct DepthToSpaceOpFunctor { } const int block_size_; }; - +/* +template <> +void DepthToSpaceOpFunctor::operator()(const Tensor *input, + Tensor *output, + StatsFuture *future); +*/ template struct DepthToSpaceOpFunctor { diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl index 2a193a23148b2b79e210adea5a967a84413d26e9..69ddfdbaa11d4e6545563f4ea0e0cb55b3f3c40d 100644 --- a/mace/kernels/opencl/cl/depth_to_space.cl +++ b/mace/kernels/opencl/cl/depth_to_space.cl @@ -1,23 +1,26 @@ #include -// assume channes_per_group mod 4 = 0 && groups mod 4 == 0 -__kernel void channel_shuffle(__read_only image2d_t input, - __private const int groups, - __private const int channels_per_group, +__kernel void depth_to-space(__read_only image2d_t input, + __private const int block_size, + __private const int batch_size, + __private const int input_height, + __private const int input_width, + __private const int input_depth, + __private const int output_height, + __private const int output_width, + __private const int output_depth, __write_only image2d_t output) { - const int group_chan_blk_idx = get_global_id(0); - const int width_idx = get_global_id(1); + const int ch_blk = get_global_id(0); + const int w = get_global_id(1); + const int hb = get_global_id(2); const int width = get_global_size(1); - const int hb_idx = get_global_id(2); - const int group_blks = groups / 4; - const int groups_blks_width = group_blks * width; - const int channels_per_group_blks = channels_per_group / 4; - const int channels_per_group_blks_width = channels_per_group_blks * width; - DATA_TYPE4 in_chan_data0, in_chan_data1, in_chan_data2, in_chan_data3; - DATA_TYPE4 out_chan_data0, out_chan_data1, out_chan_data2, out_chan_data3; - - int in_x = mad24(group_chan_blk_idx, width, width_idx); + const int out_idx = mad24(ch_blk, width, w); + + const int d = out_idx % output_depth; + const int out_idx2 = out_idx / output_depth; + const int w = out_idx2 % output_width + for (short g_blk = 0; g_blk < group_blks; ++g_blk) { // fetch 4 groups, for each group fetch 4 channels in_chan_data0 = READ_IMAGET(input, SAMPLER, (int2)(in_x, hb_idx)); diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index 7a71e507987f4879f1213487ff112f2e7406888d..e390a812b2c644a059797852a45ef75ef9859043 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -13,7 +13,7 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) { .TypeConstraint("T") .Build(), DepthToSpaceOp); - +/* REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") .Device(DeviceType::OPENCL) .TypeConstraint("T") @@ -25,6 +25,7 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) { .TypeConstraint("T") .Build(), DepthToSpaceOp); +*/ } } // namespace ops diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h index e2cd5fb99072c7b1aacb8c75c2d1c5ec7f7a6518..808cb71591f8c1441be3dc3d24301d25b9c57a66 100644 --- a/mace/ops/depth_to_space.h +++ b/mace/ops/depth_to_space.h @@ -19,30 +19,31 @@ class DepthToSpaceOp : public Operator { public: DepthToSpaceOp(const OperatorDef &op_def, Workspace *ws) : Operator(op_def, ws), - block_size_(OperatorBase::GetSingleArgument("block_size", 1)), - functor_(this->block_size_) {} + functor_(OperatorBase::GetSingleArgument("block_size", 1)) {} bool Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); + + const int block_size = OperatorBase::GetSingleArgument("block_size", 1); int input_depth = input->dim(3); - MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, + MACE_CHECK(input_depth % (block_size * block_size) == 0, "input depth should be dividable by block_size * block_size", input->dim(3)); - + std::cout << "arg block_size: " << block_size << std::endl; functor_(input, output, future); return true; } - private: - kernels::DepthToSpaceOpFunctor functor_; - + protected: - const int block_size_; OP_INPUT_TAGS(INPUT); OP_OUTPUT_TAGS(OUTPUT); + + private: + kernels::DepthToSpaceOpFunctor functor_; }; diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc index a3356b963bef6f779cbc797f8f75f9a3ad26cd8d..e33349a607c283d3e3ae781070a72519a0c4174d 100644 --- a/mace/ops/depth_to_space_benchmark.cc +++ b/mace/ops/depth_to_space_benchmark.cc @@ -21,7 +21,7 @@ static void DepthToSpace( net.AddRandomInput("Input", {batch, height, width, channels}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("DepthToSpace", "DepthToSpaceBM") diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index 7cf3d9f33d3b720f87b3ef9daff9372ffc4f0a5c..bbbb39b5ce95e3d4fe69cfa49df7d0787ab209a0 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -17,20 +17,20 @@ TEST_F(DepthToSpaceOpTest, C8G4_CPU) { OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("Input") .Output("Output") - .AddIntArg("block_size", 1) + .AddIntArg("block_size", 2) .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( - "Input", {1, 1, 2, 8}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + "Input", {1, 2, 2, 4}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); // Run net.RunOp(); // Check auto expected = CreateTensor( - {1, 1, 2, 8}, {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}); + {1, 4, 4, 1}, {1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -44,7 +44,7 @@ TEST_F(DepthToSpaceOpTest, C16G4_OPENCL) { "Input", {1, 1, 2, 16}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); - BufferToImage(net, "Input", "InputImage", + BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") @@ -57,7 +57,7 @@ TEST_F(DepthToSpaceOpTest, C16G4_OPENCL) { net.RunOp(DeviceType::OPENCL); // Transfer output - ImageToBuffer(net, "OutputImage", "Output", + ImageToBuffer(&net, "OutputImage", "Output", kernels::BufferType::IN_OUT_CHANNEL); // Check