diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 488b291d6df1061f95cccd0f89a492046eb4aa08..dc86cffdb325c0308b63fd201837f32e6d11a40d 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -147,6 +147,7 @@ const std::map<std::string, std::string>
     {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
     {"pooling", "pooling.cl"},
     {"relu", "relu.cl"},
+    {"concat", "concat.cl"},
     {"resize_bilinear", "resize_bilinear.cl"},
     {"space_to_batch", "space_to_batch.cl"},
     {"buffer_to_image", "buffer_to_image.cl"},
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index 807fda0aa8c1ae76538529c61afdd7c7c01d3827..e70b4e73c977b9d8da0735784219739c5dbd468a 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -8,25 +8,64 @@
 #include "mace/core/common.h"
 #include "mace/core/types.h"
 #include "mace/proto/mace.pb.h"
+#include "mace/core/tensor.h"
+
 namespace mace {
 namespace kernels {
 
-template <DeviceType D, typename T>
-struct ConcatFunctor {
-  void operator()(std::vector<const T *> &input_list,
-                  const index_t inner_dim,
-                  const index_t *outer_dims,
-                  T *output) {
-    const size_t input_count = input_list.size();
-    for (int inner_idx = 0; inner_idx < inner_dim; ++inner_idx) {
-      for (size_t i = 0; i < input_count; ++i) {
+struct ConcatFunctorBase {
+  ConcatFunctorBase(const int32_t axis): axis_(axis){}
+
+  int32_t axis_;
+};
+
+template <DeviceType D, typename T>
+struct ConcatFunctor : ConcatFunctorBase {
+  ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
+
+  void operator()(const std::vector<const Tensor *> &input_list,
+                  Tensor *output) {
+    const Tensor *input0 = input_list.front();
+    const int inputs_count = input_list.size();
+
+    std::vector<index_t> output_shape(input0->shape());
+    index_t inner_size = 1;
+    for (int i = 0; i < axis_; ++i) {
+      inner_size *= output_shape[i];
+    }
+    std::vector<index_t> outer_sizes(inputs_count, 0);
+    outer_sizes[0] = input0->size() / inner_size;
+    for (int i = 1; i < inputs_count; ++i) {
+      const Tensor *input = input_list[i];
+      MACE_CHECK(input->dim_size() == input0->dim_size(),
+                 "Ranks of all input tensors must be same.");
+      for (int j = 0; j < input->dim_size(); ++j) {
+        if (j == axis_) {
+          continue;
+        }
+        MACE_CHECK(input->dim(j) == input0->dim(j),
+                   "Dimensions of inputs should equal except axis.");
+      }
+      outer_sizes[i] = input->size() / inner_size;
+      output_shape[axis_] += input->dim(axis_);
+    }
+    output->Resize(output_shape);
+
+    T *output_ptr = output->mutable_data<T>();
+
+    std::vector<const T *> input_ptrs(input_list.size(), nullptr);
+    for (size_t i = 0; i < inputs_count; ++i) {
+      input_ptrs[i] = input_list[i]->data<T>();
+    }
+    for (int inner_idx = 0; inner_idx < inner_size; ++inner_idx) {
+      for (size_t i = 0; i < inputs_count; ++i) {
         if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
-          memcpy(output, input_list[i], outer_dims[i] * sizeof(T));
-          output += outer_dims[i];
-          input_list[i] += outer_dims[i];
+          memcpy(output_ptr, input_ptrs[i], outer_sizes[i] * sizeof(T));
+          output_ptr += outer_sizes[i];
+          input_ptrs[i] += outer_sizes[i];
         } else {
-          for (index_t k = 0; k < outer_dims[i]; ++k) {
-            *output++ = *input_list[i]++;
+          for (index_t k = 0; k < outer_sizes[i]; ++k) {
+            *output_ptr++ = *input_ptrs[i]++;
           }
         }
       }
@@ -34,6 +73,15 @@ struct ConcatFunctor {
   }
 };
 
+template <typename T>
+struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{
+  ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
+
+  void operator()(const std::vector<const Tensor *> &input_list,
+                  Tensor *output);
+
+};
+
 }  // namepsace kernels
 }  // namespace mace
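The CPU path above folds the old (input_list, inner_dim, outer_dims, output) interface into the functor itself: inner_size is the product of the dimensions before the concat axis, outer_sizes[i] is the number of elements each input contributes per inner slice, and the copy loop interleaves one chunk per input for every inner index. A minimal standalone sketch of the same decomposition (illustrative only, not part of the patch; the helper name and shapes are made up):

    // Concatenate two row-major {rows, a_cols} and {rows, b_cols} buffers along
    // axis 1. Here inner_size == rows and outer_sizes == {a_cols, b_cols}.
    #include <cstring>
    #include <vector>

    std::vector<float> ConcatAxis1(const std::vector<float> &a, int a_cols,
                                   const std::vector<float> &b, int b_cols,
                                   int rows) {
      std::vector<float> out(rows * (a_cols + b_cols));
      float *dst = out.data();
      for (int r = 0; r < rows; ++r) {  // one iteration per inner index
        std::memcpy(dst, a.data() + r * a_cols, a_cols * sizeof(float));
        dst += a_cols;                  // outer_sizes[0]
        std::memcpy(dst, b.data() + r * b_cols, b_cols * sizeof(float));
        dst += b_cols;                  // outer_sizes[1]
      }
      return out;
    }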
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl
new file mode 100644
index 0000000000000000000000000000000000000000..5ae4fb04e141f6510529083b2fee9e8826494b6d
--- /dev/null
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -0,0 +1,98 @@
+#include <common.h>
+
+DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
+                         DATA_TYPE4 right,
+                         const int pos,
+                         const bool reversed) {
+  if (!reversed) {
+    switch (pos) {
+      case 1:return (DATA_TYPE4)(left.x, right.x, right.y, right.z);
+      case 2:return (DATA_TYPE4)(left.x, left.y, right.x, right.y);
+      case 3:return (DATA_TYPE4)(left.x, left.y, left.z, right.x);
+      default:return (DATA_TYPE4) 0;
+    }
+  } else {
+    switch (pos) {
+      case 1:return (DATA_TYPE4)(left.w, right.x, right.y, right.z);
+      case 2:return (DATA_TYPE4)(left.z, left.w, right.x, right.y);
+      case 3:return (DATA_TYPE4)(left.y, left.z, left.w, right.x);
+      default:return (DATA_TYPE4) 0;
+    }
+  }
+}
+
+// Supported data type: half/float
+__kernel void concat_channel(__read_only image2d_t input0,
+                             __read_only image2d_t input1,
+                             __private const int input0_chan,
+                             __write_only image2d_t output) {
+  const int chan_blk_idx = get_global_id(0);
+  const int width_idx = get_global_id(1);
+  const int width = get_global_size(1);
+  const int hb_idx = get_global_id(2);
+  const int input0_chan_blk = (input0_chan + 3) / 4;
+
+  DATA_TYPE4 data = 0;
+#ifdef DIVISIBLE_FOUR
+  if (chan_blk_idx + 1 <= input0_chan_blk) {
+    data = READ_IMAGET(input0,
+                       SAMPLER,
+                       (int2)(chan_blk_idx * width + width_idx, hb_idx));
+  } else {
+    data = READ_IMAGET(input1,
+                       SAMPLER,
+                       (int2)((chan_blk_idx - input0_chan_blk) * width + width_idx, hb_idx));
+  }
+#else
+  if (chan_blk_idx + 1 < input0_chan_blk) {
+    data = READ_IMAGET(input0,
+                       SAMPLER,
+                       (int2)(chan_blk_idx * width + width_idx, hb_idx));
+  } else if (chan_blk_idx >= input0_chan_blk) {
+    const int in_chan_idx = chan_blk_idx - input0_chan_blk;
+    DATA_TYPE4 data0 = READ_IMAGET(input1,
+                                   SAMPLER,
+                                   (int2)(in_chan_idx * width + width_idx, hb_idx));
+    DATA_TYPE4 data1 = READ_IMAGET(input1,
+                                   SAMPLER,
+                                   (int2)((in_chan_idx + 1) * width + width_idx, hb_idx));
+    data = stitch_vector(data0, data1, input0_chan % 4, true);
+  } else {
+    DATA_TYPE4 data0 = READ_IMAGET(input0,
+                                   SAMPLER,
+                                   (int2)(chan_blk_idx * width + width_idx, hb_idx));
+    DATA_TYPE4 data1 = READ_IMAGET(input1,
+                                   SAMPLER,
+                                   (int2)(width_idx, hb_idx));
+    data = stitch_vector(data0, data1, input0_chan % 4, false);
+  }
+#endif
+
+  WRITE_IMAGET(output, (int2)(chan_blk_idx * width + width_idx, hb_idx), data);
+}
+
+//__kernel void concat_width(__read_only image2d_t input0,
+//                           __read_only image2d_t input1,
+//                           __private const int input0_width,
+//                           __write_only image2d_t output) {
+//  const int chan_blk_idx = get_global_id(0);
+//  const int width_idx = get_global_id(1);
+//  const int width = get_global_size(1);
+//  const int hb_idx = get_global_id(2);
+//
+//  const sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+//
+//  DATA_TYPE4 data = 0;
+//  if (width_idx < input0_width) {
+//    data = READ_IMAGET(input0,
+//                       SAMPLER,
+//                       (int2)(chan_blk_idx * width + width_idx, hb_idx));
+//  } else {
+//    data = READ_IMAGET(input1,
+//                       SAMPLER,
+//                       (int2)(chan_blk_idx * width + (width_idx - input0_width), hb_idx));
+//  }
+//
+//  WRITE_IMAGET(output, (int2)(chan_blk_idx * width + width_idx, hb_idx), data);
+//}
+
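When input0's channel count is not a multiple of four, every output channel block past that boundary straddles two source blocks, and stitch_vector splices the two 4-wide vectors at offset input0_chan % 4 (reversed == true once the block lies entirely inside input1). A host-side analogue of that splice, illustrative only and not part of the patch:

    // With input0_chan = 6 (so pos == 2), the output block starting at channel 8
    // must carry input1 channels {2,3,4,5}; a reversed stitch of input1 blocks
    // {0,1,2,3} and {4,5,6,7} yields exactly that.
    #include <cassert>

    struct Vec4 { float x, y, z, w; };

    Vec4 Stitch(const Vec4 &left, const Vec4 &right, int pos, bool reversed) {
      if (!reversed) {
        switch (pos) {
          case 1: return {left.x, right.x, right.y, right.z};
          case 2: return {left.x, left.y, right.x, right.y};
          case 3: return {left.x, left.y, left.z, right.x};
          default: return {0, 0, 0, 0};
        }
      } else {
        switch (pos) {
          case 1: return {left.w, right.x, right.y, right.z};
          case 2: return {left.z, left.w, right.x, right.y};
          case 3: return {left.y, left.z, left.w, right.x};
          default: return {0, 0, 0, 0};
        }
      }
    }

    int main() {
      Vec4 blk0{0, 1, 2, 3}, blk1{4, 5, 6, 7};     // two adjacent input1 blocks
      Vec4 out = Stitch(blk0, blk1, 6 % 4, true);  // reversed stitch at pos 2
      assert(out.x == 2 && out.y == 3 && out.z == 4 && out.w == 5);
      return 0;
    }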
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80a23a44ccdbb50cd2be9976f2dfecf8d0576a91
--- /dev/null
+++ b/mace/kernels/opencl/concat.cc
@@ -0,0 +1,102 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/kernels/concat.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"
+
+namespace mace {
+namespace kernels {
+
+static void Concat2(const Tensor *input0,
+                    const Tensor *input1,
+                    const DataType dt,
+                    Tensor *output) {
+  const index_t batch = output->dim(0);
+  const index_t height = output->dim(1);
+  const index_t width = output->dim(2);
+  const index_t channel = output->dim(3);
+
+  const int channel_blk = RoundUpDiv4(channel);
+
+  auto runtime = OpenCLRuntime::Get();
+  std::set<std::string> built_options;
+  if (input0->dtype() == output->dtype()) {
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+  } else {
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+  }
+  if (input0->dim(3) % 4 == 0) {
+    built_options.emplace("-DDIVISIBLE_FOUR");
+  }
+  auto concat_kernel = runtime->BuildKernel("concat", "concat_channel", built_options);
+
+  uint32_t idx = 0;
+  concat_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input0->buffer())));
+  concat_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input1->buffer())));
+  concat_kernel.setArg(idx++, static_cast<int32_t>(input0->dim(3)));
+  concat_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
+
+  uint32_t lws[3];
+  lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
+  lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
+  lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
+
+  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+      concat_kernel, cl::NullRange,
+      cl::NDRange(static_cast<uint32_t>(channel_blk),
+                  static_cast<uint32_t>(width),
+                  static_cast<uint32_t>(height * batch)),
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+  MACE_CHECK(error == CL_SUCCESS);
+}
+
+template <typename T>
+void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list,
+                                                      Tensor *output) {
+  const int inputs_count = input_list.size();
+  MACE_CHECK(inputs_count == 2 && axis_ == 3)
+    << "Concat opencl kernel only support two elements with axis == 3";
+
+  const Tensor *input0 = input_list[0];
+
+  std::vector<index_t> output_shape(input0->shape());
+  for (int i = 1; i < inputs_count; ++i) {
+    const Tensor *input = input_list[i];
+    MACE_CHECK(input->dim_size() == input0->dim_size(),
+               "Ranks of all input tensors must be same.");
+    for (int j = 0; j < input->dim_size(); ++j) {
+      if (j == axis_) {
+        continue;
+      }
+      MACE_CHECK(input->dim(j) == input0->dim(j),
+                 "Dimensions of inputs should equal except axis.");
+    }
+    output_shape[axis_] += input->dim(axis_);
+  }
+  std::vector<size_t> image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT, image_shape);
+  output->ResizeImage(output_shape, image_shape);
+
+  switch (inputs_count) {
+    case 2:
+      Concat2(input_list[0], input_list[1], DataTypeToEnum<T>::value, output);
+      break;
+    default:MACE_NOT_IMPLEMENTED;
+  }
+};
+
+template
+struct ConcatFunctor<DeviceType::OPENCL, float>;
+template
+struct ConcatFunctor<DeviceType::OPENCL, half>;
+
+}  // namespace kernels
+}  // namespace mace
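Concat2 launches one work-item per (channel block, width, batch*height) position and clamps the local work-group size so its product never exceeds the kernel's reported maximum; each dimension consumes whatever the previous dimensions left of that budget. A standalone sketch of that clamping (illustrative only, not part of the patch; the function name is made up and every gws[i] and kwg_size are assumed to be at least 1):

    #include <algorithm>
    #include <cstdint>

    // Mirrors the lws computation in Concat2 above.
    void ClampLocalWorkGroup(const uint32_t gws[3], uint32_t kwg_size,
                             uint32_t lws[3]) {
      lws[0] = std::min(gws[0], kwg_size);                      // channel blocks
      lws[1] = std::min(gws[1], kwg_size / lws[0]);             // width
      lws[2] = std::min(gws[2], kwg_size / (lws[0] * lws[1]));  // batch * height
    }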
diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc
index df040904bff47587143f4580c07516444341a7b6..929da85f2f36c8c8cff02608523475e14f5cbc47 100644
--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -7,8 +7,20 @@
 namespace mace {
 
 REGISTER_CPU_OPERATOR(OpKeyBuilder("Concat")
-                          .TypeConstraint<float>("T")
-                          .Build(),
+                        .TypeConstraint<float>("T")
+                        .Build(),
                       ConcatOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(OpKeyBuilder("Concat")
+                        .TypeConstraint<half>("T")
+                        .Build(),
+                      ConcatOp<DeviceType::CPU, half>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Concat")
+                           .TypeConstraint<float>("T")
+                           .Build(),
+                         ConcatOp<DeviceType::OPENCL, float>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Concat")
+                           .TypeConstraint<half>("T")
+                           .Build(),
+                         ConcatOp<DeviceType::OPENCL, half>);
 
 }  // namespace mace
diff --git a/mace/ops/concat.h b/mace/ops/concat.h
index 2b82c0cb1647a0505b6125ef0b837d1136af53a1..77e430304c93341c176dd732c30559f0721e4f8a 100644
--- a/mace/ops/concat.h
+++ b/mace/ops/concat.h
@@ -14,53 +14,23 @@ template <DeviceType D, typename T>
 class ConcatOp : public Operator<D, T> {
  public:
   ConcatOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator<D, T>(op_def, ws) {}
+      : Operator<D, T>(op_def, ws),
+        functor_(OperatorBase::GetSingleArgument<int>("axis", 3)){}
 
   bool Run() override {
-    int32_t values_count = this->InputSize() - 1;
-    const Tensor *input0 = this->Input(0);
-    const Tensor *axis_tensor = this->Input(values_count);
-    MACE_CHECK(axis_tensor->dim_size() == 0,
-               "axis should be a scalar integer, but got shape: ",
-               axis_tensor->dim_size());
-    const int32_t concat_axis = *(axis_tensor->data<int32_t>());
-    const int32_t input_dims = input0->dim_size();
+    MACE_CHECK(this->InputSize() >= 2) << "There must be at least two inputs to concat";
+    const std::vector<const Tensor *> input_list = this->Inputs();
+    const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
+    const int32_t input_dims = input_list[0]->dim_size();
     const int32_t axis =
         concat_axis < 0 ? concat_axis + input_dims : concat_axis;
     MACE_CHECK((0 <= axis && axis < input_dims),
                "Expected concatenating axis in the range [", -input_dims, ", ",
                input_dims, "], but got", concat_axis);
-    std::vector<index_t> output_shape(input0->shape());
-    index_t inner_size = 1;
-    for (int i = 0; i < axis; ++i) {
-      inner_size *= output_shape[i];
-    }
-    std::vector<index_t> outer_sizes(values_count, 0);
-    std::vector<const T *> input_list(values_count, nullptr);
-    input_list[0] = input0->data<T>();
-    outer_sizes[0] = input0->size() / inner_size;
-    const Tensor *input = nullptr;
-    for (int i = 1; i < values_count; ++i) {
-      input = this->Input(i);
-      MACE_CHECK(input->dim_size() == input0->dim_size(),
-                 "Ranks of all input tensors must be same.");
-      for (int j = 0; j < axis_tensor->dim_size(); ++j) {
-        if (j == axis) {
-          continue;
-        }
-        MACE_CHECK(input->dim(j) == input0->dim(j),
-                   "Dimensions of inputs should equal except axis.");
-      }
-      input_list[i] = input->data<T>();
-      outer_sizes[i] = input->size() / inner_size;
-      output_shape[axis] += input->dim(axis);
-    }
 
     Tensor *output = this->Output(OUTPUT);
-    output->Resize(output_shape);
-    functor_(input_list, inner_size, outer_sizes.data(),
-             output->mutable_data<T>());
+    functor_(input_list, output);
 
     return true;
   }
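With this change the operator reads the concat axis from an integer argument named "axis" (default 3) instead of a trailing scalar "Axis" input tensor, so op definitions are built as in the sketch below. Illustrative only, mirroring the updated tests and benchmarks; the wrapper function name is made up and the OpsTestNet/OpDefBuilder helpers and header path are assumed from the existing ops test utilities:

    #include "mace/ops/ops_test_util.h"  // assumed location of the test helpers

    // Hypothetical helper: builds a Concat op def on an existing test net.
    void BuildConcatDef(mace::OpsTestNet &net) {
      mace::OpDefBuilder("Concat", "ConcatExample")
          .Input("Input0")
          .Input("Input1")
          .AddIntArg("axis", 3)  // was: .Input("Axis") plus a scalar int tensor
          .Output("Output")
          .Finalize(net.NewOperatorDef());
    }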
diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc
index 275886a6d345293b5a8a965ef442ea99932a8fba..0e4425251426e8056a2973352ded79cd2b4f88ef 100644
--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -15,7 +15,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
   OpDefBuilder("Concat", "ConcatBM")
       .Input("Input0")
       .Input("Input1")
-      .Input("Axis")
+      .AddIntArg("axis", concat_dim)
       .Output("Output")
       .Finalize(net.NewOperatorDef());
 
@@ -23,7 +23,6 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
   const int kDim0 = 100;
   net.AddRandomInput<DeviceType::CPU, float>("Input0", {kDim0, dim1});
   net.AddRandomInput<DeviceType::CPU, float>("Input1", {kDim0, dim1});
-  net.AddInputFromArray<DeviceType::CPU, int32_t>("Axis", {}, {concat_dim});
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
@@ -38,14 +37,65 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
   }
 }
 
-static void BM_ConcatDim0Float(int iters, int dim1) {
+static void BM_CONCAT_Dim0Float(int iters, int dim1) {
   ConcatHelper(iters, 0, dim1);
 }
 
-static void BM_ConcatDim1Float(int iters, int dim1) {
+static void BM_CONCAT_Dim1Float(int iters, int dim1) {
   ConcatHelper(iters, 1, dim1);
 }
 
-BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000);
-BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000);
+BENCHMARK(BM_CONCAT_Dim0Float)->Arg(1000)->Arg(100000);
+BENCHMARK(BM_CONCAT_Dim1Float)->Arg(1000)->Arg(100000);
+
+template <typename T>
+static void OpenclConcatHelper(int iters,
+                               const std::vector<index_t> &shape0,
+                               const std::vector<index_t> &shape1,
+                               int concat_dim) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+
+  // Add input data
+  net.AddRandomInput<DeviceType::OPENCL, float>("Input0", shape0);
+  net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1);
+
+  BufferToImage<T>(net, "Input0", "InputImage0", kernels::BufferType::IN_OUT);
+  BufferToImage<T>(net, "Input1", "InputImage1", kernels::BufferType::IN_OUT);
+  OpDefBuilder("Concat", "ConcatBM")
+      .Input("InputImage0")
+      .Input("InputImage1")
+      .AddIntArg("axis", concat_dim)
+      .Output("OutputImage")
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+
+  // Warm-up
+  for (int i = 0; i < 5; ++i) {
+    net.RunOp(DeviceType::OPENCL);
+  }
+
+  const int64_t tot = static_cast<int64_t>(iters) *
+      (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
+  mace::testing::ItemsProcessed(tot);
+  testing::BytesProcessed(tot * sizeof(T));
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.RunOp(DeviceType::OPENCL);
+  }
+}
+
+static void BM_CONCATOPENCLFloat(int iters, int dim1) {
+  std::vector<index_t> shape = {3, 32, 32, dim1};
+  OpenclConcatHelper<float>(iters, shape, shape, 3);
+}
+
+static void BM_CONCATOPENCLHalf(int iters, int dim1) {
+  std::vector<index_t> shape = {3, 32, 32, dim1};
+  OpenclConcatHelper<half>(iters, shape, shape, 3);
+}
+
+BENCHMARK(BM_CONCATOPENCLFloat)->Arg(32)->Arg(64)->Arg(128)->Arg(256);
+BENCHMARK(BM_CONCATOPENCLHalf)->Arg(32)->Arg(64)->Arg(128)->Arg(256);
 
 }  // namespace mace
\ No newline at end of file
diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc
index 1d94d4679a5f9292b744444f1a24e3719f35f3c1..8a42899e49183dd35fcfaf804679b4807219688b 100644
--- a/mace/ops/concat_test.cc
+++ b/mace/ops/concat_test.cc
@@ -10,13 +10,13 @@ using namespace mace;
 
 class ConcatOpTest : public OpsTestBase {};
 
-TEST_F(ConcatOpTest, Simple_Horizon) {
+TEST_F(ConcatOpTest, CPUSimpleHorizon) {
   // Construct graph
   auto &net = test_net();
   OpDefBuilder("Concat", "ConcatTest")
       .Input("Input0")
      .Input("Input1")
-      .Input("Axis")
+      .AddIntArg("axis", 0)
       .Output("Output")
       .Finalize(net.NewOperatorDef());
 
@@ -28,7 +28,6 @@ TEST_F(ConcatOpTest, Simple_Horizon) {
   // Add inputs
   net.AddInputFromArray<DeviceType::CPU, float>("Input0", input_shape, input0);
   net.AddInputFromArray<DeviceType::CPU, float>("Input1", input_shape, input1);
-  net.AddInputFromArray<DeviceType::CPU, int32_t>("Axis", {}, {0});
 
   // Run
   net.RunOp();
@@ -48,13 +47,13 @@ TEST_F(ConcatOpTest, Simple_Horizon) {
   }
 }
 
-TEST_F(ConcatOpTest, Simple_Vertical) {
+TEST_F(ConcatOpTest, CPUSimpleVertical) {
   // Construct graph
   auto &net = test_net();
   OpDefBuilder("Concat", "ConcatTest")
       .Input("Input0")
       .Input("Input1")
-      .Input("Axis")
+      .AddIntArg("axis", 1)
       .Output("Output")
       .Finalize(net.NewOperatorDef());
 
@@ -66,7 +65,6 @@ TEST_F(ConcatOpTest, Simple_Vertical) {
   // Add inputs
   net.AddInputFromArray<DeviceType::CPU, float>("Input0", input_shape, input0);
   net.AddInputFromArray<DeviceType::CPU, float>("Input1", input_shape, input1);
-  net.AddInputFromArray<DeviceType::CPU, int32_t>("Axis", {}, {1});
 
   // Run
   net.RunOp();
@@ -88,7 +86,7 @@ TEST_F(ConcatOpTest, Simple_Vertical) {
   }
 }
 
-TEST_F(ConcatOpTest, Random) {
+TEST_F(ConcatOpTest, CPURandom) {
   srand(time(nullptr));
   int dim = 5;
   int num_inputs = 2 + rand() % 10;
@@ -99,7 +97,7 @@ TEST_F(ConcatOpTest, Random) {
   for (int i = 0; i < num_inputs; ++i) {
     builder = builder.Input(("Input" + ToString(i)).c_str());
   }
-  builder.Input("Axis").Output("Output").Finalize(net.NewOperatorDef());
+  builder.AddIntArg("axis", axis).Output("Output").Finalize(net.NewOperatorDef());
 
   std::vector<index_t> shape_data;
   GenerateRandomIntTypeData<index_t>({dim}, shape_data, 1, dim);
@@ -115,7 +113,6 @@ TEST_F(ConcatOpTest, Random) {
     net.AddInputFromArray<DeviceType::CPU, float>(("Input" + ToString(i)).c_str(),
                                                   input_shapes[i], inputs[i]);
   }
-  net.AddInputFromArray<DeviceType::CPU, int32_t>("Axis", {}, {axis});
 
   // Run
   net.RunOp();
@@ -139,3 +136,87 @@ TEST_F(ConcatOpTest, Random) {
     }
   }
 }
+
+template <typename T>
+void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
+                      const int axis) {
+  srand(time(nullptr));
+  int num_inputs = 2;
+  int concat_axis_size = 0;
+  // Construct graph
+  OpsTestNet net;
+  for (int i = 0; i < num_inputs; ++i) {
+    const std::string input_name = ("Input" + ToString(i)).c_str();
+    const std::string image_name = ("InputImage" + ToString(i)).c_str();
+    concat_axis_size += shapes[i][axis];
+    net.AddRandomInput<DeviceType::OPENCL, float>(input_name,
+                                                  shapes[i]);
+    BufferToImage<T>(net, input_name, image_name, kernels::BufferType::IN_OUT);
+  }
+
+  auto builder = OpDefBuilder("Concat", "ConcatTest");
+  for (int i = 0; i < num_inputs; ++i) {
+    const std::string image_name = ("InputImage" + ToString(i)).c_str();
+    builder = builder.Input(image_name);
+  }
+  builder.AddIntArg("axis", axis)
+      .Output("OutputImage")
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+
+  // Run
+  net.RunOp(DeviceType::OPENCL);
+
+  ImageToBuffer<float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+
+  // Check
+  auto output = net.GetOutput("Output");
+
+  std::vector<index_t> expected_shape = shapes[0];
+  expected_shape[axis] = concat_axis_size;
+  EXPECT_THAT(output->shape(), ::testing::ContainerEq(expected_shape));
+
+  Tensor::MappingGuard output_mapper(output);
+  const float *output_ptr = output->data<float>();
+  int k = 0;
+  while (output_ptr != (output->data<float>() + output->size())) {
+    for (int i = 0; i < num_inputs; ++i) {
+      index_t num_elements =
+          std::accumulate(shapes[i].begin() + axis, shapes[i].end(),
+                          1, std::multiplies<index_t>());
+
+      const std::string input_name = ("Input" + ToString(i)).c_str();
+      const Tensor *input_tensor = net.GetTensor(input_name.data());
+      Tensor::MappingGuard input_guard(input_tensor);
+      const float *input_ptr = input_tensor->data<float>() + k * num_elements;
+      for (int j = 0; j < num_elements; ++j) {
+        EXPECT_NEAR(*(input_ptr + j), *output_ptr++, 1e-2) << "With index: " << i << ", " << j;
+      }
+    }
+    k++;
+  }
+}
+
+TEST_F(ConcatOpTest, OPENCLAligned) {
+  OpenclRandomTest<float>({
+                              {3, 32, 32, 32},
+                              {3, 32, 32, 64}
+                          },
+                          3);
+}
+
+TEST_F(ConcatOpTest, OPENCLHalfAligned) {
+  OpenclRandomTest<half>({
+                             {3, 32, 32, 32},
+                             {3, 32, 32, 64}
+                         },
+                         3);
+}
+
+TEST_F(ConcatOpTest, OPENCLUnAligned) {
+  OpenclRandomTest<float>({
+                              {3, 32, 32, 13},
+                              {3, 32, 32, 17}
+                          },
+                          3);
+}