diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc
index d388e2c5385f0137538877db61a8cde4a99396af..7ca799e2e8701b8adb439218c17ce10d8fbd0f56 100644
--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -14,6 +14,7 @@
 
 #include <vector>
 
+#include "mace/ops/common/conv_pool_2d_util.h"
 #include "mace/ops/eltwise.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -531,6 +532,100 @@ TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
 }
 
+namespace {
+template <typename T>
+void GPUOverflowTest(const ops::EltwiseType type,
+                     const std::vector<index_t> &shape0,
+                     const std::vector<T> &input0,
+                     const std::vector<index_t> &shape1,
+                     const std::vector<T> &input1,
+                     const std::vector<index_t> &output_shape,
+                     const std::vector<T> &output) {
+  // Construct graph
+  OpsTestNet net;
+
+  // Add input data
+  net.AddInputFromArray<DeviceType::GPU, T>("Input0", shape0, input0);
+  net.AddInputFromArray<DeviceType::GPU, T>("Input1", shape1, input1);
+
+  OpDefBuilder("Eltwise", "EltwiseTest")
+      .AddIntArg("T", DataTypeToEnum<T>::v())
+      .Input("Input0")
+      .Input("Input1")
+      .AddIntArg("type", static_cast<int>(type))
+      .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
+      .Output("EltOutput")
+      .OutputShape(output_shape)
+      .Finalize(net.AddNewOperatorDef());
+  net.AddInputFromArray<DeviceType::GPU, T>(
+      "Filter",
+      {output_shape.back(), shape0.back(), 3, 3},
+      std::vector<T>(output_shape.back() * shape0.back() * 9, 1));
+  OpDefBuilder("Conv2D", "Conv2D")
+      .AddIntArg("T", DataTypeToEnum<T>::v())
+      .Input("EltOutput")
+      .Input("Filter")
+      .Output("Output")
+      .OutputShape(output_shape)
+      .AddIntsArg("strides", {1, 1})
+      .AddIntArg("padding", Padding::SAME)
+      .AddIntsArg("dilations", {1, 1})
+      .Finalize(net.AddNewOperatorDef());
+
+  // Run
+  net.RunOp(DeviceType::GPU);
+
+  auto expected = net.CreateTensor<T>(output_shape, output);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+}
+}  // namespace
+TEST_F(EltwiseOpTest, GPUOverflowTest) {
+  GPUOverflowTest<float>(
+      ops::EltwiseType::SUM, {1, 2, 2, 2}, std::vector<float>(8, 1),
+      {1, 1, 1, 2}, {1, 1},
+      {1, 2, 2, 1}, {16, 16, 16, 16});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::SUB, {2, 2, 2, 2}, std::vector<float>(16, 1),
+      {2, 1, 1, 2}, {1, 1, 2, 2},
+      {2, 2, 2, 1}, {0, 0, 0, 0, -8, -8, -8, -8});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::PROD, {1, 3, 2, 1}, std::vector<float>(6, 1),
+      {1, 3, 2, 1}, std::vector<float>(6, 1),
+      {1, 3, 2, 1}, {4, 4, 6, 6, 4, 4});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::DIV, {2, 3, 2, 1}, std::vector<float>(12, 1),
+      {2, 3, 2, 1}, std::vector<float>(12, 1),
+      {2, 3, 2, 1}, {4, 4, 6, 6, 4, 4, 4, 4, 6, 6, 4, 4});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::MIN, {1, 2, 2, 2}, std::vector<float>(8, 1),
+      {1, 1, 1, 2}, {1, 1},
+      {1, 2, 2, 1}, {8, 8, 8, 8});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::MAX, {2, 2, 2, 2}, std::vector<float>(16, 1),
+      {2, 1, 1, 2}, {1, 1, 2, 2},
+      {2, 2, 2, 1}, {8, 8, 8, 8, 16, 16, 16, 16});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::NEG, {1, 3, 2, 1}, std::vector<float>(6, 1),
+      {1, 1, 1, 1}, {0},
+      {1, 3, 2, 1}, {-4, -4, -6, -6, -4, -4});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::ABS, {2, 3, 2, 1}, std::vector<float>(12, -1),
+      {1, 1, 1, 1}, {0},
+      {2, 3, 2, 1}, {4, 4, 6, 6, 4, 4, 4, 4, 6, 6, 4, 4});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::SQR_DIFF, {2, 2, 2, 2}, std::vector<float>(16, 1),
+      {2, 1, 1, 2}, {1, 1, 2, 2},
+      {2, 2, 2, 1}, {0, 0, 0, 0, 8, 8, 8, 8});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::POW, {1, 3, 2, 1}, std::vector<float>(6, 1),
+      {1, 3, 2, 1}, std::vector<float>(6, 1),
+      {1, 3, 2, 1}, {4, 4, 6, 6, 4, 4});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::FLOOR_DIV, {2, 2, 2, 2}, std::vector<float>(16, 1),
+      {2, 1, 1, 2}, {1, 1, 2, 2},
+      {2, 2, 2, 1}, {8, 8, 8, 8, 0, 0, 0, 0});
+}
+
 namespace {
 template <typename T>
 void RandomTensorScalar(const ops::EltwiseType type,
diff --git a/mace/ops/opencl/cl/eltwise.cl b/mace/ops/opencl/cl/eltwise.cl
index 23ab75591a09e6b824c83dfacdfe43f32f5bf5d3..167e5341a8aa0737dccc1fa9c9ef865ddae55c34 100644
--- a/mace/ops/opencl/cl/eltwise.cl
+++ b/mace/ops/opencl/cl/eltwise.cl
@@ -3,7 +3,7 @@
 __kernel void eltwise(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input0,
-#if INPUT_TYPE == 1
+#if defined(INPUT_SCALAR)
                       __private const float value,
 #else
                       __read_only image2d_t input1,
@@ -28,14 +28,14 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS
   const int pos = mad24(chan_idx, width, width_idx);
 
   DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(pos, hb));
-#if INPUT_TYPE == 1
+#if defined(INPUT_SCALAR)
   DATA_TYPE4 in1 = (DATA_TYPE4)(value, value, value, value);
-#elif INPUT_TYPE == 2
+#elif defined(INPUT_VECTOR)
+  DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, 0));
+#elif defined(INPUT_BATCH_VECTOR)
   const int batch_idx = hb / height;
   DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, batch_idx));
-#elif INPUT_TYPE == 3
-  DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, 0));
-#elif INPUT_TYPE == 4
+#elif defined(INPUT_TENSOR_BC_CHAN)
   DATA_TYPE4 tmp = READ_IMAGET(input1, SAMPLER, (int2)(width_idx, hb));
   DATA_TYPE4 in1 = (DATA_TYPE4)(tmp.x, tmp.x, tmp.x, tmp.x);
 #else
@@ -89,11 +89,11 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS
 #endif
 #endif
 
-#if ((INPUT_TYPE == 1 || INPUT_TYPE == 4) && \
-     (ELTWISE_TYPE == 0 || ELTWISE_TYPE == 1 || ELTWISE_TYPE == 4 || \
-      ELTWISE_TYPE == 5 || ELTWISE_TYPE == 8 || ELTWISE_TYPE == 9)) || \
-    ((INPUT_TYPE != 1 || INPUT_TYPE != 4) && \
-     (ELTWISE_TYPE == 3 || ELTWISE_TYPE == 9 || ELTWISE_TYPE == 11))
+#if defined(NOT_DIVISIBLE_FOUR) && \
+    ((ELTWISE_TYPE == 3 || ELTWISE_TYPE == 9 || ELTWISE_TYPE == 11) \
+     || ((defined(INPUT_SCALAR) || defined(INPUT_TENSOR_BC_CHAN)) && \
+         (ELTWISE_TYPE == 0 || ELTWISE_TYPE == 1 || ELTWISE_TYPE == 4 || \
+          ELTWISE_TYPE == 5 || ELTWISE_TYPE == 8)))
   const int remain_channel = channel - 4 * chan_idx;
   if (remain_channel < 4) {
     switch (remain_channel) {
diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h
index 8e6d3bc758fcaf4167aa841e83c9194687314330..bc1a702532fcfec6f32866fc332bdfe717f79416 100644
--- a/mace/ops/opencl/image/eltwise.h
+++ b/mace/ops/opencl/image/eltwise.h
@@ -67,25 +67,52 @@ MaceStatus EltwiseKernel<T>::Compute(
     const Tensor *input1,
     Tensor *output) {
   bool swapped = false;
-  if (input1 != nullptr) {
+  std::string input1_type = "";
+  if (input1 == nullptr) {
+    input1_type = "INPUT_SCALAR";
+  } else {
     MACE_CHECK(input0->dim_size() == input1->dim_size() ||
                input0->dim_size() == 1 || input1->dim_size() == 1)
         << "Inputs of Eltwise op must be same shape";
+    MACE_CHECK(type_ != EltwiseType::EQUAL)
+        << "Eltwise op on GPU does not support EQUAL";
+    // broadcast
     if (input0->size() != input1->size()) {
       if (input0->size() < input1->size()) {
         std::swap(input0, input1);
         swapped = true;
       }
-      if (input1->dim_size() == 1) {
-        MACE_CHECK(input0->dim(3) == input1->dim(0))
-            << "Element-Wise op support broadcast on only-channel or non-channel dimension";  // NOLINT(whitespace/line_length)
-      } else {
-        MACE_CHECK(((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1)
-            && input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 &&
-            input1->dim(2) == 1) || (input0->dim(0) == input1->dim(0) &&
-            input0->dim(1) == input1->dim(1) && input0->dim(2) == input1->dim(2)
-            && input1->dim(3) == 1))
-            << "Element-Wise op support broadcast on only-channel or non-channel dimension";  // NOLINT(whitespace/line_length)
+      if (input1->dim_size() == 1
+          || (input1->dim(0) == 1 && input1->dim(1) == 1
+              && input1->dim(2) == 1)) {
+        // Tensor-Vector element wise
+        if (input0->dim(3) == input1->dim(input1->dim_size()-1)) {
+          input1_type = "INPUT_VECTOR";
+        } else {
+          LOG(FATAL) << "Inputs not match the broadcast logic, "
+                     << MakeString(input0->shape()) << " vs "
+                     << MakeString(input1->shape());
+        }
+      } else {  // must be 4-D
+        if (input0->dim(0) == input1->dim(0)
+            && input1->dim(1) == 1
+            && input1->dim(2) == 1
+            && input0->dim(3) == input1->dim(3)) {
+          input1_type = "INPUT_BATCH_VECTOR";
+        } else if (input0->dim(0) == input1->dim(0)
+                   && input0->dim(1) == input1->dim(1)
+                   && input0->dim(2) == input1->dim(2)
+                   && input1->dim(3) == 1) {
+          // broadcast on channel dimension
+          input1_type = "INPUT_TENSOR_BC_CHAN";
+        } else {
+          LOG(FATAL) << "Element-Wise op only support broadcast on"
+                        " channel dimension:"
+                        "Tensor-BatchVector(4D-[N,1,1,C]) "
+                        "and Tensor-Tensor(4D-[N,H,W,1]). but got "
+                     << MakeString(input0->shape()) << " vs "
+                     << MakeString(input1->shape());
+        }
       }
     }
   }
@@ -129,20 +156,11 @@ MaceStatus EltwiseKernel<T>::Compute(
     built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
     built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
-    if (input1 == nullptr) {
-      built_options.emplace("-DINPUT_TYPE=1");
-    } else if (input0->size() != input1->size()) {
-      if (input0->dim(0) == input1->dim(0) && input0->dim(1) == input1->dim(1)
-          && input0->dim(2) == input1->dim(2) && input1->dim(3) == 1) {
-        // only broadcast on channel
-        built_options.emplace("-DINPUT_TYPE=4");
-      } else if (input1->dim(0) == 1 || input1->dim_size() == 1) {
-        built_options.emplace("-DINPUT_TYPE=3");
-      } else {
-        built_options.emplace("-DINPUT_TYPE=2");
-      }
-      if (swapped) built_options.emplace("-DSWAPPED");
+    if (!input1_type.empty()) {
+      built_options.emplace("-D" + input1_type);
     }
+    if (swapped) built_options.emplace("-DSWAPPED");
+    if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR");
     if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
     MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
                                               built_options, &kernel_));
diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc
index 0604bdc410925977224a5f82763b6234ec40722a..ce9c1bbde07ddd8857f33718f06eb47d1fb34fa9 100644
--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
@@ -159,16 +159,16 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() {
 
 bool OpsTestNet::Setup(mace::DeviceType device) {
   NetDef net_def;
-  for (auto &op_def_ : op_defs_) {
-    net_def.add_op()->CopyFrom(op_def_);
+  for (auto &op_def : op_defs_) {
+    net_def.add_op()->CopyFrom(op_def);
 
-    for (auto input : op_def_.input()) {
+    for (auto input : op_def.input()) {
       if (ws_.GetTensor(input) != nullptr &&
           !ws_.GetTensor(input)->is_weight()) {
         auto input_info = net_def.add_input_info();
         input_info->set_name(input);
         auto data_format = ProtoArgHelper::GetOptionalArg(
-            op_def_, "data_format", DataFormat::DF_NONE);
+            op_def, "data_format", DataFormat::DF_NONE);
         input_info->set_data_format(data_format);
         auto &shape = ws_.GetTensor(input)->shape();
         for (auto d : shape) {
@@ -176,16 +176,16 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
         }
       }
     }
-
-    for (int i = 0; i < op_def_.output_size(); ++i) {
-      ws_.RemoveTensor(op_def_.output(i));
-      auto output_info = net_def.add_output_info();
-      output_info->set_name(op_def_.output(i));
-      if (op_def_.output_type_size() == op_def_.output_size()) {
-        output_info->set_data_type(op_def_.output_type(i));
-      } else {
-        output_info->set_data_type(DataType::DT_FLOAT);
-      }
+  }
+  auto op_def = op_defs_.back();
+  for (int i = 0; i < op_def.output_size(); ++i) {
+    ws_.RemoveTensor(op_def.output(i));
+    auto output_info = net_def.add_output_info();
+    output_info->set_name(op_def.output(i));
+    if (op_def.output_type_size() == op_def.output_size()) {
+      output_info->set_data_type(op_def.output_type(i));
+    } else {
+      output_info->set_data_type(DataType::DT_FLOAT);
     }
   }
   MemoryOptimizer mem_optimizer;