From f8dfff0d6f1c3d91e5d424e88bc36fb6b333656e Mon Sep 17 00:00:00 2001 From: liuqi Date: Sun, 11 Feb 2018 16:01:22 +0800 Subject: [PATCH] Add eltwise op (only support two operands) --- mace/core/operator.cc | 2 + mace/kernels/eltwise.h | 105 +++++++++++++++ mace/kernels/opencl/cl/eltwise.cl | 34 +++++ mace/kernels/opencl/eltwise_opencl.cc | 69 ++++++++++ mace/ops/eltwise.cc | 29 ++++ mace/ops/eltwise.h | 47 +++++++ mace/ops/eltwise_benchmark.cc | 78 +++++++++++ mace/ops/eltwise_test.cc | 187 ++++++++++++++++++++++++++ 8 files changed, 551 insertions(+) create mode 100644 mace/kernels/eltwise.h create mode 100644 mace/kernels/opencl/cl/eltwise.cl create mode 100644 mace/kernels/opencl/eltwise_opencl.cc create mode 100644 mace/ops/eltwise.cc create mode 100644 mace/ops/eltwise.h create mode 100644 mace/ops/eltwise_benchmark.cc create mode 100644 mace/ops/eltwise_test.cc diff --git a/mace/core/operator.cc b/mace/core/operator.cc index e0dc1143..e9a5f1b6 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -81,6 +81,7 @@ extern void Register_MatMul(OperatorRegistry *op_registry); extern void Register_WinogradTransform(OperatorRegistry *op_registry); extern void Register_WinogradInverseTransform(OperatorRegistry *op_registry); extern void Register_Reshape(OperatorRegistry *op_registry); +extern void Register_Eltwise(OperatorRegistry *op_registry); OperatorRegistry::OperatorRegistry() { Register_Activation(this); @@ -105,6 +106,7 @@ OperatorRegistry::OperatorRegistry() { Register_WinogradTransform(this); Register_WinogradInverseTransform(this); Register_Reshape(this); + Register_Eltwise(this); } } // namespace mace diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h new file mode 100644 index 00000000..18f0604c --- /dev/null +++ b/mace/kernels/eltwise.h @@ -0,0 +1,105 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// +#ifndef MACE_KERNELS_ELTWISE_H_ +#define MACE_KERNELS_ELTWISE_H_ + +#include "mace/core/future.h" +#include "mace/core/tensor.h" +#include "mace/core/runtime/opencl/cl2_header.h" + +namespace mace { +namespace kernels { + +enum EltwiseType{ + PROD = 0, + SUM = 1, + MAX = 2, + MIN = 3, +}; + +struct EltwiseFunctorBase { + EltwiseFunctorBase(const EltwiseType type, + const std::vector &coeff) + : type_(type), coeff_(coeff) {} + + EltwiseType type_; + std::vector coeff_; +}; + +template +struct EltwiseFunctor : EltwiseFunctorBase { + EltwiseFunctor(const EltwiseType type, + const std::vector &coeff) + : EltwiseFunctorBase(type, coeff) {} + + void operator()(const Tensor *input0, + const Tensor *input1, + Tensor *output, + StatsFuture *future) { + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard input1_guard(input1); + Tensor::MappingGuard output_guard(output); + + const T *input0_ptr = input0->data(); + const T *input1_ptr = input1->data(); + T *output_ptr = output->mutable_data(); + const index_t size = input0->size(); + + switch (type_) { + case PROD: +#pragma omp parallel for + for(index_t i = 0; i < size; ++i) { + output_ptr[i] = input0_ptr[i] * input1_ptr[i]; + } + break; + case SUM: + if (coeff_.empty()) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output_ptr[i] = input0_ptr[i] + input1_ptr[i]; + } + } else { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output_ptr[i] = coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i]; + } + } + break; + case MAX: +#pragma omp parallel for + for(index_t i = 0; i < size; ++i) { + output_ptr[i] = std::max(input0_ptr[i], input1_ptr[i]); + } + break; + case MIN: +#pragma omp parallel for + for(index_t i = 0; i < size; ++i) { + output_ptr[i] = std::min(input0_ptr[i], input1_ptr[i]); + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type_; + } + } +}; + + +template +struct EltwiseFunctor: EltwiseFunctorBase { + EltwiseFunctor(const EltwiseType type, + const std::vector &coeff) + : EltwiseFunctorBase(type, coeff) {} + + void operator()(const Tensor *input0, + const Tensor *input1, + Tensor *output, + StatsFuture *future); + + cl::Kernel kernel_; +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_ELTWISE_H_ diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl new file mode 100644 index 00000000..735bc96e --- /dev/null +++ b/mace/kernels/opencl/cl/eltwise.cl @@ -0,0 +1,34 @@ +#include + +__kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ + __read_only image2d_t input1, +#ifdef COEFF_SUM + __private const float coeff0, + __private const float coeff1, +#endif + __write_only image2d_t output) { + const int w = get_global_id(0); + const int hb = get_global_id(1); + + DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb)); + DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb)); + DATA_TYPE4 out; +#if ELTWISE_TYPE == 0 + out = in0 * in1; +#elif ELTWISE_TYPE == 1 + +#ifdef COEFF_SUM + out = mad(coeff0, in0, mad(coeff1, in1, 0)); +#else + out = in0 + in1; +#endif + +#elif ELTWISE_TYPE == 2 + out = fmax(in0, in1); +#elif ELTWISE_TYPE == 3 + out = fmin(in0, in1); +#endif + + WRITE_IMAGET(output, (int2)(w, hb), out); +} + diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc new file mode 100644 index 00000000..43356df3 --- /dev/null +++ b/mace/kernels/opencl/eltwise_opencl.cc @@ -0,0 +1,69 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/kernels/eltwise.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" + +namespace mace { +namespace kernels { + +template +void EltwiseFunctor::operator()(const Tensor *input0, + const Tensor *input1, + Tensor *output, + StatsFuture *future) { + + const index_t batch = input0->dim(0); + const index_t height = input0->dim(1); + const index_t width = input0->dim(2); + const index_t channels = input0->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t width_pixels = channel_blocks * width; + const index_t batch_height_pixels = batch * height; + + if (kernel_.get() == nullptr) { + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); + built_options.emplace("-Deltwise=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DELTWISE_TYPE=" + ToString(type_)); + if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); + kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); + + uint32_t idx = 0; + kernel_.setArg(idx++, + *(static_cast(input0->buffer()))); + kernel_.setArg(idx++, + *(static_cast(input1->buffer()))); + if (!coeff_.empty()) { + kernel_.setArg(idx++, coeff_[0]); + kernel_.setArg(idx++, coeff_[1]); + } + kernel_.setArg(idx++, *(static_cast(output->buffer()))); + } + + const uint32_t gws[2] = { + static_cast(width_pixels), + static_cast(batch_height_pixels) + }; + const std::vector lws = {64, 16, 1}; + std::stringstream ss; + ss << "eltwise_opencl_kernel_" + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); + TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); +} + +template struct EltwiseFunctor; +template struct EltwiseFunctor; +} // namespace kernels +} // namespace mace diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc new file mode 100644 index 00000000..0304ec1a --- /dev/null +++ b/mace/ops/eltwise.cc @@ -0,0 +1,29 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/ops/eltwise.h" + +namespace mace { + +void Register_Eltwise(OperatorRegistry *op_registry) { + REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + EltwiseOp); + + REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + EltwiseOp); + + REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + EltwiseOp); +} + +} // namespace mace diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h new file mode 100644 index 00000000..7d8e63ee --- /dev/null +++ b/mace/ops/eltwise.h @@ -0,0 +1,47 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_OPS_RESHAPE_H_ +#define MACE_OPS_RESHAPE_H_ + +#include "mace/core/operator.h" +#include "mace/kernels/eltwise.h" + +namespace mace { + +template +class EltwiseOp : public Operator { + public: + EltwiseOp(const OperatorDef &op_def, Workspace *ws) + : Operator(op_def, ws), + functor_(static_cast( + OperatorBase::GetSingleArgument( + "type", static_cast(kernels::EltwiseType::SUM))), + OperatorBase::GetRepeatedArgument("coeff")){} + + bool Run(StatsFuture *future) override { + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->Input(1); + Tensor *output = this->Output(OUTPUT); + MACE_CHECK(input0->dim_size() == input1->dim_size()) << "Inputs of Eltwise op must be same shape"; + for(int i = 0; i < input0->dim_size(); ++i) { + MACE_CHECK(input0->dim(i) == input1->dim(i)) << "Inputs of Eltwise op must be same shape"; + } + + output->ResizeLike(input0); + + functor_(input0, input1, output, future); + return true; + } + + private: + kernels::EltwiseFunctor functor_; + + private: + OP_OUTPUT_TAGS(OUTPUT); +}; + +} // namespace mace + +#endif // MACE_OPS_RESHAPE_H_ diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc new file mode 100644 index 00000000..8cd22233 --- /dev/null +++ b/mace/ops/eltwise_benchmark.cc @@ -0,0 +1,78 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" +#include "mace/kernels/eltwise.h" + +namespace mace { +template +static void EltwiseBenchmark(int iters, kernels::EltwiseType type, int n, int h, int w, int c) { + mace::testing::StopTiming(); + + OpsTestNet net; + // Add input data + net.AddRandomInput("Input0", {n, h, w, c}); + net.AddRandomInput("Input1", {n, h, w, c}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input0", "InputImg0", kernels::BufferType::IN_OUT_CHANNEL); + BufferToImage(net, "Input1", "InputImg1", kernels::BufferType::IN_OUT_CHANNEL); + OpDefBuilder("Eltwise", "EltwiseTest") + .Input("InputImg0") + .Input("InputImg1") + .AddIntArg("type", static_cast(type)) + .AddFloatsArg("coeff", {1.2, 2.1}) + .AddIntArg("T", static_cast(DT_HALF)) + .Output("OutputImg") + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("Eltwise", "EltwiseTest") + .Input("Input0") + .Input("Input1") + .AddIntArg("type", static_cast(type)) + .AddFloatsArg("coeff", {1.2, 2.1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Output("Output") + .Finalize(net.NewOperatorDef()); + } + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + net.Sync(); + } + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + net.Sync(); + } +} + +#define BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \ + static void BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W * C; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + EltwiseBenchmark(iters, static_cast(ELT_TYPE), N, H, W, C); \ + } \ + BENCHMARK(BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + +#define BM_ELTWISE(ELT_TYPE, N, H, W, C, TYPE) \ + BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, CPU); \ + BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, OPENCL); + +BM_ELTWISE(0, 1, 256, 256, 32, float); +BM_ELTWISE(0, 1, 128, 128, 32, float); +BM_ELTWISE(1, 1, 128, 128, 32, float); +BM_ELTWISE(2, 1, 128, 128, 32, float); +BM_ELTWISE(0, 1, 240, 240, 256, float); +BM_ELTWISE(1, 1, 240, 240, 256, float); +BM_ELTWISE(2, 1, 240, 240, 256, float); + +} // namespace mace diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc new file mode 100644 index 00000000..3e3d3362 --- /dev/null +++ b/mace/ops/eltwise_test.cc @@ -0,0 +1,187 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/operator.h" +#include "mace/ops/ops_test_util.h" +#include "mace/kernels/eltwise.h" + +namespace mace { + +class EltwiseOpTest : public OpsTestBase {}; + +template +void Simple(const kernels::EltwiseType type, + const std::vector &shape, + const std::vector &input0, + const std::vector &input1, + const std::vector &output, + const std::vector coeff = {}) { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddInputFromArray("Input1", shape, input0); + net.AddInputFromArray("Input2", shape, input1); + + if (D == DeviceType::CPU) { + OpDefBuilder("Eltwise", "EltwiseTest") + .Input("Input1") + .Input("Input2") + .AddIntArg("type", static_cast(type)) + .AddFloatsArg("coeff", coeff) + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } else { + BufferToImage(net, "Input1", "InputImg1", kernels::BufferType::IN_OUT_CHANNEL); + BufferToImage(net, "Input2", "InputImg2", kernels::BufferType::IN_OUT_CHANNEL); + OpDefBuilder("Eltwise", "EltwiseTest") + .Input("InputImg1") + .Input("InputImg2") + .AddIntArg("type", static_cast(type)) + .AddFloatsArg("coeff", coeff) + .Output("OutputImg") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + ImageToBuffer(net, "OutputImg", "Output", kernels::BufferType::IN_OUT_CHANNEL); + } + + auto expected = CreateTensor(shape, output); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-3); +} + +TEST_F(EltwiseOpTest, CPUSimple) { + Simple(kernels::EltwiseType::PROD, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}, + {1, 4, 9, 16, 25, 36}); + Simple(kernels::EltwiseType::SUM, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}, + {2, 4, 6, 8, 10, 12}); + Simple(kernels::EltwiseType::SUM, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}, + {3, 6, 9, 12, 15, 18}, + {2, 1}); + Simple(kernels::EltwiseType::MAX, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + {1, 1, 3, 3, 6, 6}, + {1, 2, 3, 4, 6, 6}); + Simple(kernels::EltwiseType::MIN, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + {1, 1, 3, 3, 6, 6}, + {1, 1, 3, 3, 5, 6}); +} + +TEST_F(EltwiseOpTest, GPUSimple) { + Simple(kernels::EltwiseType::PROD, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}, + {1, 4, 9, 16, 25, 36}); + Simple(kernels::EltwiseType::SUM, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}, + {2, 4, 6, 8, 10, 12}); + Simple(kernels::EltwiseType::SUM, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}, + {3, 6, 9, 12, 15, 18}, + {2, 1}); + Simple(kernels::EltwiseType::MAX, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + {1, 1, 3, 3, 6, 6}, + {1, 2, 3, 4, 6, 6}); + Simple(kernels::EltwiseType::MIN, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + {1, 1, 3, 3, 6, 6}, + {1, 1, 3, 3, 5, 6}); +} + +template +void RandomTest(const kernels::EltwiseType type, + const std::vector &shape) { + testing::internal::LogToStderr(); + srand(time(NULL)); + + // Construct graph + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input1", shape); + net.AddRandomInput("Input2", shape); + + OpDefBuilder("Eltwise", "EltwiseTest") + .Input("Input1") + .Input("Input2") + .AddIntArg("type", static_cast(type)) + .AddFloatsArg("coeff", {1.2, 2.1}) + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(); + + BufferToImage(net, "Input1", "InputImg1", kernels::BufferType::IN_OUT_CHANNEL); + BufferToImage(net, "Input2", "InputImg2", kernels::BufferType::IN_OUT_CHANNEL); + OpDefBuilder("Eltwise", "EltwiseTest") + .Input("InputImg1") + .Input("InputImg2") + .AddIntArg("type", static_cast(type)) + .AddFloatsArg("coeff", {1.2, 2.1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Output("OutputImg") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + ImageToBuffer(net, "OutputImg", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); + + if (DataTypeToEnum::value == DT_FLOAT) { + ExpectTensorNear(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-3); + } else { + ExpectTensorNear(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-1); + } +} + +TEST_F(EltwiseOpTest, OPENCLRandomFloat) { + RandomTest(kernels::EltwiseType::PROD, + {3, 23, 37, 19}); + RandomTest(kernels::EltwiseType::SUM, + {13, 32, 32, 64}); + RandomTest(kernels::EltwiseType::MAX, + {3, 32, 32, 64}); + RandomTest(kernels::EltwiseType::MIN, + {13, 32, 32, 64}); +} + +TEST_F(EltwiseOpTest, OPENCLRandomHalf) { + RandomTest(kernels::EltwiseType::PROD, + {3, 23, 37, 19}); + RandomTest(kernels::EltwiseType::SUM, + {13, 32, 32, 64}); + RandomTest(kernels::EltwiseType::MAX, + {3, 32, 32, 64}); + RandomTest(kernels::EltwiseType::MIN, + {13, 32, 32, 64}); +} + +} // namespace mace -- GitLab