diff --git a/mace/core/allocator.cc b/mace/core/allocator.cc index fd1f50c3910a6878505bee7c0655f346b644c5ff..8b8a79ea01f5529f3a8cec2918171df1f1247bcc 100644 --- a/mace/core/allocator.cc +++ b/mace/core/allocator.cc @@ -16,10 +16,12 @@ void SetCPUAllocator(CPUAllocator* alloc) { } Allocator* GetDeviceAllocator(DeviceType type) { - if (type == DeviceType::CPU) { - return cpu_allocator(); - } else { - REQUIRE(false, "device type ", type, " is not supported."); + switch (type) { + case DeviceType::CPU: + case DeviceType::NEON: + return cpu_allocator(); + default: + REQUIRE(false, "device type ", type, " is not supported."); } return nullptr; } diff --git a/mace/core/allocator.h b/mace/core/allocator.h index 110b012bec4ea91663c17e14651519abc1a0f9f4..67c96539065906286e649ea9efbc1bc3ca22548a 100644 --- a/mace/core/allocator.h +++ b/mace/core/allocator.h @@ -12,8 +12,13 @@ namespace mace { -// 16 bytes = 32 * 4 (Neon) +#ifdef __ANDROID__ +// 16 bytes = 128 bits = 32 * 4 (Neon) constexpr size_t kMaceAlignment = 16; +#else +// 32 bytes = 256 bits (AVX512) +constexpr size_t kMaceAlignment = 32; +#endif class Allocator { public: @@ -41,27 +46,20 @@ class CPUAllocator: public Allocator { void* data = nullptr; #ifdef __ANDROID__ data = memalign(kMaceAlignment, nbytes); -#elif defined(_MSC_VER) - data = _aligned_malloc(nbytes, kMaceAlignment); #else CHECK(posix_memalign(&data, kMaceAlignment, nbytes) == 0); #endif CHECK_NOTNULL(data); + // TODO(heliangliang) This should be avoided sometimes memset(data, 0, nbytes); return data; } -#ifdef _MSC_VER - void Delete(void* data) { - _aligned_free(data); - } -#else - void Delete(void* data) { + void Delete(void* data) override { free(data); } -#endif - void CopyBytes(void* dst, const void* src, size_t size) { + void CopyBytes(void* dst, const void* src, size_t size) override { memcpy(dst, src, size); } }; @@ -80,6 +78,11 @@ struct DeviceContext { static Allocator* allocator() { return cpu_allocator(); } }; +template <> +struct DeviceContext { + static Allocator* allocator() { return cpu_allocator(); } +}; + Allocator* GetDeviceAllocator(DeviceType type); } // namespace mace diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 0072b58add999b1b60c7a1ea0a3bef0172931909..2e5086ac222a70503bf655ff9d92557369beccb4 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -18,6 +18,13 @@ MACE_DEFINE_REGISTRY( Workspace*); MACE_REGISTER_DEVICE_TYPE(DeviceType::CPU, CPUOperatorRegistry); +MACE_DEFINE_REGISTRY( + NEONOperatorRegistry, + OperatorBase, + const OperatorDef&, + Workspace*); +MACE_REGISTER_DEVICE_TYPE(DeviceType::NEON, NEONOperatorRegistry); + unique_ptr CreateOperator( const OperatorDef& operator_def, Workspace* ws, @@ -33,4 +40,4 @@ OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws) } -} // namespace mace \ No newline at end of file +} // namespace mace diff --git a/mace/core/operator.h b/mace/core/operator.h index 27e1fa16a772481406b0ce665bb61c1f620818b8..e937ebd97a6295d8436627670c52f91a3528679d 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -105,7 +105,7 @@ class Operator : public OperatorBase { DataTypeToEnum::v()))); } } - virtual bool Run() = 0; + virtual bool Run() override = 0; ~Operator() noexcept override {} }; @@ -145,6 +145,17 @@ MACE_DECLARE_REGISTRY( #define REGISTER_CPU_OPERATOR(name, ...) \ MACE_REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__) +MACE_DECLARE_REGISTRY( + NEONOperatorRegistry, + OperatorBase, + const OperatorDef&, + Workspace*); + +#define REGISTER_NEON_OPERATOR_CREATOR(key, ...) \ + MACE_REGISTER_CREATOR(NEONOperatorRegistry, key, __VA_ARGS__) +#define REGISTER_NEON_OPERATOR(name, ...) \ + MACE_REGISTER_CLASS(NEONOperatorRegistry, name, __VA_ARGS__) + unique_ptr CreateOperator( const OperatorDef &operator_def, Workspace *ws, diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..a842356a25ac5f8ef94274e51eb294032cb89139 --- /dev/null +++ b/mace/kernels/BUILD @@ -0,0 +1,22 @@ +# Description: +# Mace neon kernels. +# +package( + default_visibility = ["//visibility:public"], +) + + +licenses(["notice"]) # Apache 2.0 + +load("//mace:mace.bzl", "if_android") + +cc_library( + name = "kernels", + srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc"])), + hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h"])), + deps = [ + "//mace/core:core", + ], + copts = ['-std=c++11'], +) + diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc new file mode 100644 index 0000000000000000000000000000000000000000..845422179e9866f8ba14df1e111f2f497fb747da --- /dev/null +++ b/mace/kernels/neon/relu_neon.cc @@ -0,0 +1,30 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include +#include "mace/kernels/neon/relu_neon.h" + +namespace mace { +namespace kernels{ + +void NeonReluFuntion_float(const Tensor *input_tensor, + Tensor *output_tensor) { + int64 size = input_tensor->size(); + output_tensor->ResizeLike(input_tensor); + const float* input = input_tensor->data(); + float* output = output_tensor->mutable_data(); + + float32x4_t _zero = vdupq_n_f32(0.f); + for (; size > 0; size--) { + float32x4_t _inp = vld1q_f32(input); + float32x4_t _outp = vmaxq_f32(_inp, _zero); + vst1q_f32(output, _outp); + + input += 4; + output += 4; + } +} + +} // namespace kernels +} // namespace mace \ No newline at end of file diff --git a/mace/kernels/neon/relu_neon.h b/mace/kernels/neon/relu_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..0be3be6f6d064d3d6515f147b7665dc91856db3a --- /dev/null +++ b/mace/kernels/neon/relu_neon.h @@ -0,0 +1,19 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_KERNELS_RELU_NEON_H_ +#define MACE_KERNELS_RELU_NEON_H_ + +#include "mace/core/tensor.h" + +namespace mace { +namespace kernels { + +void NeonReluFuntion_float(const Tensor *input_tensor, + Tensor *output_tensor); + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_RELU_NEON_H_ diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h new file mode 100644 index 0000000000000000000000000000000000000000..cc613f1dc867b5cf14d9d51d830506c87b39a93e --- /dev/null +++ b/mace/kernels/relu.h @@ -0,0 +1,28 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_KERNELS_RELU_H_ +#define MACE_KERNELS_RELU_H_ + +#include "mace/core/tensor.h" + +namespace mace { +namespace kernels { + +template +void ReluFuntion(const Tensor *input_tensor, Tensor *output_tensor) { + int64 size = input_tensor->size(); + output_tensor->ResizeLike(input_tensor); + const float* input = input_tensor->data(); + float* output = output_tensor->mutable_data(); + + for (int64 i = 0; i < size; ++i) { + output[i] = std::max(input[i], static_cast(0)); + } +} + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_RELU_H_ \ No newline at end of file diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 7cde44fa7da76dafe9d5a5c87d3e064ae96f6bc3..1acbc1fd968cea8432dfa1ec79470f23b4873af3 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -17,6 +17,7 @@ cc_library( deps = [ "//mace/proto:cc_proto", "//mace/core:core", + "//mace/kernels:kernels", ], copts = ['-std=c++11'], alwayslink = 1, diff --git a/mace/ops/relu.cc b/mace/ops/relu.cc index 94646e0f3eca5c21bd8eb4510f2c5a41c54f7a05..59d4e3b7de35f708c4439b687e15194c01eac6d3 100644 --- a/mace/ops/relu.cc +++ b/mace/ops/relu.cc @@ -4,25 +4,32 @@ #include "mace/ops/relu.h" #include "mace/proto/mace.pb.h" +#include "mace/kernels/relu.h" +#if __ARM_NEON +#include "mace/kernels/neon/relu_neon.h" +#endif // __ARM_NEON namespace mace { template <> bool ReluOp::Run() { - const Tensor* X = Input(0); - Tensor* Y = Output(0); - Y->ResizeLike(X); + const Tensor* input_tensor = Input(0); + Tensor* output_tensor = Output(0); + kernels::ReluFuntion(input_tensor, output_tensor); + return true; +} +REGISTER_CPU_OPERATOR(Relu, ReluOp); - const float* Xdata = X-> data(); - float* Ydata = Y->mutable_data(); - for (int i = 0; i < X->size(); ++i) { - Ydata[i] = std::max(Xdata[i], 0.f); - VLOG(0) << i << ": " << Xdata[i] << " " << Ydata[i]; - } +#if __ARM_NEON +template <> +bool ReluOp::Run() { + const Tensor* input_tensor = Input(0); + Tensor* output_tensor = Output(0); + kernels::NeonReluFuntion_float(input_tensor, output_tensor); return true; } - -REGISTER_CPU_OPERATOR(Relu, ReluOp); +REGISTER_NEON_OPERATOR(Relu, ReluOp); +#endif // __ARM_NEON } // namespace mace diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index 10c37f12267b996a30265a40cbffbf89ef01bb2a..05c317d137a3d9819174f18084e186029cfe3fc3 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -3,8 +3,9 @@ syntax = "proto2"; package mace; enum DeviceType { - CPU = 0; // In default, we will use CPU. - GPU = 1; + CPU = 0; // In default, we will use CPU. + NEON = 1; + OPENCL = 2; } enum DataType { @@ -70,4 +71,4 @@ message NetDef { optional string version = 3; repeated Argument arg = 4; repeated TensorProto tensors = 5; -} \ No newline at end of file +}