diff --git a/mace/core/BUILD b/mace/core/BUILD index 218fd1bd2354a9936d243764c9cdb113389ba817..7f974ba4e89aa14085fde004eb8da1413d4de124 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD @@ -7,6 +7,8 @@ package( licenses(["notice"]) # Apache 2.0 +load("//mace:mace.bzl", "if_android") + cc_library( name = "core", srcs = glob([ @@ -19,6 +21,10 @@ cc_library( deps = [ "//mace/proto:cc_proto", ], + linkopts = if_android([ + "-llog", + "-pie", + ]), ) # Main program for tests diff --git a/mace/core/common.h b/mace/core/common.h index c2c2931660275634f894948927a3c1dd7e909204..e5e07225ab1b21165fe7b3b7f2ca809824a7e740 100644 --- a/mace/core/common.h +++ b/mace/core/common.h @@ -12,7 +12,6 @@ #include #include -#include "mace/core/integral_types.h" #include "mace/core/logging.h" using std::set; @@ -21,7 +20,7 @@ using std::string; using std::unique_ptr; using std::vector; -typedef int64 TIndex; +typedef int64_t index_t; // Disable the copy and assignment operator for a class. #ifndef DISABLE_COPY_AND_ASSIGN diff --git a/mace/core/integral_types.h b/mace/core/integral_types.h deleted file mode 100644 index 72298201ef68403a93dbdb4d41087ad0f669e7a7..0000000000000000000000000000000000000000 --- a/mace/core/integral_types.h +++ /dev/null @@ -1,19 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - - -#ifndef MACE_CORE_INTEGRAL_TYPES_H_ -#define MACE_CORE_INTEGRAL_TYPES_H_ - -typedef int8_t int8; -typedef int16_t int16; -typedef int32_t int32; -typedef int64_t int64; - -typedef uint8_t uint8; -typedef uint16_t uint16; -typedef uint32_t uint32; -typedef uint64_t uint64; - -#endif // MACE_CORE_INTEGRAL_TYPES_H_ diff --git a/mace/core/logging.cc b/mace/core/logging.cc index 5e0982d58e5d38fa1117b9d35ba2bec8a55dc092..f01d0980241187b2fcc2acb829e3c4b79f30b8d4 100644 --- a/mace/core/logging.cc +++ b/mace/core/logging.cc @@ -69,18 +69,18 @@ void LogMessage::GenerateLogMessage() { namespace { -// Parse log level (int64) from environment variable (char*) -int64 LogLevelStrToInt(const char* tf_env_var_val) { - if (tf_env_var_val == nullptr) { +// Parse log level (int64_t) from environment variable (char*) +int64_t LogLevelStrToInt(const char* mace_env_var_val) { + if (mace_env_var_val == nullptr) { return 0; } // Ideally we would use env_var / safe_strto64, but it is // hard to use here without pulling in a lot of dependencies, // so we use std:istringstream instead - string min_log_level(tf_env_var_val); + string min_log_level(mace_env_var_val); std::istringstream ss(min_log_level); - int64 level; + int64_t level; if (!(ss >> level)) { // Invalid vlog level setting, set level to default (0) level = 0; @@ -89,26 +89,26 @@ int64 LogLevelStrToInt(const char* tf_env_var_val) { return level; } -int64 MinLogLevelFromEnv() { - const char* tf_env_var_val = getenv("MACE_CPP_MIN_LOG_LEVEL"); - return LogLevelStrToInt(tf_env_var_val); +int64_t MinLogLevelFromEnv() { + const char* mace_env_var_val = getenv("MACE_CPP_MIN_LOG_LEVEL"); + return LogLevelStrToInt(mace_env_var_val); } -int64 MinVLogLevelFromEnv() { - const char* tf_env_var_val = getenv("MACE_CPP_MIN_VLOG_LEVEL"); - return LogLevelStrToInt(tf_env_var_val); +int64_t MinVLogLevelFromEnv() { + const char* mace_env_var_val = getenv("MACE_CPP_MIN_VLOG_LEVEL"); + return LogLevelStrToInt(mace_env_var_val); } } // namespace LogMessage::~LogMessage() { // Read the min log level once during the first call to logging. - static int64 min_log_level = MinLogLevelFromEnv(); + static int64_t min_log_level = MinLogLevelFromEnv(); if (severity_ >= min_log_level) GenerateLogMessage(); } -int64 LogMessage::MinVLogLevel() { - static int64 min_vlog_level = MinVLogLevelFromEnv(); +int64_t LogMessage::MinVLogLevel() { + static int64_t min_vlog_level = MinVLogLevelFromEnv(); return min_vlog_level; } diff --git a/mace/core/logging.h b/mace/core/logging.h index c613a87d640618d689d6aadf04782f40a8172011..0787af3383d91074ae60c214f096923b8fc891d9 100644 --- a/mace/core/logging.h +++ b/mace/core/logging.h @@ -9,8 +9,6 @@ #include #include -#include "mace/core/integral_types.h" - #undef ERROR namespace mace { @@ -62,7 +60,7 @@ class LogMessage : public std::basic_ostringstream { // Returns the minimum log level for VLOG statements. // E.g., if MinVLogLevel() is 2, then VLOG(2) statements will produce output, // but VLOG(3) will not. Defaults to 0. - static int64 MinVLogLevel(); + static int64_t MinVLogLevel(); protected: void GenerateLogMessage(); diff --git a/mace/core/net.cc b/mace/core/net.cc index 22956876262c5b16a042e9e1289e6fd4fc73dca6..a8f1f80e17e2f150ff5b9dda32df3c03e97e8014 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -37,6 +37,7 @@ bool SimpleNet::Run() { return false; } } + return true; } unique_ptr CreateNet(const NetDef& net_def, diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 2e5086ac222a70503bf655ff9d92557369beccb4..a755577b65b7d3c5c80dd1da50b6dd4d256bccf8 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -6,8 +6,8 @@ namespace mace { -std::map* gDeviceTypeRegistry() { - static std::map g_device_type_registry; +std::map* gDeviceTypeRegistry() { + static std::map g_device_type_registry; return &g_device_type_registry; } diff --git a/mace/core/operator.h b/mace/core/operator.h index fc883855f4e7eb6f72a6c9340fb54584bcad3afe..df488691bac3fd8de6aea9c98a7175f80b50f41e 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -44,7 +44,7 @@ class OperatorBase { *operator_def_, name, default_value); } - inline const Tensor *Input(int idx) { + inline const Tensor *Input(index_t idx) { MACE_CHECK(idx < inputs_.size()); return inputs_[idx]; } diff --git a/mace/core/serializer.cc b/mace/core/serializer.cc index 310e76299db02eb8dae9fa2032f65a5cccd1c6e2..3e80e545b2a0aa23eb26906f588c9713beba046e 100644 --- a/mace/core/serializer.cc +++ b/mace/core/serializer.cc @@ -17,8 +17,8 @@ unique_ptr Serializer::Deserialize(const TensorProto &proto, DeviceType type) { unique_ptr tensor(new Tensor(GetDeviceAllocator(type), proto.data_type())); - vector dims; - for (const TIndex d : proto.dims()) { + vector dims; + for (const index_t d : proto.dims()) { dims.push_back(d); } tensor->Resize(dims); @@ -33,31 +33,31 @@ unique_ptr Serializer::Deserialize(const TensorProto &proto, proto.double_data().size()); break; case DT_INT32: - tensor->template Copy(proto.int32_data().data(), + tensor->template Copy(proto.int32_data().data(), proto.int32_data().size()); break; case DT_UINT8: - tensor->CopyWithCast(proto.int32_data().data(), + tensor->CopyWithCast(proto.int32_data().data(), proto.int32_data().size()); break; case DT_INT16: - tensor->CopyWithCast(proto.int32_data().data(), + tensor->CopyWithCast(proto.int32_data().data(), proto.int32_data().size()); break; case DT_INT8: - tensor->CopyWithCast(proto.int32_data().data(), + tensor->CopyWithCast(proto.int32_data().data(), proto.int32_data().size()); break; case DT_INT64: - tensor->Copy(proto.int64_data().data(), + tensor->Copy(proto.int64_data().data(), proto.int64_data().size()); break; case DT_UINT16: - tensor->CopyWithCast(proto.int32_data().data(), + tensor->CopyWithCast(proto.int32_data().data(), proto.int32_data().size()); break; case DT_BOOL: - tensor->CopyWithCast(proto.int32_data().data(), + tensor->CopyWithCast(proto.int32_data().data(), proto.int32_data().size()); break; case DT_STRING: { diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 7aea8b1567d74d9d5c4e8f3f809de174e599e612..1af32d3f2338f344332375d9cb67cbe23a4f119d 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -25,13 +25,13 @@ namespace mace { switch (TYPE_ENUM) { \ CASE(float, SINGLE_ARG(STMTS)) \ CASE(double, SINGLE_ARG(STMTS)) \ - CASE(int32, SINGLE_ARG(STMTS)) \ - CASE(uint8, SINGLE_ARG(STMTS)) \ - CASE(uint16, SINGLE_ARG(STMTS)) \ - CASE(int16, SINGLE_ARG(STMTS)) \ - CASE(int8, SINGLE_ARG(STMTS)) \ + CASE(int32_t, SINGLE_ARG(STMTS)) \ + CASE(uint8_t, SINGLE_ARG(STMTS)) \ + CASE(uint16_t, SINGLE_ARG(STMTS)) \ + CASE(int16_t, SINGLE_ARG(STMTS)) \ + CASE(int8_t, SINGLE_ARG(STMTS)) \ CASE(string, SINGLE_ARG(STMTS)) \ - CASE(int64, SINGLE_ARG(STMTS)) \ + CASE(int64_t, SINGLE_ARG(STMTS)) \ CASE(bool, SINGLE_ARG(STMTS)) \ case DT_INVALID: \ INVALID; \ @@ -64,11 +64,17 @@ class Tensor { inline DataType dtype() const { return dtype_; } - inline const vector& shape() const { return shape_; } + inline const vector& shape() const { return shape_; } - inline TIndex dim_size() { return shape_.size(); } + inline index_t dim_size() const { return shape_.size(); } - inline TIndex size() const { return size_; } + inline index_t dim(index_t index) const { + MACE_CHECK(index < shape_.size(), "Exceeding ndim limit"); + MACE_CHECK(index >= 0, "Cannot have negative dimension index"); + return shape_[index]; + } + + inline index_t size() const { return size_; } inline const void* raw_data() const { MACE_CHECK(data_.get() || size_ == 0); @@ -102,9 +108,9 @@ class Tensor { return static_cast(raw_mutable_data()); } - inline void Resize(const vector& shape) { + inline void Resize(const vector& shape) { shape_ = shape; - TIndex size = NumElements(); + index_t size = NumElements(); if (size_ != size) { size_ = size; data_.reset(); @@ -120,16 +126,16 @@ class Tensor { } template - inline void Copy(const T* src, size_t size) { + inline void Copy(const T* src, index_t size) { MACE_CHECK(size == size_, "copy src and dst with different size."); CopyBytes(static_cast(src), sizeof(T) * size); } template inline void CopyWithCast(const SrcType* src, size_t size) { - MACE_CHECK(size == size_, "copy src and dst with different size."); + MACE_CHECK(static_cast(size) == size_, "copy src and dst with different size."); unique_ptr buffer(new DstType[size]); - for (int i = 0; i < size; ++i) { + for (size_t i = 0; i < size; ++i) { buffer[i] = static_cast(src[i]); } CopyBytes(static_cast(buffer.get()), sizeof(DstType) * size); @@ -155,15 +161,15 @@ class Tensor { } private: - inline int64 NumElements() const { - return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies()); + inline int64_t NumElements() const { + return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies()); } Allocator* alloc_; - TIndex size_; + index_t size_; DataType dtype_; std::shared_ptr data_; - vector shape_; + vector shape_; }; } // namespace tensor diff --git a/mace/core/testing/env_time.h b/mace/core/testing/env_time.h index 6be189a658ab489fdf59fcc4f666c71574ad468b..f07783c1f66e4551886276e30796001ae1fc1a52 100644 --- a/mace/core/testing/env_time.h +++ b/mace/core/testing/env_time.h @@ -16,10 +16,10 @@ namespace mace { namespace testing { -inline int64 NowMicros() { +inline int64_t NowMicros() { struct timeval tv; gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; } } // namespace testing diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc index 1eb976ec998833ffcf436dcf4744566d460f168f..885a9a63f70956428008291f29dc293245c7d37a 100644 --- a/mace/core/testing/test_benchmark.cc +++ b/mace/core/testing/test_benchmark.cc @@ -16,10 +16,10 @@ namespace testing { static std::vector* all_benchmarks = nullptr; static std::string label; -static int64 bytes_processed; -static int64 items_processed; -static int64 accum_time = 0; -static int64 start_time = 0; +static int64_t bytes_processed; +static int64_t items_processed; +static int64_t accum_time = 0; +static int64_t start_time = 0; Benchmark::Benchmark(const char* name, void (*fn)(int)) : name_(name), num_args_(0), fn0_(fn) { @@ -112,10 +112,10 @@ void Benchmark::Register() { } void Benchmark::Run(int arg1, int arg2, int* run_count, double* run_seconds) { - static const int64 kMinIters = 100; - static const int64 kMaxIters = 1000000000; + static const int64_t kMinIters = 100; + static const int64_t kMaxIters = 1000000000; static const double kMinTime = 0.5; - int64 iters = kMinIters; + int64_t iters = kMinIters; while (true) { accum_time = 0; start_time = NowMicros(); @@ -142,13 +142,13 @@ void Benchmark::Run(int arg1, int arg2, int* run_count, double* run_seconds) { double multiplier = 1.4 * kMinTime / std::max(seconds, 1e-9); multiplier = std::min(10.0, multiplier); if (multiplier <= 1.0) multiplier *= 2.0; - iters = std::max(multiplier * iters, iters + 1); + iters = std::max(multiplier * iters, iters + 1); iters = std::min(iters, kMaxIters); } } -void BytesProcessed(int64 n) { bytes_processed = n; } -void ItemsProcessed(int64 n) { items_processed = n; } +void BytesProcessed(int64_t n) { bytes_processed = n; } +void ItemsProcessed(int64_t n) { items_processed = n; } void StartTiming() { if (start_time == 0) start_time = NowMicros(); } diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h index 44a352f54df4cae609b3955eb5343c9b78d34126..5800f5edb0912899b09fc95ebebb8a741e2a48e1 100644 --- a/mace/core/testing/test_benchmark.h +++ b/mace/core/testing/test_benchmark.h @@ -42,8 +42,8 @@ class Benchmark { }; void RunBenchmarks(); -void BytesProcessed(int64); -void ItemsProcessed(int64); +void BytesProcessed(int64_t); +void ItemsProcessed(int64_t); void StartTiming(); void StopTiming(); diff --git a/mace/core/types.h b/mace/core/types.h index 161be5a7103a9c8c69be3932b9997e7dbee51124..b174993d024587875d6b597cdcb7a19f9d79d154 100644 --- a/mace/core/types.h +++ b/mace/core/types.h @@ -42,16 +42,16 @@ struct EnumToDataType {}; // Specializations below MATCH_TYPE_AND_ENUM(float, DT_FLOAT); MATCH_TYPE_AND_ENUM(double, DT_DOUBLE); -MATCH_TYPE_AND_ENUM(int32, DT_INT32); -MATCH_TYPE_AND_ENUM(uint16, DT_UINT16); -MATCH_TYPE_AND_ENUM(uint8, DT_UINT8); -MATCH_TYPE_AND_ENUM(int16, DT_INT16); -MATCH_TYPE_AND_ENUM(int8, DT_INT8); +MATCH_TYPE_AND_ENUM(int32_t, DT_INT32); +MATCH_TYPE_AND_ENUM(uint16_t, DT_UINT16); +MATCH_TYPE_AND_ENUM(uint8_t, DT_UINT8); +MATCH_TYPE_AND_ENUM(int16_t, DT_INT16); +MATCH_TYPE_AND_ENUM(int8_t, DT_INT8); MATCH_TYPE_AND_ENUM(string, DT_STRING); -MATCH_TYPE_AND_ENUM(int64, DT_INT64); +MATCH_TYPE_AND_ENUM(int64_t, DT_INT64); MATCH_TYPE_AND_ENUM(bool, DT_BOOL); -static const int32 kint32max = ((int32)0x7FFFFFFF); +static const int32_t kint32_tmax = ((int32_t)0x7FFFFFFF); } // namespace mace diff --git a/mace/examples/BUILD b/mace/examples/BUILD index 4f4a7794e0cb00f7d8312299dd7572afd74e68d6..82915d74e05e02c0bbddc04163d7c4e53f12f22b 100644 --- a/mace/examples/BUILD +++ b/mace/examples/BUILD @@ -7,10 +7,6 @@ cc_binary( "helloworld.cc", ], copts = ["-std=c++11"], - linkopts = if_android([ - "-pie", - "-llog", - ]), deps = [ "//mace/core", "//mace/ops", @@ -21,10 +17,6 @@ cc_test( name = "benchmark_example", srcs = ["benchmark_example.cc"], copts = ["-std=c++11"], - linkopts = if_android([ - "-pie", - "-llog", - ]), linkstatic = 1, deps = [ "//mace/core", diff --git a/mace/examples/benchmark_example.cc b/mace/examples/benchmark_example.cc index 106c6c3c4ccdf7dcc091a6c9f9bbc8c0c15d2611..50e5184b0bf384d81932466584cdbc688db40a21 100644 --- a/mace/examples/benchmark_example.cc +++ b/mace/examples/benchmark_example.cc @@ -6,7 +6,7 @@ static void foo(int iters) { static const int N = 32; - const int64 tot = static_cast(iters) * N; + const int64_t tot = static_cast(iters) * N; mace::testing::ItemsProcessed(tot); mace::testing::BytesProcessed(tot * (sizeof(float))); @@ -26,7 +26,7 @@ BENCHMARK(foo); static void bar(int iters, int n) { - const int64 tot = static_cast(iters) * n; + const int64_t tot = static_cast(iters) * n; mace::testing::ItemsProcessed(tot); mace::testing::BytesProcessed(tot * (sizeof(float))); diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD index de8293e35421ac29031db5281d162de3999efb78..098e80a949d457406730e0f0146b45c03b75faee 100644 --- a/mace/kernels/BUILD +++ b/mace/kernels/BUILD @@ -18,6 +18,9 @@ cc_library( "//mace/core:core", ], copts = ['-std=c++11'], + linkopts = ["-fopenmp"] + if_android([ + "-lm", + ]), ) cc_test( @@ -29,11 +32,9 @@ cc_test( "//mace/core:core", ], copts = ['-std=c++11'], - linkopts = ["-fopenmp"] + if_android([ + linkopts = if_android([ "-pie", - "-llog", - "-lm", - ]), + ]), linkstatic = 1, testonly = 1, ) @@ -47,11 +48,6 @@ cc_test( "//mace/core:test_benchmark_main", ], copts = ['-std=c++11'], - linkopts = ["-fopenmp"] + if_android([ - "-pie", - "-llog", - "-lm", - ]), linkstatic = 1, testonly = 1, ) diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 3f79ac69b468c623acbe2cdb6d9179bbe3906bda..30648eb8a15186198ec8b2c9fb98c04695bf4366 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -15,7 +15,7 @@ void AddNFuntion(const vector& input_tensor, Tensor *output_tenso int n = input_tensor.size(); MACE_CHECK(n > 1); MACE_CHECK_NOTNULL(input_tensor[0]); - int64 size = input_tensor[0]->size(); + int64_t size = input_tensor[0]->size(); vector inputs(n); for (int i = 0; i < n; ++i) { inputs[i] = input_tensor[i]->data(); @@ -24,7 +24,7 @@ void AddNFuntion(const vector& input_tensor, Tensor *output_tenso T* output = output_tensor->mutable_data(); for (int i = 0; i < n; ++i) { - for (int64 j = 0; j < size; ++j) { + for (int64_t j = 0; j < size; ++j) { output[j] += inputs[i][j]; } } diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..84ca48d4a76bc477258ce0d9ec152d5f313709a9 --- /dev/null +++ b/mace/kernels/batch_norm.h @@ -0,0 +1,66 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_KERNELS_BATCH_NORM_H_ +#define MACE_KERNELS_BATCH_NORM_H_ + +#include "mace/core/tensor.h" +#include "mace/proto/mace.pb.h" + +namespace mace { +namespace kernels { + +template +struct BatchNormFunctorBase { + BatchNormFunctorBase(const float variance_epsilon) + :variance_epsilon_(variance_epsilon){} + + float variance_epsilon_; +}; + + +template +struct BatchNormFunctor : public BatchNormFunctorBase { + BatchNormFunctor(const float variance_epsilon) + :BatchNormFunctorBase(variance_epsilon){} + + void operator()(const T* input, + const T* scale, + const T* offset, + const T* mean, + const T* var, + const index_t n, + const index_t channel, + const index_t sample_size, + T* output) { + // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . + // The calculation formula for inference is + // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + + // ( \offset - \frac { \scale * mean } { \sqrt{var+\variance_epsilon} } + // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } + // new_offset = \offset - mean * common_val; + // Y = new_scale * X + new_offset; + T new_scale, new_offset; + for (index_t c = 0; c < channel; ++c) { + new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_); + new_offset = offset[c] - mean[c] * new_scale; + index_t pos = c * sample_size; + + for (index_t i = 0; i < n; ++i) { + const T* input_sample_ptr = input + pos; + T* output_sample_ptr = output + pos; + for (index_t j = 0; j < sample_size; ++j) { + output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset; + } + pos += channel * sample_size; + } + } + } + +}; + +} // namepsace kernels +} // namespace mace + +#endif // MACE_KERNELS_BATCH_NORM_H_ diff --git a/mace/kernels/benchmark/addn_benchmark.cc b/mace/kernels/benchmark/addn_benchmark.cc index f63fed77b11847f3aacca8291c333699b0bd840a..4cec0270dbc6e1d9f55eb0db404965d9d1f1088e 100644 --- a/mace/kernels/benchmark/addn_benchmark.cc +++ b/mace/kernels/benchmark/addn_benchmark.cc @@ -11,7 +11,7 @@ using namespace mace; using namespace mace::kernels; static void AddNBenchmark(int iters, int n, int type) { - const int64 tot = static_cast(iters) * n * 3; + const int64_t tot = static_cast(iters) * n * 3; mace::testing::ItemsProcessed(tot); mace::testing::BytesProcessed(tot * (sizeof(float))); @@ -35,7 +35,7 @@ static void AddNBenchmark(int iters, int n, int type) { float *input3 = input_tensor3.mutable_data(); float *output = output_tensor.mutable_data(); - for (int64 i = 0; i < n; ++i) { + for (int64_t i = 0; i < n; ++i) { input1[i] = nd(gen); input2[i] = nd(gen); input3[i] = nd(gen); diff --git a/mace/kernels/benchmark/relu_benchmark.cc b/mace/kernels/benchmark/relu_benchmark.cc index 9276cadc737bba60a0fac81893dd5aa797d3f6a9..86858681ca29518f6ed98e46f58794d82c984057 100644 --- a/mace/kernels/benchmark/relu_benchmark.cc +++ b/mace/kernels/benchmark/relu_benchmark.cc @@ -11,7 +11,7 @@ using namespace mace; using namespace mace::kernels; static void ReluBenchmark(int iters, int n, int type) { - const int64 tot = static_cast(iters) * n; + const int64_t tot = static_cast(iters) * n; mace::testing::ItemsProcessed(tot); mace::testing::BytesProcessed(tot * (sizeof(float))); @@ -25,7 +25,7 @@ static void ReluBenchmark(int iters, int n, int type) { output_tensor.ResizeLike(input_tensor); float *input = input_tensor.mutable_data(); float *output = output_tensor.mutable_data(); - for (int64 i = 0; i < n; ++i) { + for (int64_t i = 0; i < n; ++i) { input[i] = nd(gen); } diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc index 3baab3c3b3dadb8570e0f7b4830fd9c14c1799fa..ad6f06e8df7c17dc189316a20be3be5586a212e6 100644 --- a/mace/kernels/neon/addn_neon.cc +++ b/mace/kernels/neon/addn_neon.cc @@ -14,7 +14,7 @@ void NeonAddNFuntion_float(const vector &input_tensor, int n = input_tensor.size(); MACE_CHECK(n > 1); MACE_CHECK_NOTNULL(input_tensor[0]); - int64 size = input_tensor[0]->size(); + int64_t size = input_tensor[0]->size(); output_tensor->ResizeLike(input_tensor[0]); float *output = output_tensor->mutable_data(); vector inputs(n); @@ -22,19 +22,19 @@ void NeonAddNFuntion_float(const vector &input_tensor, inputs[i] = input_tensor[i]->data(); } - int64 cost = size * n; - int64 groups = 1; + int64_t cost = size * n; + int64_t groups = 1; if (cost > kCostPerGroup) { groups = cost / kCostPerGroup; } - int64 element_per_group = size / groups; + int64_t element_per_group = size / groups; #pragma omp parallel for num_threads(1) // no significant performance improve - for (int64 i = 0; i < size; i += element_per_group) { - int64 count = std::min(element_per_group, size - i); + for (int64_t i = 0; i < size; i += element_per_group) { + int64_t count = std::min(element_per_group, size - i); int nn = count >> 2; int remain = count - (nn << 2); - for (int64 j = 0; j < n; ++j) { + for (int64_t j = 0; j < n; ++j) { const float *inptr = inputs[j] + i; float *outptr = output + i; for (int k = 0; k < nn; ++k) { diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc new file mode 100644 index 0000000000000000000000000000000000000000..a306fdbc804e0c5995846fa89dd5bb681d31e1ed --- /dev/null +++ b/mace/kernels/neon/batch_norm_neon.cc @@ -0,0 +1,69 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#if __ARM_NEON +#include +#include "mace/kernels/batch_norm.h" + +namespace mace { +namespace kernels { + +template +struct BatchNormFunctor : public BatchNormFunctorBase { + BatchNormFunctor(const float variance_epsilon) + :BatchNormFunctorBase(variance_epsilon){} + + void operator()(const T* input, + const T* scale, + const T* offset, + const T* mean, + const T* var, + const int n, + const int channel, + const int sample_size, + T* output) { + + // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . + // The calculation formula for inference is + // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + + // ( \offset - \frac { \scale * mean } { \sqrt{var+\variance_epsilon} } + // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } + // new_offset = \offset - mean * common_val; + // Y = new_scale * X + new_offset; + T new_scale, new_offset; + int count = sample_size >> 2; + int remain_count = sample_size - count; + for (index_t c = 0; c < channel; ++c) { + new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_); + new_offset = offset[c] - mean[c] * new_scale; + index_t pos = c * sample_size; + + float32x4_t new_scale_f = vdupq_n_f32(new_scale); + float32x4_t new_offset_f = vdupq_n_f32(new_offset); + for (index_t i = 0; i < n; ++i) { + const float* input_sample_ptr = input + pos; + float* output_sample_ptr = output + pos; + + for(index_t j = 0; j < count; ++j) { + float32x4_t input_f = vld1q_f32(input_sample_ptr); + float32x4_t output_f = new_offset_f; + output_f = vfmaq_f32(output_f, input_f, new_scale_f); + vst1q_f32(output_sample_ptr, output_f); + input_sample_ptr += 4; + output_sample_ptr += 4; + } + for(index_t j = 0; j < remain_count; ++j) { + *output_sample_ptr = new_scale * *input_sample_ptr + new_offset; + ++output_sample_ptr; + ++input_sample_ptr; + } + pos += channel * sample_size; + } + } + } +}; + +} // namespace kernels +} // namespace mace +#endif // __ARM_NEON diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc index 29c4e354a783f945f25e25ce9de75b879776f737..e487081891ab7d6bd7a039c5429442d0a7641d1a 100644 --- a/mace/kernels/neon/relu_neon.cc +++ b/mace/kernels/neon/relu_neon.cc @@ -10,14 +10,14 @@ namespace kernels { void NeonReluFuntion_float(const Tensor *input_tensor, Tensor *output_tensor) { - int64 size = input_tensor->size(); + int64_t size = input_tensor->size(); output_tensor->ResizeLike(input_tensor); const float *input = input_tensor->data(); float *output = output_tensor->mutable_data(); #pragma omp parallel for num_threads(1) // no significant performance improve - for (int64 i = 0; i < size; i += kCostPerGroup) { - int64 count = std::min(static_cast(kCostPerGroup), size - i); + for (int64_t i = 0; i < size; i += kCostPerGroup) { + int64_t count = std::min(static_cast(kCostPerGroup), size - i); int nn = count >> 2; int remain = count - (nn << 2); const float *inptr = input + i; diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h index 086f762b41e85c3ff7042086ba1b56d3607d30c2..d0de2f0b061524537479c9082ca250fba47e6c29 100644 --- a/mace/kernels/relu.h +++ b/mace/kernels/relu.h @@ -12,12 +12,12 @@ namespace kernels { template void ReluFuntion(const Tensor *input_tensor, Tensor *output_tensor) { - int64 size = input_tensor->size(); + int64_t size = input_tensor->size(); output_tensor->ResizeLike(input_tensor); const T *input = input_tensor->data(); T *output = output_tensor->mutable_data(); - for (int64 i = 0; i < size; ++i) { + for (int64_t i = 0; i < size; ++i) { output[i] = std::max(input[i], static_cast(0)); } } diff --git a/mace/kernels/test/addn_neon_test.cc b/mace/kernels/test/addn_neon_test.cc index 8d1ca924b9b3ef8fecf96301007afa593cd54600..521fe9129b64a1e8f646c2124b3f56de32af677a 100644 --- a/mace/kernels/test/addn_neon_test.cc +++ b/mace/kernels/test/addn_neon_test.cc @@ -15,7 +15,7 @@ TEST(NeonTest, AddN) { std::mt19937 gen(rd()); std::normal_distribution nd(0, 1); - int64 count = 100000; + int64_t count = 100000; Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT); input_tensor1.Resize({100, 1000}); Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT); @@ -37,7 +37,7 @@ TEST(NeonTest, AddN) { float *output = output_tensor.mutable_data(); float *output_neon = output_tensor_neon.mutable_data(); - for (int64 i = 0; i < count; ++i) { + for (int64_t i = 0; i < count; ++i) { input1[i] = nd(gen); input2[i] = nd(gen); input3[i] = nd(gen); @@ -48,7 +48,7 @@ TEST(NeonTest, AddN) { ASSERT_EQ(count, output_tensor.size()); ASSERT_EQ(count, output_tensor_neon.size()); - for (int64 i = 0; i < count; ++i) { + for (int64_t i = 0; i < count; ++i) { ASSERT_FLOAT_EQ(output[i], output_neon[i]); } } diff --git a/mace/kernels/test/relu_neon_test.cc b/mace/kernels/test/relu_neon_test.cc index 40c1bc62d68a94820ac99d1140203c24dd412235..a16dc2692501017a494d25d5af9dab73be8c44db 100644 --- a/mace/kernels/test/relu_neon_test.cc +++ b/mace/kernels/test/relu_neon_test.cc @@ -15,7 +15,7 @@ TEST(NeonTest, Relu) { std::mt19937 gen(rd()); std::normal_distribution nd(0, 1); - int64 count = 100000; + int64_t count = 100000; Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT); input_tensor.Resize({100, 1000}); Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT); @@ -27,7 +27,7 @@ TEST(NeonTest, Relu) { float *output = output_tensor.mutable_data(); float *output_neon = output_tensor_neon.mutable_data(); - for (int64 i = 0; i < count; ++i) { + for (int64_t i = 0; i < count; ++i) { input[i] = nd(gen); } @@ -36,7 +36,7 @@ TEST(NeonTest, Relu) { ASSERT_EQ(count, output_tensor.size()); ASSERT_EQ(count, output_tensor_neon.size()); - for (int64 i = 0; i < count; ++i) { + for (int64_t i = 0; i < count; ++i) { ASSERT_FLOAT_EQ(output[i], output_neon[i]); } } diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 09a00ab3445025528a82129704be9005161dc141..3c09daa38d9e9af7f2bc5bd8e139ed61ec8dab81 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -9,10 +9,28 @@ licenses(["notice"]) # Apache 2.0 load("//mace:mace.bzl", "if_android") +cc_library( + name = "test", + testonly = 1, + hdrs = [ + "ops_test_util.h", + ], + deps = [ + "//mace/core", + "@gtest//:gtest", + ], +) + cc_library( name = "ops", - srcs = glob(["*.cc"]), - hdrs = glob(["*.h"]), + srcs = glob( + ["*.cc"], + exclude = ["*_test.cc"], + ), + hdrs = glob( + ["*.h"], + exclude = ["ops_test_util.h"], + ), copts = ["-std=c++11"], deps = [ "//mace/core", @@ -53,3 +71,15 @@ cc_test( "@gtest//:gtest_main", ], ) + +cc_test( + name = "batch_norm_test", + srcs = ["batch_norm_test.cc"], + copts = ["-std=c++11"], + linkstatic = 1, + deps = [ + ":ops", + ":test", + "@gtest//:gtest_main", + ], +) diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc new file mode 100644 index 0000000000000000000000000000000000000000..9a48b669f5a1f97f9a15059cf76331740e7f943c --- /dev/null +++ b/mace/ops/batch_norm.cc @@ -0,0 +1,15 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/ops/batch_norm.h" + +namespace mace { + +REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp); + +#if __ARM_NEON +REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp); +#endif // __ARM_NEON + +} // namespace mace \ No newline at end of file diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..59c227c865b519b81c7e6d818a052336acd2e570 --- /dev/null +++ b/mace/ops/batch_norm.h @@ -0,0 +1,59 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_BATCH_NORM_H_ +#define MACE_BATCH_NORM_H_ + +#include "mace/core/operator.h" +#include "mace/kernels/batch_norm.h" + +namespace mace { + +template +class BatchNormOp : public Operator { + public: + BatchNormOp(const OperatorDef &operator_def, Workspace *ws) + : Operator(operator_def, ws), + functor_(OperatorBase::GetSingleArgument("variance_epsilon", 1e-4)){} + + bool Run() override { + const Tensor* input = this->Input(0); + const Tensor* scale = this->Input(1); + const Tensor* offset = this->Input(2); + const Tensor* mean = this->Input(3); + const Tensor* var = this->Input(4); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", input->dim_size()); + MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", scale->dim_size()); + MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", offset->dim_size()); + MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", mean->dim_size()); + MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", var->dim_size()); + + Tensor* output = this->Output(0); + output->ResizeLike(input); + + const index_t n = input->dim(0); + const index_t channel = input->dim(1); + const index_t sample_size = input->dim(2) * input->dim(3); + + const float* input_ptr = input->data(); + const float* scale_ptr = scale->data(); + const float* offset_ptr = offset->data(); + const float* mean_ptr = mean->data(); + const float* var_ptr = var->data(); + float* output_ptr = output->mutable_data(); + + functor_(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr, + n, channel, sample_size, + output_ptr); + return true; + } + private: + kernels::BatchNormFunctor functor_; + +}; + +} // namespace mace + +#endif // MACE_BATCH_NORM_H_ diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b52d0590dbdb77b1ef8c5a35215b7c6a9582ef1 --- /dev/null +++ b/mace/ops/batch_norm_test.cc @@ -0,0 +1,46 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/operator.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { + +class BatchNormOpTest : public OpsTestBase {}; + +TEST_F(BatchNormOpTest, Simple) { + // Construct graph + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .Output("Output") + .Finalize(operator_def()); + + // Add input data + AddInputFromArray("Input", {1, 1, 6, 2}, + {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); + AddInputFromArray("Scale", {2}, + {4.0f, 4.0f}); + AddInputFromArray("Offset", {2}, + {2.0, 2.0}); + AddInputFromArray("Mean", {2}, + {10, 10}); + AddInputFromArray("Var", {2}, + {11.67f, 11.67f}); + + // Run + RunOp(); + + // Check + Tensor expected = CreateTensor({1, 1, 6, 2}, + {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, + 3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); + + ExpectTensorNear(expected, *GetOutput("Output"), 0.01); +} + +} diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h new file mode 100644 index 0000000000000000000000000000000000000000..0e96943c60085014bc01c65323882ebc0480249e --- /dev/null +++ b/mace/ops/ops_test_util.h @@ -0,0 +1,170 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_OPS_TEST_UTIL_H_ +#define MACE_OPS_TEST_UTIL_H_ + +#include "gtest/gtest.h" +#include "mace/core/common.h" +#include "mace/core/tensor.h" +#include "mace/core/net.h" + +namespace mace { + +class OpDefBuilder { + public: + OpDefBuilder(const char* type, const char* name) { + op_def_.set_type(type); + op_def_.set_name(name); + } + OpDefBuilder& Input(const char* input_name) { + op_def_.add_input(input_name); + return *this; + } + OpDefBuilder& Output(const char* output_name) { + op_def_.add_output(output_name); + return *this; + } + void Finalize(OperatorDef* op_def) const { + MACE_CHECK(op_def != NULL, "input should not be null."); + *op_def = op_def_; + } + OperatorDef op_def_; +}; + +class OpsTestBase : public ::testing::Test { + protected: + virtual void TearDown() { + auto tensor_names = ws_.Tensors(); + for (auto& name : tensor_names) { + ws_.RemoveTensor(name); + } + } + public: + template + void AddInputFromArray(const char* name, const std::vector& shape, const std::vector& data) { + Tensor* input = ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum::v()); + input->Resize(shape); + float* input_data = input->mutable_data(); + memcpy(input_data, data.data(), data.size() * sizeof(T)); + } + + OperatorDef* operator_def() { return &op_def_; } + + bool RunOp() { + NetDef net_def; + net_def.add_op()->CopyFrom(op_def_); + VLOG(0) << net_def.DebugString(); + auto net = CreateNet(net_def, &ws_, DeviceType::CPU); + return net->Run(); + } + + Tensor* GetOutput(const char* output_name) { + return ws_.GetTensor(output_name); + } + + private: + Workspace ws_; + OperatorDef op_def_; +}; + +template +Tensor CreateTensor(const std::vector& shape, const std::vector& data) { + Tensor res(cpu_allocator(), DataTypeToEnum::v()); + res.Resize(shape); + float* input_data = res.mutable_data(); + memcpy(input_data, data.data(), data.size() * sizeof(T)); + return res; +} + +inline bool IsSameSize(const Tensor& x, const Tensor& y) { + if (x.dim_size() != y.dim_size()) return false; + for (int d = 0; d < x.dim_size(); ++d) { + if (x.dim(d) != y.dim(d)) return false; + } + return true; +} + +inline std::string ShapeToString(const Tensor& x) { + std::stringstream stream; + for (int i = 0; i < x.dim_size(); i++) { + if (i > 0) stream<<","; + int64_t dim = x.dim(i); + if (dim < 0) { + stream<<"?"; + } else { + stream< +struct is_floating_point_type { + static const bool value = std::is_same::value || + std::is_same::value; +}; + +template +inline void ExpectEqual(const T& a, const T& b) { + EXPECT_EQ(a, b); +} + +template <> +inline void ExpectEqual(const float& a, const float& b) { + EXPECT_FLOAT_EQ(a, b); +} + +template <> +inline void ExpectEqual(const double& a, const double& b) { + EXPECT_DOUBLE_EQ(a, b); +} + +inline void AssertSameTypeDims(const Tensor& x, const Tensor& y) { + ASSERT_EQ(x.dtype(), y.dtype()); + ASSERT_TRUE(IsSameSize(x, y)) + << "x.shape [" << ShapeToString(x) << "] vs " + << "y.shape [ " << ShapeToString(y) << "]"; +} + +template ::value> +struct Expector; +// Partial specialization for float and double. +template +struct Expector { + static void Equal(const T& a, const T& b) { ExpectEqual(a, b); } + + static void Equal(const Tensor& x, const Tensor& y) { + ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); + AssertSameTypeDims(x, y); + auto a = x.data(); + auto b = y.data(); + for (int i = 0; i < x.size(); ++i) { + ExpectEqual(a(i), b(i)); + } + } + + static void Near(const Tensor& x, const Tensor& y, const double abs_err) { + ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); + AssertSameTypeDims(x, y); + auto a = x.data(); + auto b = y.data(); + for (int i = 0; i < x.size(); ++i) { + EXPECT_NEAR(a[i], b[i], abs_err) + << "a = " << a << " b = " << b << " index = " << i; + } + } +}; + +template +void ExpectTensorNear(const Tensor& x, const Tensor& y, const double abs_err) { + static_assert(is_floating_point_type::value, "T is not a floating point type"); + Expector::Near(x, y ,abs_err); +} + +} // namespace mace + +#endif // MACE_OPS_TEST_UTIL_H_