Add anonymous namespace for local variables and functions.

c63ba922 · liuqi · 13b9e55c · c63ba922 · c63ba922 · c63ba922
59 changed files
--- a/mace/core/arg_helper.cc
+++ b/mace/core/arg_helper.cc
@@ -28,7 +28,7 @@ ArgumentHelper::ArgumentHelper(const NetDef &netdef) {
  }
 }

-bool ArgumentHelper::HasArgument(const string &name) const {
+bool ArgumentHelper::HasArgument(const std::string &name) const {
  return arg_map_.count(name);
 }

@@ -44,7 +44,7 @@ bool SupportsLosslessConversion(const InputType &value) {
 #define INSTANTIATE_GET_SINGLE_ARGUMENT(T, fieldname,                         \
                                        enforce_lossless_conversion)          \
  template <>                                                                 \
-  T ArgumentHelper::GetSingleArgument<T>(const string &name,                  \
+  T ArgumentHelper::GetSingleArgument<T>(const std::string &name,             \
                                         const T &default_value) const {      \
    if (arg_map_.count(name) == 0) {                                          \
      VLOG(3) << "Using default parameter value " << default_value            \
@@ -63,7 +63,8 @@ bool SupportsLosslessConversion(const InputType &value) {
    return value;                                                             \
  }                                                                           \
  template <>                                                                 \
-  bool ArgumentHelper::HasSingleArgumentOfType<T>(const string &name) const { \
+  bool ArgumentHelper::HasSingleArgumentOfType<T>(                            \
+      const std::string &name) const {                                        \
    if (arg_map_.count(name) == 0) {                                          \
      return false;                                                           \
    }                                                                         \
@@ -80,28 +81,28 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i, true)
 INSTANTIATE_GET_SINGLE_ARGUMENT(uint8_t, i, true)
 INSTANTIATE_GET_SINGLE_ARGUMENT(uint16_t, i, true)
 INSTANTIATE_GET_SINGLE_ARGUMENT(size_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, false)
+INSTANTIATE_GET_SINGLE_ARGUMENT(std::string, s, false)
 #undef INSTANTIATE_GET_SINGLE_ARGUMENT

-#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname,                   \
-                                          enforce_lossless_conversion)    \
-  template <>                                                             \
-  std::vector<T> ArgumentHelper::GetRepeatedArgument<T>(                  \
-      const string &name, const std::vector<T> &default_value) const {    \
-    if (arg_map_.count(name) == 0) {                                      \
-      return default_value;                                               \
-    }                                                                     \
-    std::vector<T> values;                                                \
-    for (const auto &v : arg_map_.at(name).fieldname()) {                 \
-      if (enforce_lossless_conversion) {                                  \
-        auto supportsConversion =                                         \
-            SupportsLosslessConversion<decltype(v), T>(v);                \
-        MACE_CHECK(supportsConversion, "Value", v, " of argument ", name, \
-                   "cannot be represented correctly in a target type");   \
-      }                                                                   \
-      values.push_back(v);                                                \
-    }                                                                     \
-    return values;                                                        \
+#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname,                     \
+                                          enforce_lossless_conversion)      \
+  template <>                                                               \
+  std::vector<T> ArgumentHelper::GetRepeatedArgument<T>(                    \
+      const std::string &name, const std::vector<T> &default_value) const { \
+    if (arg_map_.count(name) == 0) {  /* argument absent: use default */    \
+      return default_value;                                                 \
+    }                                                                       \
+    std::vector<T> values;                                                  \
+    for (const auto &v : arg_map_.at(name).fieldname()) {                   \
+      if (enforce_lossless_conversion) {  /* reject values T cannot hold */ \
+        auto supportsConversion =                                           \
+            SupportsLosslessConversion<decltype(v), T>(v);                  \
+        MACE_CHECK(supportsConversion, "Value ", v, " of argument ", name,  \
+                   " cannot be represented correctly in a target type");    \
+      }                                                                     \
+      values.push_back(v);                                                  \
+    }                                                                       \
+    return values;                                                          \
 }

 INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats, false)
@@ -114,7 +115,7 @@ INSTANTIATE_GET_REPEATED_ARGUMENT(int64_t, ints, true)
 INSTANTIATE_GET_REPEATED_ARGUMENT(uint8_t, ints, true)
 INSTANTIATE_GET_REPEATED_ARGUMENT(uint16_t, ints, true)
 INSTANTIATE_GET_REPEATED_ARGUMENT(size_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings, false)
+INSTANTIATE_GET_REPEATED_ARGUMENT(std::string, strings, false)
 #undef INSTANTIATE_GET_REPEATED_ARGUMENT

 }  // namespace mace
--- a/mace/core/arg_helper.h
+++ b/mace/core/arg_helper.h
@@ -14,8 +14,6 @@

 namespace mace {

-using std::string;
-
 /**
 * @brief A helper class to index into arguments.
 *
@@ -27,45 +25,45 @@ using std::string;
 class ArgumentHelper {
 public:
  template <typename Def>
-  static bool HasArgument(const Def &def, const string &name) {
+  static bool HasArgument(const Def &def, const std::string &name) {
    return ArgumentHelper(def).HasArgument(name);
  }

  template <typename Def, typename T>
  static T GetSingleArgument(const Def &def,
-                             const string &name,
+                             const std::string &name,
                             const T &default_value) {
    return ArgumentHelper(def).GetSingleArgument<T>(name, default_value);
  }

  template <typename Def, typename T>
-  static bool HasSingleArgumentOfType(const Def &def, const string &name) {
+  static bool HasSingleArgumentOfType(const Def &def, const std::string &name) {
    return ArgumentHelper(def).HasSingleArgumentOfType<T>(name);
  }

  template <typename Def, typename T>
  static std::vector<T> GetRepeatedArgument(
      const Def &def,
-      const string &name,
+      const std::string &name,
      const std::vector<T> &default_value = std::vector<T>()) {
    return ArgumentHelper(def).GetRepeatedArgument<T>(name, default_value);
  }

  explicit ArgumentHelper(const OperatorDef &def);
  explicit ArgumentHelper(const NetDef &netdef);
-  bool HasArgument(const string &name) const;
+  bool HasArgument(const std::string &name) const;

  template <typename T>
-  T GetSingleArgument(const string &name, const T &default_value) const;
+  T GetSingleArgument(const std::string &name, const T &default_value) const;
  template <typename T>
-  bool HasSingleArgumentOfType(const string &name) const;
+  bool HasSingleArgumentOfType(const std::string &name) const;
  template <typename T>
  std::vector<T> GetRepeatedArgument(
-      const string &name,
+      const std::string &name,
      const std::vector<T> &default_value = std::vector<T>()) const;

 private:
-  std::map<string, Argument> arg_map_;
+  std::map<std::string, Argument> arg_map_;
 };

 }  // namespace mace

--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -28,10 +28,10 @@ class NetBase {

  virtual bool Run(RunMetadata *run_metadata = nullptr) = 0;

-  const string &Name() const { return name_; }
+  const std::string &Name() const { return name_; }

 protected:
-  string name_;
+  std::string name_;
  const std::shared_ptr<const OperatorRegistry> op_registry_;

  DISABLE_COPY_AND_ASSIGN(NetBase);

--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -25,25 +25,26 @@ class OperatorBase {
  explicit OperatorBase(const OperatorDef &operator_def, Workspace *ws);
  virtual ~OperatorBase() noexcept {}

-  inline bool HasArgument(const string &name) const {
+  inline bool HasArgument(const std::string &name) const {
    MACE_CHECK(operator_def_, "operator_def was null!");
    return ArgumentHelper::HasArgument(*operator_def_, name);
  }
  template <typename T>
-  inline T GetSingleArgument(const string &name, const T &default_value) const {
+  inline T GetSingleArgument(const std::string &name,
+                             const T &default_value) const {
    MACE_CHECK(operator_def_, "operator_def was null!");
    return ArgumentHelper::GetSingleArgument<OperatorDef, T>(
        *operator_def_, name, default_value);
  }
  template <typename T>
-  inline bool HasSingleArgumentOfType(const string &name) const {
+  inline bool HasSingleArgumentOfType(const std::string &name) const {
    MACE_CHECK(operator_def_, "operator_def was null!");
    return ArgumentHelper::HasSingleArgumentOfType<OperatorDef, T>(
        *operator_def_, name);
  }
  template <typename T>
  inline std::vector<T> GetRepeatedArgument(
-      const string &name, const std::vector<T> &default_value = {}) const {
+      const std::string &name, const std::vector<T> &default_value = {}) const {
    MACE_CHECK(operator_def_, "operator_def was null!");
    return ArgumentHelper::GetRepeatedArgument<OperatorDef, T>(
        *operator_def_, name, default_value);
@@ -90,14 +91,14 @@ class Operator : public OperatorBase {
 public:
  explicit Operator(const OperatorDef &operator_def, Workspace *ws)
      : OperatorBase(operator_def, ws) {
-    for (const string &input_str : operator_def.input()) {
+    for (const std::string &input_str : operator_def.input()) {
      const Tensor *tensor = ws->GetTensor(input_str);
      MACE_CHECK(tensor != nullptr, "op ", operator_def.type(),
                 ": Encountered a non-existing input tensor: ", input_str);
      inputs_.push_back(tensor);
    }

-    for (const string &output_str : operator_def.output()) {
+    for (const std::string &output_str : operator_def.output()) {
      if (ws->HasTensor(output_str)) {
        outputs_.push_back(ws->GetTensor(output_str));
      } else {

--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -19,7 +19,7 @@
 namespace mace {
 namespace kernels {

-constexpr int kCostPerGroup = 1024;
+static constexpr int kCostPerGroup = 1024;

 template <DeviceType D, typename T>
 struct AddNFunctor {

--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -11,21 +11,21 @@
 namespace mace {
 namespace kernels {

-void DepthwiseConv2d(cl::Kernel *kernel,
-                     const Tensor *input,   // NHWC
-                     const Tensor *filter,  // HWIM
-                     const Tensor *bias,
-                     const int stride,
-                     const int *paddings,
-                     const int *dilations,
-                     const ActivationType activation,
-                     const float relux_max_limit,
-                     const DataType dt,
-                     std::vector<index_t> *prev_input_shape,
-                     Tensor *output,
-                     StatsFuture *future,
-                     uint32_t *kwg_size,
-                     std::unique_ptr<BufferBase> *kernel_error) {
+static void DepthwiseConv2d(cl::Kernel *kernel,
+                            const Tensor *input,   // NHWC
+                            const Tensor *filter,  // HWIM
+                            const Tensor *bias,
+                            const int stride,
+                            const int *paddings,
+                            const int *dilations,
+                            const ActivationType activation,
+                            const float relux_max_limit,
+                            const DataType dt,
+                            std::vector<index_t> *prev_input_shape,
+                            Tensor *output,
+                            StatsFuture *future,
+                            uint32_t *kwg_size,
+                            std::unique_ptr<BufferBase> *kernel_error) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);

--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -8,6 +8,7 @@
 namespace mace {
 namespace kernels {

+namespace {
 template <typename T>
 void FCWXKernel(cl::Kernel *kernel,
                const Tensor *input,
@@ -268,6 +269,7 @@ void FCWTXKernel(cl::Kernel *kernel,
    (*kernel_error)->UnMap();
  }
 }
+}  // namespace

 template <typename T>
 void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(

--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -14,6 +14,7 @@
 namespace mace {
 namespace kernels {

+namespace {
 // [(C + 3) / 4 * W, N * H]
 void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
                           std::vector<size_t> *image_shape) {
@@ -97,6 +98,7 @@ void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* HW */
  (*image_shape)[0] = RoundUpDiv4(shape[1]);
  (*image_shape)[1] = shape[0];
 }
+}  // namespace

 void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                     const BufferType type,

--- a/mace/kernels/proposal.h
+++ b/mace/kernels/proposal.h
@@ -15,7 +15,7 @@
 namespace mace {
 namespace kernels {

-static std::vector<float> WHCenters(const std::vector<float> &anchor) {
+static inline std::vector<float> WHCenters(const std::vector<float> &anchor) {
  // width, height, width_center, height_center
  std::vector<float> window(4);
  window[0] = anchor[2] - anchor[0] + 1;
@@ -25,7 +25,7 @@ static std::vector<float> WHCenters(const std::vector<float> &anchor) {
  return window;
 }

-std::vector<std::vector<float>> GenerateAnchors(
+static inline std::vector<std::vector<float>> GenerateAnchors(
    const std::vector<int> &scales,
    const std::vector<float> &ratios,
    const int base_size) {
@@ -65,10 +65,10 @@ std::vector<std::vector<float>> GenerateAnchors(
  return anchors;
 }

-std::vector<int> nms(const float *bboxes_ptr,
-                     const index_t num_bboxes,
-                     const float thresh,
-                     const int post_nms_top_n) {
+static inline std::vector<int> nms(const float *bboxes_ptr,
+                                   const index_t num_bboxes,
+                                   const float thresh,
+                                   const int post_nms_top_n) {
  std::vector<int> keep;
  std::vector<int> suppressed(num_bboxes, 0);


--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -23,18 +23,19 @@ struct CachedInterpolation {
  float lerp;
 };

-inline float CalculateResizeScale(index_t in_size,
-                                  index_t out_size,
-                                  bool align_corners) {
+static inline float CalculateResizeScale(index_t in_size,
+                                         index_t out_size,
+                                         bool align_corners) {
  return (align_corners && out_size > 1)
             ? (in_size - 1) / static_cast<float>(out_size - 1)
             : in_size / static_cast<float>(out_size);
 }

-inline void ComputeInterpolationWeights(const index_t out_size,
-                                        const index_t in_size,
-                                        const float scale,
-                                        CachedInterpolation *interpolation) {
+static inline void ComputeInterpolationWeights(
+    const index_t out_size,
+    const index_t in_size,
+    const float scale,
+    CachedInterpolation *interpolation) {
  interpolation[out_size].lower = 0;
  interpolation[out_size].upper = 0;
  for (index_t i = out_size - 1; i >= 0; --i) {
@@ -45,12 +46,12 @@ inline void ComputeInterpolationWeights(const index_t out_size,
  }
 }

-inline float ComputeLerp(const float top_left,
-                         const float top_right,
-                         const float bottom_left,
-                         const float bottom_right,
-                         const float x_lerp,
-                         const float y_lerp) {
+static inline float ComputeLerp(const float top_left,
+                                const float top_right,
+                                const float bottom_left,
+                                const float bottom_right,
+                                const float x_lerp,
+                                const float y_lerp) {
  const float top = top_left + (top_right - top_left) * x_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
  return top + (bottom - top) * y_lerp;

--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -12,8 +12,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void ReluBenchmark(
+void ReluBenchmark(
    int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();

@@ -51,6 +52,7 @@ static void ReluBenchmark(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
  static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
@@ -72,8 +74,9 @@ BM_RELU(1, 3, 512, 512);
 BM_RELU(1, 32, 112, 112);
 BM_RELU(1, 64, 256, 256);

+namespace {
 template <DeviceType D, typename T>
-static void ReluxBenchmark(
+void ReluxBenchmark(
    int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();

@@ -113,6 +116,7 @@ static void ReluxBenchmark(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE)                              \
  static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
@@ -134,8 +138,9 @@ BM_RELUX(1, 3, 512, 512);
 BM_RELUX(1, 32, 112, 112);
 BM_RELUX(1, 64, 256, 256);

+namespace {
 template <DeviceType D, typename T>
-static void PreluBenchmark(
+void PreluBenchmark(
    int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();

@@ -178,6 +183,7 @@ static void PreluBenchmark(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
  static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
@@ -199,8 +205,9 @@ BM_PRELU(1, 3, 512, 512);
 BM_PRELU(1, 32, 112, 112);
 BM_PRELU(1, 64, 256, 256);

+namespace {
 template <DeviceType D, typename T>
-static void TanhBenchmark(
+void TanhBenchmark(
    int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();

@@ -238,6 +245,7 @@ static void TanhBenchmark(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE)                              \
  static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
@@ -259,8 +267,9 @@ BM_TANH(1, 3, 512, 512);
 BM_TANH(1, 32, 112, 112);
 BM_TANH(1, 64, 256, 256);

+namespace {
 template <DeviceType D, typename T>
-static void SigmoidBenchmark(
+void SigmoidBenchmark(
    int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();

@@ -298,6 +307,7 @@ static void SigmoidBenchmark(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE)                   \
  static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \

--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -11,6 +11,7 @@ namespace test {

 class ActivationOpTest : public OpsTestBase {};

+namespace {
 template <DeviceType D>
 void TestSimpleRelu() {
  OpsTestNet net;
@@ -52,6 +53,7 @@ void TestSimpleRelu() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu<DeviceType::CPU>(); }

@@ -59,6 +61,7 @@ TEST_F(ActivationOpTest, OPENCLSimpleRelu) {
  TestSimpleRelu<DeviceType::OPENCL>();
 }

+namespace {
 template <DeviceType D>
 void TestUnalignedSimpleRelu() {
  OpsTestNet net;
@@ -97,6 +100,7 @@ void TestUnalignedSimpleRelu() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) {
  TestUnalignedSimpleRelu<DeviceType::CPU>();
@@ -106,6 +110,8 @@ TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
  TestUnalignedSimpleRelu<DeviceType::OPENCL>();
 }

+
+namespace {
 template <DeviceType D>
 void TestSimpleRelux() {
  OpsTestNet net;
@@ -149,6 +155,7 @@ void TestSimpleRelux() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }

@@ -156,6 +163,7 @@ TEST_F(ActivationOpTest, OPENCLSimple) {
  TestSimpleRelux<DeviceType::OPENCL>();
 }

+namespace {
 template <DeviceType D>
 void TestSimpleReluRelux() {
  OpsTestNet net;
@@ -199,6 +207,7 @@ void TestSimpleReluRelux() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(ActivationOpTest, CPUSimpleRelux) {
  TestSimpleReluRelux<DeviceType::CPU>();
@@ -208,6 +217,7 @@ TEST_F(ActivationOpTest, OPENCLSimpleRelux) {
  TestSimpleReluRelux<DeviceType::OPENCL>();
 }

+namespace {
 template <DeviceType D>
 void TestSimplePrelu() {
  OpsTestNet net;
@@ -261,6 +271,7 @@ void TestSimplePrelu() {
    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
  }
 }
+}  // namespace

 TEST_F(ActivationOpTest, CPUSimplePrelu) {
  TestSimplePrelu<DeviceType::CPU>();
@@ -274,6 +285,7 @@ TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
  TestSimplePrelu<DeviceType::OPENCL>();
 }

+namespace {
 template <DeviceType D>
 void TestSimpleTanh() {
  OpsTestNet net;
@@ -318,6 +330,7 @@ void TestSimpleTanh() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh<DeviceType::CPU>(); }

@@ -325,6 +338,7 @@ TEST_F(ActivationOpTest, OPENCLSimpleTanh) {
  TestSimpleTanh<DeviceType::OPENCL>();
 }

+namespace {
 template <DeviceType D>
 void TestSimpleSigmoid() {
  OpsTestNet net;
@@ -370,6 +384,7 @@ void TestSimpleSigmoid() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(ActivationOpTest, CPUSimpleSigmoid) {
  TestSimpleSigmoid<DeviceType::CPU>();

--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -12,8 +12,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
+void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
  mace::testing::StopTiming();

  OpsTestNet net;
@@ -57,6 +58,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
    net.Sync();
  }
 }
+}  // namespace

 #define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                       \
  static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(   \

--- a/mace/ops/addn_test.cc
+++ b/mace/ops/addn_test.cc
@@ -11,6 +11,7 @@ namespace test {

 class AddnOpTest : public OpsTestBase {};

+namespace {
 template <DeviceType D>
 void SimpleAdd2() {
  // Construct graph
@@ -32,9 +33,11 @@ void SimpleAdd2() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }

+namespace {
 template <DeviceType D>
 void SimpleAdd3() {
  // Construct graph
@@ -58,9 +61,11 @@ void SimpleAdd3() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }

+namespace {
 template <DeviceType D>
 void RandomTest() {
  testing::internal::LogToStderr();
@@ -116,6 +121,7 @@ void RandomTest() {
    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);
  }
 }
+}  // namespace

 TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }


--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -11,8 +11,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void BatchNorm(
+void BatchNorm(
    int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();

@@ -74,6 +75,7 @@ static void BatchNorm(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE)                  \
  static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \

--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
@@ -11,6 +11,7 @@ namespace test {

 class BatchNormOpTest : public OpsTestBase {};

+namespace {
 template<DeviceType D>
 void Simple() {
  OpsTestNet net;
@@ -71,6 +72,7 @@ void Simple() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
 }
+}  // namespace

 TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }


--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void BMBatchToSpace(
+void BMBatchToSpace(
    int iters, int batch, int channels, int height, int width, int arg) {
  mace::testing::StopTiming();

@@ -38,6 +39,7 @@ static void BMBatchToSpace(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE)             \
  static void                                                              \

--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -11,8 +11,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void BiasAdd(int iters, int batch, int channels, int height, int width) {
+void BiasAdd(int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();

  OpsTestNet net;
@@ -51,6 +52,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
  }
  net.Sync();
 }
+}  // namespace

 #define BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE)                  \
  static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \

--- a/mace/ops/bias_add_test.cc
+++ b/mace/ops/bias_add_test.cc
@@ -11,6 +11,7 @@ namespace test {

 class BiasAddOpTest : public OpsTestBase {};

+namespace {
 template <DeviceType D>
 void BiasAddSimple() {
  OpsTestNet net;
@@ -54,6 +55,7 @@ void BiasAddSimple() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
 }
+}  // namespace

 TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); }


--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -9,6 +9,7 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
 void TestBidirectionTransform(const int type,
                              const std::vector<index_t> &input_shape) {
@@ -40,6 +41,7 @@ void TestBidirectionTransform(const int type,
  ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                      1e-5);
 }
+}  // namespace

 TEST(BufferToImageTest, ArgSmall) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
@@ -112,6 +114,7 @@ TEST(BufferToImageTest, Filter3x3Large) {
                                                      {3, 3, 128, 256});
 }

+namespace {
 template <DeviceType D, typename T>
 void TestDiffTypeBidirectionTransform(const int type,
                                      const std::vector<index_t> &input_shape) {
@@ -142,12 +145,14 @@ void TestDiffTypeBidirectionTransform(const int type,
  ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                          1e-2);
 }
+}  // namespace

 TEST(BufferToImageTest, ArgFloatToHalfSmall) {
  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT,
                                                             {11});
 }

+namespace {
 template <DeviceType D, typename T>
 void TestStringHalfBidirectionTransform(const int type,
                                        const std::vector<index_t> &input_shape,
@@ -182,6 +187,7 @@ void TestStringHalfBidirectionTransform(const int type,
  ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                         1e-2);
 }
+}  // namespace

 TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
  const unsigned char input_data[] = {

--- a/mace/ops/channel_shuffle_benchmark.cc
+++ b/mace/ops/channel_shuffle_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void ChannelShuffle(
+void ChannelShuffle(
    int iters, int batch, int channels, int height, int width, int group) {
  mace::testing::StopTiming();

@@ -48,6 +49,7 @@ static void ChannelShuffle(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE)             \
  static void                                                             \

--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void ConcatHelper(int iters, int concat_dim, int dim1) {
+void ConcatHelper(int iters, int concat_dim, int dim1) {
  mace::testing::StopTiming();

  OpsTestNet net;
@@ -39,6 +40,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
    net.RunOp(D);
  }
 }
+}  // namespace

 #define BM_CONCAT_CPU_MACRO(DIM0, DIM1)                      \
  static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) {     \
@@ -51,11 +53,12 @@ BM_CONCAT_CPU_MACRO(0, 100000);
 BM_CONCAT_CPU_MACRO(1, 1000);
 BM_CONCAT_CPU_MACRO(1, 100000);

+namespace {
 template <typename T>
-static void OpenclConcatHelper(int iters,
-                               const std::vector<index_t> &shape0,
-                               const std::vector<index_t> &shape1,
-                               int concat_dim) {
+void OpenclConcatHelper(int iters,
+                        const std::vector<index_t> &shape0,
+                        const std::vector<index_t> &shape1,
+                        int concat_dim) {
  mace::testing::StopTiming();

  OpsTestNet net;
@@ -91,6 +94,7 @@ static void OpenclConcatHelper(int iters,
    net.RunOp(DeviceType::OPENCL);
  }
 }
+}  // namespace

 #define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE)                           \
  static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \

--- a/mace/ops/concat_test.cc
+++ b/mace/ops/concat_test.cc
@@ -144,6 +144,7 @@ TEST_F(ConcatOpTest, CPURandom) {
  }
 }

+namespace {
 template <typename T>
 void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
                      const int axis) {
@@ -208,6 +209,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
    k++;
  }
 }
+}  // namespace

 TEST_F(ConcatOpTest, OPENCLAligned) {
  OpenclRandomTest<float>({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3);

--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -13,18 +13,19 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void Conv2d(int iters,
-                   int batch,
-                   int channels,
-                   int height,
-                   int width,
-                   int kernel_h,
-                   int kernel_w,
-                   int stride,
-                   int dilation,
-                   Padding padding,
-                   int output_channels) {
+void Conv2d(int iters,
+            int batch,
+            int channels,
+            int height,
+            int width,
+            int kernel_h,
+            int kernel_w,
+            int stride,
+            int dilation,
+            Padding padding,
+            int output_channels) {
  mace::testing::StopTiming();

  OpsTestNet net;
@@ -88,6 +89,7 @@ static void Conv2d(int iters,
    net.Sync();
  }
 }
+}  // namespace

 // In common network, there are usually more than 1 layers, this is used to
 // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is

--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -14,6 +14,7 @@ namespace test {

 class Conv2dOpTest : public OpsTestBase {};

+namespace {
 template<DeviceType D, typename T>
 void TestNHWCSimple3x3VALID() {
  OpsTestNet net;
@@ -129,6 +130,7 @@ void TestNHWCSimple3x3SAME() {

  ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+}  // namespace

 TEST_F(Conv2dOpTest, CPUSimple) {
  TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
@@ -140,6 +142,7 @@ TEST_F(Conv2dOpTest, OPENCLSimple) {
  TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
 }

+namespace {
 template<DeviceType D, typename T>
 void TestNHWCSimple3x3WithoutBias() {
  OpsTestNet net;
@@ -193,6 +196,7 @@ void TestNHWCSimple3x3WithoutBias() {

  ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+}  // namespace

 TEST_F(Conv2dOpTest, CPUWithoutBias) {
  TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
@@ -202,8 +206,9 @@ TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
  TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
 }

+namespace {
 template<DeviceType D, typename T>
-static void TestNHWCCombined3x3() {
+void TestNHWCCombined3x3() {
  // Construct graph
  OpsTestNet net;

@@ -263,6 +268,7 @@ static void TestNHWCCombined3x3() {
                   9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
  ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+}  // namespace

 TEST_F(Conv2dOpTest, CPUStride2) {
  TestNHWCCombined3x3<DeviceType::CPU, float>();
@@ -272,6 +278,7 @@ TEST_F(Conv2dOpTest, OPENCLStride2) {
  TestNHWCCombined3x3<DeviceType::OPENCL, float>();
 }

+namespace {
 template<DeviceType D>
 void TestConv1x1() {
  // Construct graph
@@ -340,14 +347,16 @@ void TestConv1x1() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+}  // namespace

 TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }

 TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }

+namespace {
 template<DeviceType D, typename T>
-static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
-                                  const int stride) {
+void TestComplexConvNxNS12(const std::vector<index_t> &shape,
+                           const int stride) {
  testing::internal::LogToStderr();
  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                  Padding type) {
@@ -414,6 +423,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
    func(kernel_size, kernel_size, stride, stride, SAME);
  }
 }
+}  // namespace

 TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 1);
@@ -430,10 +440,11 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17}, 4);
 }

+namespace {
 template<DeviceType D>
-static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
-                                      const std::vector<index_t> &filter_shape,
-                                      const std::vector<int> &dilations) {
+void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
+                               const std::vector<index_t> &filter_shape,
+                               const std::vector<int> &dilations) {
  testing::internal::LogToStderr();
  srand(time(NULL));

@@ -515,6 +526,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
    func(2, 2, SAME);
  }
 }
+}  // namespace

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 1, 32, 64},
@@ -566,9 +578,10 @@ TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
                                                {4, 4});
 }

+namespace {
 template<DeviceType D, typename T>
-static void TestDilationConvNxN(const std::vector<index_t> &shape,
-                                const int dilation_rate) {
+void TestDilationConvNxN(const std::vector<index_t> &shape,
+                         const int dilation_rate) {
  testing::internal::LogToStderr();
  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                  Padding type) {
@@ -638,6 +651,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape,
    }
  }
 }
+}  // namespace

 TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
  TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, 2);
@@ -651,9 +665,10 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
  TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 4);
 }

+namespace {
 template<DeviceType D, typename T>
-static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
-                                    const std::vector<int> &paddings) {
+void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
+                             const std::vector<int> &paddings) {
  testing::internal::LogToStderr();
  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) {
    srand(time(NULL));
@@ -719,6 +734,7 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
    }
  }
 }
+}  // namespace

 TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
  TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, {1, 1});

--- a/mace/ops/cwise_benchmark.cc
+++ b/mace/ops/cwise_benchmark.cc
@@ -11,9 +11,10 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void CWise(int iters, int batch, int channels,
-                       int height, int width, float x, int type) {
+void CWise(int iters, int batch, int channels,
+           int height, int width, float x, int type) {
  mace::testing::StopTiming();

  OpsTestNet net;
@@ -51,6 +52,7 @@ static void CWise(int iters, int batch, int channels,
  }
  net.Sync();
 }
+}  // namespace

 #define BM_CWISE_MACRO(N, C, H, W, X, G, TYPE, DEVICE)              \
  static void                                                             \

--- a/mace/ops/cwise_test.cc
+++ b/mace/ops/cwise_test.cc
@@ -12,7 +12,7 @@ namespace test {

 class CWiseOpTest : public OpsTestBase {};

-
+namespace {
 template <DeviceType D>
 void Simple(const kernels::CWiseType type,
            const std::vector<index_t> &shape,
@@ -56,6 +56,7 @@ void Simple(const kernels::CWiseType type,

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3);
 }
+}  // namespace

 TEST_F(CWiseOpTest, CPUSimple) {
  Simple<DeviceType::CPU>(kernels::CWiseType::MUL, {1, 1, 2, 3},
@@ -97,6 +98,7 @@ TEST_F(CWiseOpTest, GPUSimple) {
                    {1, -2, -0.0001, 4, 5, 6}, 2.0, {1, 2, 0.0001, 4, 5, 6});
 }

+namespace {
 template <DeviceType D, typename T>
 void RandomTest(const kernels::CWiseType type,
                const std::vector<index_t> &shape) {
@@ -144,6 +146,7 @@ void RandomTest(const kernels::CWiseType type,
                            *net.GetOutput("OPENCLOutput"), 1e-1);
  }
 }
+}  // namespace

 TEST_F(CWiseOpTest, OPENCLRandomFloat) {
  RandomTest<DeviceType::OPENCL, float>(kernels::CWiseType::MUL,

--- a/mace/ops/depth_to_space_benchmark.cc
+++ b/mace/ops/depth_to_space_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void DepthToSpace(
+void DepthToSpace(
    int iters, int batch, int channels, int height, int width, int block_size) {
  mace::testing::StopTiming();

@@ -48,6 +49,7 @@ static void DepthToSpace(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE)             \
  static void                                                            \

--- a/mace/ops/depth_to_space_test.cc
+++ b/mace/ops/depth_to_space_test.cc
@@ -11,6 +11,7 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D>
 void RunDepthToSpace(const bool d2s,
                     const std::vector<index_t> &input_shape,
@@ -49,6 +50,7 @@ void RunDepthToSpace(const bool d2s,
  auto expected = CreateTensor<float>(expected_shape, expected_data);
  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+}  // namespace


 class SpaceToDepthOpTest : public OpsTestBase {};
@@ -149,6 +151,7 @@ TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
 }


+namespace {
 template <DeviceType D, typename T>
 void RandomTest(const bool d2s, const int block_size,
                const std::vector<index_t> &shape) {
@@ -197,6 +200,7 @@ void RandomTest(const bool d2s, const int block_size,
                            *net.GetOutput("OPENCLOutput"), 1e-1);
  }
 }
+}  // namespace

 TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
  RandomTest<DeviceType::OPENCL, float>(true, 2, {1, 192, 192, 128});

--- a/mace/ops/depthwise_conv2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -13,17 +13,18 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void DepthwiseConv2d(int iters,
-                            int batch,
-                            int input_channels,
-                            int height,
-                            int width,
-                            int kernel_h,
-                            int kernel_w,
-                            int stride,
-                            Padding padding,
-                            int multiplier) {
+void DepthwiseConv2d(int iters,
+                     int batch,
+                     int input_channels,
+                     int height,
+                     int width,
+                     int kernel_h,
+                     int kernel_w,
+                     int stride,
+                     Padding padding,
+                     int multiplier) {
  mace::testing::StopTiming();

  OpsTestNet net;
@@ -87,6 +88,7 @@ static void DepthwiseConv2d(int iters,
    net.Sync();
  }
 }
+}  // namespace

 #define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE,    \
                                   DEVICE)                                    \

--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -11,6 +11,7 @@ namespace test {

 class DepthwiseConv2dOpTest : public OpsTestBase {};

+namespace {
 template<DeviceType D, typename T>
 void SimpleValidTest() {
  testing::internal::LogToStderr();
@@ -69,6 +70,7 @@ void SimpleValidTest() {

  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(DepthwiseConv2dOpTest, SimpleCPU) {
  SimpleValidTest<DeviceType::CPU, float>();
@@ -82,6 +84,7 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
  SimpleValidTest<DeviceType::OPENCL, half>();
 }

+namespace {
 template<DeviceType D, typename T>
 void ComplexValidTest() {
  testing::internal::LogToStderr();
@@ -188,6 +191,7 @@ void ComplexValidTest() {

  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.2);
 }
+}  // namespace

 TEST_F(DepthwiseConv2dOpTest, ComplexCPU) {
  ComplexValidTest<DeviceType::CPU, float>();
@@ -201,6 +205,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
  ComplexValidTest<DeviceType::OPENCL, half>();
 }

+namespace {
 template<DeviceType D, typename T>
 void TestNxNS12(const index_t height, const index_t width) {
  testing::internal::LogToStderr();
@@ -287,6 +292,7 @@ void TestNxNS12(const index_t height, const index_t width) {
    }
  }
 }
+}  // namespace

 TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
  TestNxNS12<DeviceType::OPENCL, float>(4, 4);
@@ -314,6 +320,7 @@ TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) {
  TestNxNS12<DeviceType::OPENCL, half>(107, 113);
 }

+namespace {
 void TestNEONNxNS12(const index_t height,
                    const index_t width,
                    const index_t input_channels,
@@ -385,6 +392,7 @@ void TestNEONNxNS12(const index_t height,
    }
  }
 }
+}  // namespace

 TEST_F(DepthwiseConv2dOpTest, NEONTest) {
  TestNEONNxNS12(4, 4, 32, 1);

--- a/mace/ops/eltwise_benchmark.cc
+++ b/mace/ops/eltwise_benchmark.cc
@@ -13,8 +13,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void EltwiseBenchmark(
+void EltwiseBenchmark(
    int iters, kernels::EltwiseType type, int n, int h, int w, int c) {
  mace::testing::StopTiming();

@@ -59,6 +60,7 @@ static void EltwiseBenchmark(
    net.Sync();
  }
 }
+}  // namespace

 #define BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE)             \
  static void                                                            \

--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -12,6 +12,7 @@ namespace test {

 class EltwiseOpTest : public OpsTestBase {};

+namespace {
 template <DeviceType D>
 void Simple(const kernels::EltwiseType type,
            const std::vector<index_t> &shape,
@@ -61,6 +62,7 @@ void Simple(const kernels::EltwiseType type,

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3);
 }
+}  // namespace

 TEST_F(EltwiseOpTest, CPUSimple) {
  Simple<DeviceType::CPU>(kernels::EltwiseType::PROD, {1, 1, 2, 3},
@@ -98,6 +100,7 @@ TEST_F(EltwiseOpTest, GPUSimple) {
                             {1, 1, 3, 3, 5, 6});
 }

+namespace {
 template <DeviceType D, typename T>
 void RandomTest(const kernels::EltwiseType type,
                const std::vector<index_t> &shape) {
@@ -149,6 +152,7 @@ void RandomTest(const kernels::EltwiseType type,
                            *net.GetOutput("OPENCLOutput"), 1e-1);
  }
 }
+}  // namespace

 TEST_F(EltwiseOpTest, OPENCLRandomFloat) {
  RandomTest<DeviceType::OPENCL, float>(kernels::EltwiseType::PROD,

--- a/mace/ops/folded_batch_norm_test.cc
+++ b/mace/ops/folded_batch_norm_test.cc
@@ -11,6 +11,7 @@ namespace test {

 class FoldedBatchNormOpTest : public OpsTestBase {};

+namespace {
 void CalculateScaleOffset(const std::vector<float> &gamma,
                          const std::vector<float> &beta,
                          const std::vector<float> &mean,
@@ -21,7 +22,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
  size_t size = gamma.size();
  for (int i = 0; i < size; ++i) {
    (*scale)[i] = gamma[i] / std::sqrt(var[i] + epsilon);
-    (*offset)[i] = (*offset)[i] - mean[i] * (*scale)[i];
+    (*offset)[i] = beta[i] - mean[i] * (*scale)[i];
  }
 }

@@ -76,6 +77,7 @@ void Simple() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
 }
+}  // namespace

 TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }


--- a/mace/ops/fully_connected_benchmark.cc
+++ b/mace/ops/fully_connected_benchmark.cc
@@ -12,8 +12,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void FCBenchmark(
+void FCBenchmark(
    int iters, int batch, int height, int width, int channel, int out_channel) {
  mace::testing::StopTiming();

@@ -64,6 +65,7 @@ static void FCBenchmark(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE)                     \
  static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \

--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -13,6 +13,7 @@ namespace test {

 class FullyConnectedOpTest : public OpsTestBase {};

+namespace {
 template<DeviceType D>
 void Simple(const std::vector<index_t> &input_shape,
            const std::vector<float> &input_value,
@@ -66,6 +67,7 @@ void Simple(const std::vector<index_t> &input_shape,

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(FullyConnectedOpTest, SimpleCPU) {
  Simple<DeviceType::CPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
@@ -107,6 +109,7 @@ TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
                             {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
 }

+namespace {
 template<typename T>
 void Complex(const index_t batch,
             const index_t height,
@@ -166,6 +169,7 @@ void Complex(const index_t batch,
    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-3);
  }
 }
+}  // namespace

 TEST_F(FullyConnectedOpTest, OPENCLAlignedWithoutBatch) {
  Complex<float>(1, 16, 16, 32, 16);
@@ -189,6 +193,7 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) {
  Complex<half>(31, 21, 11, 23, 103);
 }

+namespace {
 template<typename T>
 void TestWXFormat(const index_t batch,
                  const index_t height,
@@ -247,6 +252,7 @@ void TestWXFormat(const index_t batch,
    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
  }
 }
+}  // namespace

 TEST_F(FullyConnectedOpTest, OPENCLWidthFormatAligned) {
  TestWXFormat<float>(1, 7, 7, 32, 16);
@@ -266,11 +272,12 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfWidthFormatAligned) {
  TestWXFormat<half>(1, 16, 32, 32, 32);
 }

+namespace {
 void FullyConnectedTestNEON(const index_t batch,
-              const index_t height,
-              const index_t width,
-              const index_t channels,
-              const index_t out_channel) {
+                            const index_t height,
+                            const index_t width,
+                            const index_t channels,
+                            const index_t out_channel) {
  srand(time(NULL));

  // Construct graph
@@ -310,6 +317,7 @@ void FullyConnectedTestNEON(const index_t batch,
                          *net.GetOutput("OutputNeon"),
                          0.01);
 }
+}  // namespace

 TEST_F(FullyConnectedOpTest, TestNEON) {
  FullyConnectedTestNEON(1, 7, 7, 32, 16);

--- a/mace/ops/fused_conv_2d_test.cc
+++ b/mace/ops/fused_conv_2d_test.cc
@@ -13,6 +13,7 @@ namespace test {

 class FusedConv2dOpTest : public OpsTestBase {};

+namespace {
 template<DeviceType D, typename T>
 void TestNHWCSimple3x3VALID() {
  OpsTestNet net;
@@ -21,7 +22,7 @@ void TestNHWCSimple3x3VALID() {
    "Input", {1, 3, 3, 2},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
  net.AddInputFromArray<D, T>(
-    "Filter", {3, 3, 2, 1},
+    "Filter", {3, 3, 1, 2},
    {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
     1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
  net.AddInputFromArray<D, T>("Bias", {1}, {-0.1f});
@@ -42,6 +43,7 @@ void TestNHWCSimple3x3VALID() {
      .AddIntArg("padding", Padding::VALID)
      .AddIntsArg("dilations", {1, 1})
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
      .Finalize(net.NewOperatorDef());

    net.RunOp(D);
@@ -60,6 +62,7 @@ void TestNHWCSimple3x3VALID() {
      .AddIntArg("padding", Padding::VALID)
      .AddIntsArg("dilations", {1, 1})
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
      .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
@@ -78,7 +81,7 @@ void TestNHWCSimple3x3SAME() {
    "Input", {1, 3, 3, 2},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
  net.AddInputFromArray<D, T>(
-    "Filter", {3, 3, 2, 1},
+    "Filter", {3, 3, 1, 2},
    {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
     1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
  net.AddInputFromArray<D, T>("Bias", {1}, {-0.1f});
@@ -99,6 +102,7 @@ void TestNHWCSimple3x3SAME() {
      .AddIntArg("padding", Padding::SAME)
      .AddIntsArg("dilations", {1, 1})
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
      .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
@@ -117,6 +121,7 @@ void TestNHWCSimple3x3SAME() {
      .AddIntArg("padding", Padding::SAME)
      .AddIntsArg("dilations", {1, 1})
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
      .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
@@ -127,6 +132,7 @@ void TestNHWCSimple3x3SAME() {

  ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+}  // namespace

 TEST_F(FusedConv2dOpTest, CPUSimple) {
  TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
@@ -138,6 +144,7 @@ TEST_F(FusedConv2dOpTest, OPENCLSimple) {
  TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
 }

+namespace {
 template<DeviceType D, typename T>
 void TestNHWCSimple3x3WithoutBias() {
  OpsTestNet net;
@@ -147,7 +154,7 @@ void TestNHWCSimple3x3WithoutBias() {
    "Input", {1, 3, 3, 2},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
  net.AddInputFromArray<D, T>(
-    "Filter", {3, 3, 2, 1},
+    "Filter", {3, 3, 1, 2},
    {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
     1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});

@@ -165,6 +172,7 @@ void TestNHWCSimple3x3WithoutBias() {
      .AddIntArg("padding", Padding::VALID)
      .AddIntsArg("dilations", {1, 1})
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
      .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
@@ -180,6 +188,7 @@ void TestNHWCSimple3x3WithoutBias() {
      .AddIntArg("padding", Padding::VALID)
      .AddIntsArg("dilations", {1, 1})
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
      .Finalize(net.NewOperatorDef());

    // Run
@@ -191,6 +200,7 @@ void TestNHWCSimple3x3WithoutBias() {

  ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+}  // namespace

 TEST_F(FusedConv2dOpTest, CPUWithoutBias) {
  TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
@@ -200,6 +210,7 @@ TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) {
  TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
 }

+namespace {
 template<DeviceType D>
 void TestConv1x1() {
  // Construct graph
@@ -216,8 +227,8 @@ void TestConv1x1() {
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
  net.AddInputFromArray<D, float>(
-    "Filter", {1, 1, 5, 2},
-    {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
+    "Filter", {1, 1, 2, 5},
+    {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
  net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});

  if (D == DeviceType::OPENCL) {
@@ -268,13 +279,15 @@ void TestConv1x1() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+}  // namespace

 TEST_F(FusedConv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }

 TEST_F(FusedConv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }

+namespace {
 template<DeviceType D, typename T>
-static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
+void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
  testing::internal::LogToStderr();
  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                  Padding type) {
@@ -343,13 +356,15 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
    }
  }
 }
+}  // namespace

 TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) {
  TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
 }

+namespace {
 template<DeviceType D>
-static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
+void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
  testing::internal::LogToStderr();
  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                  Padding type) {
@@ -428,14 +443,16 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
    }
  }
 }
+}  // namespace

 TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64});
 }

+namespace {
 template<DeviceType D, typename T>
-static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
-                                  const std::vector<index_t> &filter_shape) {
+void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
+                           const std::vector<index_t> &filter_shape) {
  testing::internal::LogToStderr();
  auto func = [&](int stride_h, int stride_w, Padding type) {
    srand(time(NULL));
@@ -444,10 +461,10 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
    index_t batch = 1;
    index_t height = image_shape[0];
    index_t width = image_shape[1];
-    index_t input_channels = filter_shape[2];
-    index_t output_channels = filter_shape[3];
    index_t kernel_h = filter_shape[0];
    index_t kernel_w = filter_shape[1];
+    index_t output_channels = filter_shape[2];
+    index_t input_channels = filter_shape[3];
    // Construct graph
    OpsTestNet net;
    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
@@ -504,18 +521,20 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
    func(stride, stride, SAME);
  }
 }
+}  // namespace

 TEST_F(FusedConv2dOpTest, OPENCL7X7ConvNxNS12) {
-  TestGeneralConvNxNS12<DeviceType::OPENCL, float>({32, 32}, {7, 7, 3, 64});
+  TestGeneralConvNxNS12<DeviceType::OPENCL, float>({32, 32}, {7, 7, 64, 3});
 }

 TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
-  TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 32, 64});
+  TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 64, 32});
 }

+namespace {
 template<DeviceType D, typename T>
-static void TestAtrousConvNxN(const std::vector<index_t> &shape,
-                              const int dilation) {
+void TestAtrousConvNxN(const std::vector<index_t> &shape,
+                       const int dilation) {
  testing::internal::LogToStderr();
  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                  Padding type) {
@@ -525,8 +544,8 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape,
    index_t batch = 1;
    index_t height = shape[0];
    index_t width = shape[1];
-    index_t input_channels = shape[2];
-    index_t output_channels = shape[3];
+    index_t output_channels = shape[2];
+    index_t input_channels = shape[3];
    // Construct graph
    OpsTestNet net;
    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
@@ -585,6 +604,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape,
    }
  }
 }
+}  // namespace

 TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN2) {
  TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 2);
@@ -598,10 +618,11 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
  TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
 }

+namespace {
 template<DeviceType D>
-static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
-                                      const std::vector<index_t> &filter_shape,
-                                      const std::vector<int> &dilations) {
+void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
+                               const std::vector<index_t> &filter_shape,
+                               const std::vector<int> &dilations) {
  testing::internal::LogToStderr();
  auto func = [&](int stride_h, int stride_w, Padding type) {
    srand(time(NULL));
@@ -610,10 +631,10 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
    index_t batch = 1;
    index_t height = image_shape[0];
    index_t width = image_shape[1];
-    index_t input_channels = filter_shape[2];
-    index_t output_channels = filter_shape[3];
    index_t kernel_h = filter_shape[0];
    index_t kernel_w = filter_shape[1];
+    index_t output_channels = filter_shape[2];
+    index_t input_channels = filter_shape[3];
    // Construct graph
    OpsTestNet net;
    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
@@ -668,9 +689,10 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
  func(1, 1, VALID);
  func(1, 1, SAME);
 }
+}  // namespace

 TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
-  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 16},
+  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 16, 3},
                                                {2, 2});
 }

@@ -679,7 +701,8 @@ TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
                                                {2, 2});
 }

-static void TestNEONGeneralConvNxNS12(
+namespace {
+void TestNEONGeneralConvNxNS12(
  const std::vector<index_t> &image_shape,
  const std::vector<index_t> &filter_shape) {
  testing::internal::LogToStderr();
@@ -690,10 +713,10 @@ static void TestNEONGeneralConvNxNS12(
    index_t batch = 1;
    index_t height = image_shape[0];
    index_t width = image_shape[1];
-    index_t input_channels = filter_shape[2];
-    index_t output_channels = filter_shape[3];
    index_t kernel_h = filter_shape[0];
    index_t kernel_w = filter_shape[1];
+    index_t output_channels = filter_shape[2];
+    index_t input_channels = filter_shape[3];
    // Construct graph
    OpsTestNet net;
    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
@@ -748,9 +771,10 @@ static void TestNEONGeneralConvNxNS12(
    func(stride, stride, SAME);
  }
 }
+}  // namespace

 TEST_F(FusedConv2dOpTest, NEONTest) {
-  TestNEONGeneralConvNxNS12({32, 32}, {7, 7, 3, 64});
+  TestNEONGeneralConvNxNS12({32, 32}, {7, 7, 64, 3});
 }
 }  // namespace test
 }  // namespace ops

--- a/mace/ops/global_avg_pooling_benchmark.cc
+++ b/mace/ops/global_avg_pooling_benchmark.cc
@@ -11,8 +11,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D>
-static void GlobalAvgPooling(
+void GlobalAvgPooling(
    int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();

@@ -36,6 +37,7 @@ static void GlobalAvgPooling(
    net.RunOp(D);
  }
 }
+}  // namespace

 #define BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, DEVICE)               \
  static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE( \

--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -12,8 +12,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void MatMulBenchmark(
+void MatMulBenchmark(
    int iters, int batch, int height, int channels, int out_width) {
  mace::testing::StopTiming();

@@ -54,6 +55,7 @@ static void MatMulBenchmark(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                              \
  static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \

--- a/mace/ops/matmul_test.cc
+++ b/mace/ops/matmul_test.cc
@@ -13,6 +13,7 @@ namespace test {

 class MatMulOpTest : public OpsTestBase {};

+namespace {
 template <DeviceType D>
 void Simple(const std::vector<index_t> &A_shape,
            const std::vector<float> &A_value,
@@ -58,6 +59,7 @@ void Simple(const std::vector<index_t> &A_shape,

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace

 TEST_F(MatMulOpTest, SimpleCPU) {
  Simple<DeviceType::CPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
@@ -98,6 +100,7 @@ TEST_F(MatMulOpTest, SimpleGPUWithBatch) {
                          {2, 2, 2, 1}, {22, 28, 49, 64, 22, 28, 49, 64});
 }

+namespace {
 template <typename T>
 void Complex(const index_t batch,
             const index_t height,
@@ -150,6 +153,7 @@ void Complex(const index_t batch,
    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4);
  }
 }
+}  // namespace

 TEST_F(MatMulOpTest, OPENCLAlignedWithoutBatch) {
  Complex<float>(1, 64, 128, 32);

--- a/mace/ops/pad_benchmark.cc
+++ b/mace/ops/pad_benchmark.cc
@@ -11,9 +11,10 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void Pad(int iters, int batch, int height,
-                int width, int channels, int pad) {
+void Pad(int iters, int batch, int height,
+         int width, int channels, int pad) {
  mace::testing::StopTiming();

  OpsTestNet net;
@@ -52,6 +53,7 @@ static void Pad(int iters, int batch, int height,
  }
  net.Sync();
 }
+}  // namespace

 #define BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE)                  \
  static void BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \

--- a/mace/ops/pad_test.cc
+++ b/mace/ops/pad_test.cc
@@ -11,6 +11,7 @@ namespace test {

 class PadTest : public OpsTestBase {};

+namespace {
 template <DeviceType D>
 void Simple() {
  // Construct graph
@@ -57,6 +58,7 @@ void Simple() {
                                      });
  ExpectTensorNear<float>(*expected, *output, 1e-5);
 }
+}  // namespace

 TEST_F(PadTest, SimpleCPU) {
  Simple<DeviceType::CPU>();
@@ -94,6 +96,7 @@ TEST_F(PadTest, ComplexCPU) {
  ExpectTensorNear<float>(*expected, *output, 1e-5);
 }

+namespace {
 template <typename T>
 void Complex(const std::vector<index_t> &input_shape,
             const std::vector<int> &paddings) {
@@ -139,6 +142,7 @@ void Complex(const std::vector<index_t> &input_shape,
    ExpectTensorNear<float>(expected, *output, 1e-5);
  }
 }
+}  // namespace

 TEST_F(PadTest, ComplexFloat) {
  Complex<float>({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0});

--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -12,16 +12,17 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D>
-static void Pooling(int iters,
-                    int batch,
-                    int channels,
-                    int height,
-                    int width,
-                    int kernel,
-                    int stride,
-                    Padding padding,
-                    PoolingType pooling_type) {
+void Pooling(int iters,
+             int batch,
+             int channels,
+             int height,
+             int width,
+             int kernel,
+             int stride,
+             Padding padding,
+             PoolingType pooling_type) {
  mace::testing::StopTiming();

  OpsTestNet net;
@@ -49,6 +50,7 @@ static void Pooling(int iters,
    net.RunOp(D);
  }
 }
+}  // namespace

 #define BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE)          \
  static void                                                             \

--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -123,8 +123,9 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }

+namespace {
 template<DeviceType D>
-static void SimpleMaxPooling3S2() {
+void SimpleMaxPooling3S2() {
  // Construct graph
  OpsTestNet net;

@@ -168,6 +169,7 @@ static void SimpleMaxPooling3S2() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+}  // namespace

 TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2<CPU>(); }

@@ -175,10 +177,11 @@ TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
  SimpleMaxPooling3S2<OPENCL>();
 }

+namespace {
 template<DeviceType D, typename T>
-static void MaxPooling3S2(const std::vector<index_t> &input_shape,
-                          const std::vector<int> strides,
-                          Padding padding) {
+void MaxPooling3S2(const std::vector<index_t> &input_shape,
+                   const std::vector<int> strides,
+                   Padding padding) {
  // Construct graph
  OpsTestNet net;
  OpDefBuilder("Pooling", "PoolingTest")
@@ -218,6 +221,7 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape,

  ExpectTensorNear<T>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
 }
+}  // namespace

 // TODO(chenghui) : there is a bug.
 // TEST_F(PoolingOpTest, NEONAlignedMaxPooling3S2) {
@@ -275,8 +279,9 @@ TEST_F(PoolingOpTest, AVG_VALID) {
  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }

+namespace {
 template<DeviceType D>
-static void SimpleAvgPoolingTest() {
+void SimpleAvgPoolingTest() {
  // Construct graph
  OpsTestNet net;

@@ -306,16 +311,18 @@ static void SimpleAvgPoolingTest() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+}  // namespace

 TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
  SimpleAvgPoolingTest<OPENCL>();
 }

+namespace {
 template<DeviceType D, typename T>
-static void AvgPoolingTest(const std::vector<index_t> &shape,
-                           const std::vector<int> &kernels,
-                           const std::vector<int> &strides,
-                           Padding padding) {
+void AvgPoolingTest(const std::vector<index_t> &shape,
+                    const std::vector<int> &kernels,
+                    const std::vector<int> &strides,
+                    Padding padding) {
  // Construct graph
  OpsTestNet net;
  OpDefBuilder("Pooling", "PoolingTest")
@@ -354,6 +361,7 @@ static void AvgPoolingTest(const std::vector<index_t> &shape,

  ExpectTensorNear<float, T>(expected, *net.GetOutput("OPENCLOutput"), 0.01);
 }
+}  // namespace

 TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
  AvgPoolingTest<OPENCL, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
@@ -396,11 +404,12 @@ TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
                                Padding::SAME);
 }

-static void AvgPoolingNEONTest(const std::vector<index_t> &shape,
-                               const std::vector<int> &kernels,
-                               const std::vector<int> &strides,
-                               Padding padding,
-                               PoolingType pooling_type) {
+namespace {
+void AvgPoolingNEONTest(const std::vector<index_t> &shape,
+                        const std::vector<int> &kernels,
+                        const std::vector<int> &strides,
+                        Padding padding,
+                        PoolingType pooling_type) {
  // Construct graph
  OpsTestNet net;
  OpDefBuilder("Pooling", "PoolingTest")
@@ -441,6 +450,7 @@ static void AvgPoolingNEONTest(const std::vector<index_t> &shape,
                          *net.GetOutput("OutputNeon"),
                          0.01);
 }
+}  // namespace

 TEST_F(PoolingOpTest, NEONTest) {
  AvgPoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},

--- a/mace/ops/reorganize_test.cc
+++ b/mace/ops/reorganize_test.cc
@@ -12,6 +12,7 @@ namespace test {

 class ReOrganizeTest : public OpsTestBase {};

+namespace {
 void TestReOrganize(const std::vector<index_t> &input_shape,
                    const std::vector<float> &input_data,
                    const std::vector<index_t> &output_shape,
@@ -69,6 +70,7 @@ void TestReOrganize(const std::vector<index_t> &input_shape,
    ASSERT_EQ(input_data[i], output_ptr[i]) << "With Index " << i;
  }
 }
+}  // namespace

 TEST_F(ReOrganizeTest, Simple) {
  TestReOrganize({1, 1, 4, 6},

--- a/mace/ops/reshape_test.cc
+++ b/mace/ops/reshape_test.cc
@@ -12,6 +12,7 @@ namespace test {

 class ReshapeTest : public OpsTestBase {};

+namespace {
 void TestReshape(const std::vector<index_t> &org_shape,
                 const std::vector<int> &output_shape,
                 const std::vector<index_t> &res_shape) {
@@ -41,6 +42,7 @@ void TestReshape(const std::vector<index_t> &org_shape,
    ASSERT_EQ(input_ptr[i], output_ptr[i]);
  }
 }
+}  // namespace

 TEST_F(ReshapeTest, Simple) {
  TestReshape({1, 2, 3, 4}, {1, 2, -1, 4}, {1, 2, 3, 4});

--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -11,14 +11,15 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void ResizeBilinearBenchmark(int iters,
-                                    int batch,
-                                    int channels,
-                                    int input_height,
-                                    int input_width,
-                                    int output_height,
-                                    int output_width) {
+void ResizeBilinearBenchmark(int iters,
+                             int batch,
+                             int channels,
+                             int input_height,
+                             int input_width,
+                             int output_height,
+                             int output_width) {
  mace::testing::StopTiming();

  OpsTestNet net;
@@ -59,6 +60,7 @@ static void ResizeBilinearBenchmark(int iters,
  }
  net.Sync();
 }
+}  // namespace

 #define BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE)        \
  static void                                                               \

--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -63,6 +63,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }

+namespace {
 template <DeviceType D>
 void TestRandomResizeBilinear() {
  testing::internal::LogToStderr();
@@ -115,6 +116,7 @@ void TestRandomResizeBilinear() {
    ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.001);
  }
 }
+}  // namespace

 /*
 TEST_F(ResizeBilinearTest, NEONRandomResizeBilinear) {

--- a/mace/ops/slice_benchmark.cc
+++ b/mace/ops/slice_benchmark.cc
@@ -10,10 +10,11 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template<DeviceType D, typename T>
-static void BMSliceHelper(int iters,
-                          const std::vector<index_t> &input_shape,
-                          const index_t num_outputs) {
+void BMSliceHelper(int iters,
+                   const std::vector<index_t> &input_shape,
+                   const index_t num_outputs) {
  mace::testing::StopTiming();

  // Construct graph
@@ -60,6 +61,7 @@ static void BMSliceHelper(int iters,
    net.Sync();
  }
 }
+}  // namespace

 #define BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE)                         \
  static void                                                                \

--- a/mace/ops/slice_test.cc
+++ b/mace/ops/slice_test.cc
@@ -15,6 +15,7 @@ namespace test {

 class SliceOpTest : public OpsTestBase {};

+namespace {
 template<DeviceType D, typename T>
 void RandomTest(const int num_outputs, const int axis) {
  static unsigned int seed = time(NULL);
@@ -104,6 +105,7 @@ void RandomTest(const int num_outputs, const int axis) {
    }
  }
 }
+}  // namespace

 TEST_F(SliceOpTest, CPU) {
  RandomTest<DeviceType::CPU, float>(2, 3);

--- a/mace/ops/softmax_benchmark.cc
+++ b/mace/ops/softmax_benchmark.cc
@@ -12,8 +12,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void SoftmaxBenchmark(
+void SoftmaxBenchmark(
    int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();

@@ -49,6 +50,7 @@ static void SoftmaxBenchmark(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                   \
  static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \

--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
@@ -11,6 +11,7 @@ namespace test {

 class SoftmaxOpTest : public OpsTestBase {};

+namespace {
 template<DeviceType D>
 void Simple() {
  // Construct graph
@@ -50,10 +51,12 @@ void Simple() {

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-7);
 }
+}  // namespace

 TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
 TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::OPENCL>(); }

+namespace {
 template<DeviceType D>
 void Complex(const std::vector<index_t> &logits_shape) {
  // Construct graph
@@ -88,6 +91,7 @@ void Complex(const std::vector<index_t> &logits_shape) {

  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
 }
+}  // namespace

 TEST_F(SoftmaxOpTest, OPENCLAligned) {
  Complex<DeviceType::OPENCL>({1, 256, 256, 3});
@@ -104,6 +108,7 @@ TEST_F(SoftmaxOpTest, OPENCLUnAligned) {
  Complex<DeviceType::OPENCL>({5, 211, 107, 1});
 }

+namespace {
 void SoftMaxNEONTest(const std::vector<index_t> &logits_shape) {
  // Construct graph
  OpsTestNet net;
@@ -135,6 +140,7 @@ void SoftMaxNEONTest(const std::vector<index_t> &logits_shape) {
                          *net.GetOutput("OutputNeon"),
                          0.01);
 }
+}  // namespace

 TEST_F(SoftmaxOpTest, NEONTest) {
  SoftMaxNEONTest({5, 64, 64, 3});

--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void BMSpaceToBatch(
+void BMSpaceToBatch(
    int iters, int batch, int height, int width, int channels, int shape) {
  mace::testing::StopTiming();

@@ -39,6 +40,7 @@ static void BMSpaceToBatch(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE)             \
  static void                                                                \

--- a/mace/ops/space_to_batch_test.cc
+++ b/mace/ops/space_to_batch_test.cc
@@ -11,6 +11,7 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D>
 void RunSpaceToBatch(const std::vector<index_t> &input_shape,
                     const std::vector<float> &input_data,
@@ -101,6 +102,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
  RunBatchToSpace<DeviceType::OPENCL>(batch_shape, batch_data, block_data,
                                      padding_data, space_tensor.get());
 }
+}  // namespace

 TEST(SpaceToBatchTest, SmallData) {
  TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2},

--- a/mace/ops/space_to_depth_benchmark.cc
+++ b/mace/ops/space_to_depth_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void SpaceToDepth(
+void SpaceToDepth(
    int iters, int batch, int channels, int height, int width, int block_size) {
  mace::testing::StopTiming();

@@ -48,6 +49,7 @@ static void SpaceToDepth(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE)             \
  static void                                                            \

--- a/mace/ops/transpose_test.cc
+++ b/mace/ops/transpose_test.cc
@@ -11,6 +11,7 @@ namespace test {

 class TransposeOpTest : public OpsTestBase {};

+namespace {
 void TransposeNCHWTest(const std::vector<index_t> &input_shape) {
  // Construct graph
  OpsTestNet net;
@@ -32,6 +33,7 @@ void TransposeNCHWTest(const std::vector<index_t> &input_shape) {
                          *net.GetOutput("Output"),
                          0.01);
 }
+}  // namespace

 TEST_F(TransposeOpTest, NCHW) {
  TransposeNCHWTest({3, 64, 64, 128});

--- a/mace/ops/winograd_convolution_test.cc
+++ b/mace/ops/winograd_convolution_test.cc
@@ -14,6 +14,7 @@ namespace test {

 class WinogradConvlutionTest : public OpsTestBase {};

+namespace {
 void TransposeFilter(const std::vector<float> &input,
                     const std::vector<index_t> &input_shape,
                     std::vector<float> *output) {
@@ -131,6 +132,7 @@ void WinogradConvolution(const index_t batch,
    ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-4);
  }
 }
+}  // namespace

 TEST_F(WinogradConvlutionTest, AlignedConvolution) {
  WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
@@ -153,6 +155,7 @@ TEST_F(WinogradConvlutionTest, BatchConvolution) {
                                                 Padding::SAME);
 }

+namespace {
 template <DeviceType D, typename T>
 void WinogradConvolutionWithPad(const index_t batch,
                                const index_t height,
@@ -248,6 +251,7 @@ void WinogradConvolutionWithPad(const index_t batch,
    ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-3);
  }
 }
+}  // namespace

 }  // namespace test
 }  // namespace ops

--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {

+namespace {
 template <DeviceType D, typename T>
-static void BMWinogradTransform(
+void BMWinogradTransform(
    int iters, int batch, int height, int width, int channels) {
  mace::testing::StopTiming();

@@ -38,6 +39,7 @@ static void BMWinogradTransform(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)                  \
  static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
@@ -56,8 +58,9 @@ BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
 BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
 BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);

+namespace {
 template <DeviceType D, typename T>
-static void BMWinogradInverseTransform(
+void BMWinogradInverseTransform(
    int iters, int batch, int height, int width, int channels) {
  mace::testing::StopTiming();

@@ -88,6 +91,7 @@ static void BMWinogradInverseTransform(
  }
  net.Sync();
 }
+}  // namespace

 #define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)          \
  static void                                                                  \

--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -45,7 +45,8 @@ extern const std::string ModelChecksum();
 }  // namespace mace

 namespace mace {
-namespace examples {
+namespace tools {
+namespace validation {

 namespace str_util {

@@ -384,7 +385,12 @@ int Main(int argc, char **argv) {
   }
 }

-}  // namespace examples
+}  // namespace validation
+}  // namespace tools
 }  // namespace mace

-int main(int argc, char **argv) { mace::examples::Main(argc, argv); }
+// Forward Main()'s result as the process exit status; without the explicit
+// return, main() would always exit with 0 regardless of what Main() reports.
+int main(int argc, char **argv) {
+  return mace::tools::validation::Main(argc, argv);
+}