diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc
index 207d2de91acc2a5b3efc55df95ff2b94a89c9f04..6207234816f4a4dff6f6d3599050df7211427732 100644
--- a/mace/core/arg_helper.cc
+++ b/mace/core/arg_helper.cc
@@ -28,7 +28,7 @@ ArgumentHelper::ArgumentHelper(const NetDef &netdef) {
   }
 }
 
-bool ArgumentHelper::HasArgument(const string &name) const {
+bool ArgumentHelper::HasArgument(const std::string &name) const {
   return arg_map_.count(name);
 }
 
@@ -44,7 +44,7 @@ bool SupportsLosslessConversion(const InputType &value) {
 #define INSTANTIATE_GET_SINGLE_ARGUMENT(T, fieldname,                         \
                                         enforce_lossless_conversion)          \
   template <>                                                                 \
-  T ArgumentHelper::GetSingleArgument<T>(const string &name,                  \
+  T ArgumentHelper::GetSingleArgument<T>(const std::string &name,             \
                                          const T &default_value) const {      \
     if (arg_map_.count(name) == 0) {                                          \
       VLOG(3) << "Using default parameter value " << default_value            \
@@ -63,7 +63,8 @@ bool SupportsLosslessConversion(const InputType &value) {
     return value;                                                             \
   }                                                                           \
   template <>                                                                 \
-  bool ArgumentHelper::HasSingleArgumentOfType<T>(const string &name) const { \
+  bool ArgumentHelper::HasSingleArgumentOfType<T>(                            \
+      const std::string &name) const {                                        \
     if (arg_map_.count(name) == 0) {                                          \
       return false;                                                           \
     }                                                                         \
@@ -80,28 +81,28 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i, true)
 INSTANTIATE_GET_SINGLE_ARGUMENT(uint8_t, i, true)
 INSTANTIATE_GET_SINGLE_ARGUMENT(uint16_t, i, true)
 INSTANTIATE_GET_SINGLE_ARGUMENT(size_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, false)
+INSTANTIATE_GET_SINGLE_ARGUMENT(std::string, s, false)
 #undef INSTANTIATE_GET_SINGLE_ARGUMENT
 
-#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname,                   \
-                                          enforce_lossless_conversion)    \
-  template <>                                                             \
-  std::vector<T> ArgumentHelper::GetRepeatedArgument<T>(                  \
-      const string &name, const std::vector<T> &default_value) const {    \
-    if (arg_map_.count(name) == 0) {                                      \
-      return default_value;                                               \
-    }                                                                     \
-    std::vector<T> values;                                                \
-    for (const auto &v : arg_map_.at(name).fieldname()) {                 \
-      if (enforce_lossless_conversion) {                                  \
-        auto supportsConversion =                                         \
-            SupportsLosslessConversion<decltype(v), T>(v);                \
-        MACE_CHECK(supportsConversion, "Value", v, " of argument ", name, \
-                   "cannot be represented correctly in a target type");   \
-      }                                                                   \
-      values.push_back(v);                                                \
-    }                                                                     \
-    return values;                                                        \
+#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname,                     \
+                                          enforce_lossless_conversion)      \
+  template <>                                                               \
+  std::vector<T> ArgumentHelper::GetRepeatedArgument<T>(                    \
+      const std::string &name, const std::vector<T> &default_value) const { \
+    if (arg_map_.count(name) == 0) {                                        \
+      return default_value;                                                 \
+    }                                                                       \
+    std::vector<T> values;                                                  \
+    for (const auto &v : arg_map_.at(name).fieldname()) {                   \
+      if (enforce_lossless_conversion) {                                    \
+        auto supportsConversion =                                           \
+            SupportsLosslessConversion<decltype(v), T>(v);                  \
+        MACE_CHECK(supportsConversion, "Value ", v, " of argument ", name,  \
+                   " cannot be represented correctly in a target type");    \
+      }                                                                     \
+      values.push_back(v);                                                  \
+    }                                                                       \
+    return values;                                                          \
   }
 
 INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats, false)
@@ -114,7 +115,7 @@ INSTANTIATE_GET_REPEATED_ARGUMENT(int64_t, ints, true)
 INSTANTIATE_GET_REPEATED_ARGUMENT(uint8_t, ints, true)
 INSTANTIATE_GET_REPEATED_ARGUMENT(uint16_t, ints, true)
 INSTANTIATE_GET_REPEATED_ARGUMENT(size_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings, false)
+INSTANTIATE_GET_REPEATED_ARGUMENT(std::string, strings, false)
 #undef INSTANTIATE_GET_REPEATED_ARGUMENT
 
 }  // namespace mace
diff --git a/mace/core/arg_helper.h b/mace/core/arg_helper.h
index 296f66e50efdf38c2107f7b55b082fc400c7ee1a..c370d5fc16a3383f7bb720587d5ca21ccf2108ae 100644
--- a/mace/core/arg_helper.h
+++ b/mace/core/arg_helper.h
@@ -14,8 +14,6 @@
 
 namespace mace {
 
-using std::string;
-
 /**
  * @brief A helper class to index into arguments.
  *
@@ -27,45 +25,45 @@ using std::string;
 class ArgumentHelper {
  public:
   template <typename Def>
-  static bool HasArgument(const Def &def, const string &name) {
+  static bool HasArgument(const Def &def, const std::string &name) {
     return ArgumentHelper(def).HasArgument(name);
   }
 
   template <typename Def, typename T>
   static T GetSingleArgument(const Def &def,
-                             const string &name,
+                             const std::string &name,
                              const T &default_value) {
     return ArgumentHelper(def).GetSingleArgument<T>(name, default_value);
   }
 
   template <typename Def, typename T>
-  static bool HasSingleArgumentOfType(const Def &def, const string &name) {
+  static bool HasSingleArgumentOfType(const Def &def, const std::string &name) {
     return ArgumentHelper(def).HasSingleArgumentOfType<T>(name);
   }
 
   template <typename Def, typename T>
   static std::vector<T> GetRepeatedArgument(
       const Def &def,
-      const string &name,
+      const std::string &name,
       const std::vector<T> &default_value = std::vector<T>()) {
     return ArgumentHelper(def).GetRepeatedArgument<T>(name, default_value);
   }
 
   explicit ArgumentHelper(const OperatorDef &def);
   explicit ArgumentHelper(const NetDef &netdef);
-  bool HasArgument(const string &name) const;
+  bool HasArgument(const std::string &name) const;
 
   template <typename T>
-  T GetSingleArgument(const string &name, const T &default_value) const;
+  T GetSingleArgument(const std::string &name, const T &default_value) const;
   template <typename T>
-  bool HasSingleArgumentOfType(const string &name) const;
+  bool HasSingleArgumentOfType(const std::string &name) const;
   template <typename T>
   std::vector<T> GetRepeatedArgument(
-      const string &name,
+      const std::string &name,
       const std::vector<T> &default_value = std::vector<T>()) const;
 
  private:
-  std::map<string, Argument> arg_map_;
+  std::map<std::string, Argument> arg_map_;
 };
 
 }  // namespace mace
diff --git a/mace/core/net.h b/mace/core/net.h
index e14297222a933f73640fff3736664d8c0f1b1f84..2a9af0ff344df64fa412ccbabdf82daf092442be 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -28,10 +28,10 @@ class NetBase {
 
   virtual bool Run(RunMetadata *run_metadata = nullptr) = 0;
 
-  const string &Name() const { return name_; }
+  const std::string &Name() const { return name_; }
 
  protected:
-  string name_;
+  std::string name_;
   const std::shared_ptr<const OperatorRegistry> op_registry_;
 
   DISABLE_COPY_AND_ASSIGN(NetBase);
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 3ca7cd167b1e13d46e6072cea74f19d87df88ced..a22f424f2de11232617271d213bacc3c9e3dd57a 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -25,25 +25,26 @@ class OperatorBase {
   explicit OperatorBase(const OperatorDef &operator_def, Workspace *ws);
   virtual ~OperatorBase() noexcept {}
 
-  inline bool HasArgument(const string &name) const {
+  inline bool HasArgument(const std::string &name) const {
     MACE_CHECK(operator_def_, "operator_def was null!");
     return ArgumentHelper::HasArgument(*operator_def_, name);
   }
   template <typename T>
-  inline T GetSingleArgument(const string &name, const T &default_value) const {
+  inline T GetSingleArgument(const std::string &name,
+                             const T &default_value) const {
     MACE_CHECK(operator_def_, "operator_def was null!");
     return ArgumentHelper::GetSingleArgument<OperatorDef, T>(
         *operator_def_, name, default_value);
   }
   template <typename T>
-  inline bool HasSingleArgumentOfType(const string &name) const {
+  inline bool HasSingleArgumentOfType(const std::string &name) const {
     MACE_CHECK(operator_def_, "operator_def was null!");
     return ArgumentHelper::HasSingleArgumentOfType<OperatorDef, T>(
         *operator_def_, name);
   }
   template <typename T>
   inline std::vector<T> GetRepeatedArgument(
-      const string &name, const std::vector<T> &default_value = {}) const {
+      const std::string &name, const std::vector<T> &default_value = {}) const {
     MACE_CHECK(operator_def_, "operator_def was null!");
     return ArgumentHelper::GetRepeatedArgument<OperatorDef, T>(
         *operator_def_, name, default_value);
@@ -90,14 +91,14 @@ class Operator : public OperatorBase {
  public:
   explicit Operator(const OperatorDef &operator_def, Workspace *ws)
       : OperatorBase(operator_def, ws) {
-    for (const string &input_str : operator_def.input()) {
+    for (const std::string &input_str : operator_def.input()) {
       const Tensor *tensor = ws->GetTensor(input_str);
       MACE_CHECK(tensor != nullptr, "op ", operator_def.type(),
                  ": Encountered a non-existing input tensor: ", input_str);
       inputs_.push_back(tensor);
     }
 
-    for (const string &output_str : operator_def.output()) {
+    for (const std::string &output_str : operator_def.output()) {
       if (ws->HasTensor(output_str)) {
         outputs_.push_back(ws->GetTensor(output_str));
       } else {
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 585e7424062293089e396393cfe17fd6cb160df3..e37cdb22697e3c4365cd0ab2a4aca485690c7772 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -11,21 +11,21 @@
 namespace mace {
 namespace kernels {
 
-void DepthwiseConv2d(cl::Kernel *kernel,
-                     const Tensor *input,   // NHWC
-                     const Tensor *filter,  // HWIM
-                     const Tensor *bias,
-                     const int stride,
-                     const int *paddings,
-                     const int *dilations,
-                     const ActivationType activation,
-                     const float relux_max_limit,
-                     const DataType dt,
-                     std::vector<index_t> *prev_input_shape,
-                     Tensor *output,
-                     StatsFuture *future,
-                     uint32_t *kwg_size,
-                     std::unique_ptr<BufferBase> *kernel_error) {
+static void DepthwiseConv2d(cl::Kernel *kernel,
+                            const Tensor *input,   // NHWC
+                            const Tensor *filter,  // HWIM
+                            const Tensor *bias,
+                            const int stride,
+                            const int *paddings,
+                            const int *dilations,
+                            const ActivationType activation,
+                            const float relux_max_limit,
+                            const DataType dt,
+                            std::vector<index_t> *prev_input_shape,
+                            Tensor *output,
+                            StatsFuture *future,
+                            uint32_t *kwg_size,
+                            std::unique_ptr<BufferBase> *kernel_error) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc
index 7f40aea3be137c407aea9dab95f6f6f29a2c2dc1..56312f61dff766f06d596b3e2ce1de9fa4e46391 100644
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -8,6 +8,7 @@
 namespace mace {
 namespace kernels {
 
+namespace {
 template <typename T>
 void FCWXKernel(cl::Kernel *kernel,
                 const Tensor *input,
@@ -268,6 +269,7 @@ void FCWTXKernel(cl::Kernel *kernel,
     (*kernel_error)->UnMap();
   }
 }
+}  // namespace
 
 template <typename T>
 void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index 1d294462d1cd615ee45836b325e5f8d5d8051161..ebb28220b248f7ded8e7a52123d89d6115355c6b 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -14,6 +14,7 @@
 namespace mace {
 namespace kernels {
 
+namespace {
 // [(C + 3) / 4 * W, N * H]
 void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
                            std::vector<size_t> *image_shape) {
@@ -97,6 +98,7 @@ void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* HW */
   (*image_shape)[0] = RoundUpDiv4(shape[1]);
   (*image_shape)[1] = shape[0];
 }
+}  // namespace
 
 void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                      const BufferType type,
diff --git a/mace/kernels/proposal.h b/mace/kernels/proposal.h
index d3afe966e2da1892f7fc3862fb1e49cbe14ea082..0f259cdc3db0e0addcc9ab67540f4d8781eacc97 100644
--- a/mace/kernels/proposal.h
+++ b/mace/kernels/proposal.h
@@ -15,7 +15,7 @@
 namespace mace {
 namespace kernels {
 
-static std::vector<float> WHCenters(const std::vector<float> &anchor) {
+inline std::vector<float> WHCenters(const std::vector<float> &anchor) {
   // width, height, width_center, height_center
   std::vector<float> window(4);
   window[0] = anchor[2] - anchor[0] + 1;
@@ -25,7 +25,7 @@ static std::vector<float> WHCenters(const std::vector<float> &anchor) {
   return window;
 }
 
-std::vector<std::vector<float>> GenerateAnchors(
+inline std::vector<std::vector<float>> GenerateAnchors(
     const std::vector<int> &scales,
     const std::vector<float> &ratios,
     const int base_size) {
@@ -65,10 +65,10 @@ std::vector<std::vector<float>> GenerateAnchors(
   return anchors;
 }
 
-std::vector<int> nms(const float *bboxes_ptr,
-                     const index_t num_bboxes,
-                     const float thresh,
-                     const int post_nms_top_n) {
+inline std::vector<int> nms(const float *bboxes_ptr,
+                            const index_t num_bboxes,
+                            const float thresh,
+                            const int post_nms_top_n) {
   std::vector<int> keep;
   std::vector<int> suppressed(num_bboxes, 0);
 
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index cfc3105ba74eed1bdf8b9a13eeaea284e53e518b..5a513753771b9a3153d539a2f49ed819540c7bdc 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -31,10 +31,11 @@ inline float CalculateResizeScale(index_t in_size,
              : in_size / static_cast<float>(out_size);
 }
 
-inline void ComputeInterpolationWeights(const index_t out_size,
-                                        const index_t in_size,
-                                        const float scale,
-                                        CachedInterpolation *interpolation) {
+inline void ComputeInterpolationWeights(
+    const index_t out_size,
+    const index_t in_size,
+    const float scale,
+    CachedInterpolation *interpolation) {
   interpolation[out_size].lower = 0;
   interpolation[out_size].upper = 0;
   for (index_t i = out_size - 1; i >= 0; --i) {
diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc
index d8a8cb726a5e8398052dec9e397c8a830fef8219..df04ff8190926288aa195203c6a74cd45dd3550e 100644
--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -12,8 +12,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void ReluBenchmark(
+void ReluBenchmark(
     int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
@@ -51,6 +52,7 @@ static void ReluBenchmark(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
@@ -72,8 +74,9 @@ BM_RELU(1, 3, 512, 512);
 BM_RELU(1, 32, 112, 112);
 BM_RELU(1, 64, 256, 256);
 
+namespace {
 template <DeviceType D, typename T>
-static void ReluxBenchmark(
+void ReluxBenchmark(
     int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
@@ -113,6 +116,7 @@ static void ReluxBenchmark(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
@@ -134,8 +138,9 @@ BM_RELUX(1, 3, 512, 512);
 BM_RELUX(1, 32, 112, 112);
 BM_RELUX(1, 64, 256, 256);
 
+namespace {
 template <DeviceType D, typename T>
-static void PreluBenchmark(
+void PreluBenchmark(
     int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
@@ -178,6 +183,7 @@ static void PreluBenchmark(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
@@ -199,8 +205,9 @@ BM_PRELU(1, 3, 512, 512);
 BM_PRELU(1, 32, 112, 112);
 BM_PRELU(1, 64, 256, 256);
 
+namespace {
 template <DeviceType D, typename T>
-static void TanhBenchmark(
+void TanhBenchmark(
     int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
@@ -238,6 +245,7 @@ static void TanhBenchmark(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
@@ -259,8 +267,9 @@ BM_TANH(1, 3, 512, 512);
 BM_TANH(1, 32, 112, 112);
 BM_TANH(1, 64, 256, 256);
 
+namespace {
 template <DeviceType D, typename T>
-static void SigmoidBenchmark(
+void SigmoidBenchmark(
     int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
@@ -298,6 +307,7 @@ static void SigmoidBenchmark(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE)                   \
   static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc
index 034a591fa21c0455bb9f6ccb35e6f12b494f4dd0..23110fe4ed88e43a5d5a9f37d83df62cf97a4be1 100644
--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -11,6 +11,7 @@ namespace test {
 
 class ActivationOpTest : public OpsTestBase {};
 
+namespace {
 template <DeviceType D>
 void TestSimpleRelu() {
   OpsTestNet net;
@@ -52,6 +53,7 @@ void TestSimpleRelu() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu<DeviceType::CPU>(); }
 
@@ -59,6 +61,7 @@ TEST_F(ActivationOpTest, OPENCLSimpleRelu) {
   TestSimpleRelu<DeviceType::OPENCL>();
 }
 
+namespace {
 template <DeviceType D>
 void TestUnalignedSimpleRelu() {
   OpsTestNet net;
@@ -97,6 +100,7 @@ void TestUnalignedSimpleRelu() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) {
   TestUnalignedSimpleRelu<DeviceType::CPU>();
@@ -106,6 +110,8 @@ TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
   TestUnalignedSimpleRelu<DeviceType::OPENCL>();
 }
 
+
+namespace {
 template <DeviceType D>
 void TestSimpleRelux() {
   OpsTestNet net;
@@ -149,6 +155,7 @@ void TestSimpleRelux() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }
 
@@ -156,6 +163,7 @@ TEST_F(ActivationOpTest, OPENCLSimple) {
   TestSimpleRelux<DeviceType::OPENCL>();
 }
 
+namespace {
 template <DeviceType D>
 void TestSimpleReluRelux() {
   OpsTestNet net;
@@ -199,6 +207,7 @@ void TestSimpleReluRelux() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(ActivationOpTest, CPUSimpleRelux) {
   TestSimpleReluRelux<DeviceType::CPU>();
@@ -208,6 +217,7 @@ TEST_F(ActivationOpTest, OPENCLSimpleRelux) {
   TestSimpleReluRelux<DeviceType::OPENCL>();
 }
 
+namespace {
 template <DeviceType D>
 void TestSimplePrelu() {
   OpsTestNet net;
@@ -261,6 +271,7 @@ void TestSimplePrelu() {
     ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
   }
 }
+}  // namespace
 
 TEST_F(ActivationOpTest, CPUSimplePrelu) {
   TestSimplePrelu<DeviceType::CPU>();
@@ -274,6 +285,7 @@ TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
   TestSimplePrelu<DeviceType::OPENCL>();
 }
 
+namespace {
 template <DeviceType D>
 void TestSimpleTanh() {
   OpsTestNet net;
@@ -318,6 +330,7 @@ void TestSimpleTanh() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh<DeviceType::CPU>(); }
 
@@ -325,6 +338,7 @@ TEST_F(ActivationOpTest, OPENCLSimpleTanh) {
   TestSimpleTanh<DeviceType::OPENCL>();
 }
 
+namespace {
 template <DeviceType D>
 void TestSimpleSigmoid() {
   OpsTestNet net;
@@ -370,6 +384,7 @@ void TestSimpleSigmoid() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(ActivationOpTest, CPUSimpleSigmoid) {
   TestSimpleSigmoid<DeviceType::CPU>();
diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc
index a5c5a114101ca0e00db24574906f25c664c1f742..a9538c9f218dac42d7676829c89e6f7daa64a71d 100644
--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -12,8 +12,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
+void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -57,6 +58,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
     net.Sync();
   }
 }
+}  // namespace
 
 #define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                       \
   static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(   \
diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc
index 068932dbfd38d79a072a1f8a8b04ab0687c206ec..37f409b61fa10166ab22188fbe4544e8c537385e 100644
--- a/mace/ops/addn_test.cc
+++ b/mace/ops/addn_test.cc
@@ -11,6 +11,7 @@ namespace test {
 
 class AddnOpTest : public OpsTestBase {};
 
+namespace {
 template <DeviceType D>
 void SimpleAdd2() {
   // Construct graph
@@ -32,9 +33,11 @@ void SimpleAdd2() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }
 
+namespace {
 template <DeviceType D>
 void SimpleAdd3() {
   // Construct graph
@@ -58,9 +61,11 @@ void SimpleAdd3() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
 
+namespace {
 template <DeviceType D>
 void RandomTest() {
   testing::internal::LogToStderr();
@@ -116,6 +121,7 @@ void RandomTest() {
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);
   }
 }
+}  // namespace
 
 TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
 
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index 6e9f20c85b6ba5c94213c3da7ef74adabd15c843..e7a68bd6bbf49bc0a879411826c45c5a3077d838 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -11,8 +11,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void BatchNorm(
+void BatchNorm(
     int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
@@ -74,6 +75,7 @@ static void BatchNorm(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE)                  \
   static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc
index af3f51c569d998e58a428e31489bdbf94c5988a6..bb546a78524cc4c834ab0adc909f0adf9f7442af 100644
--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
@@ -11,6 +11,7 @@ namespace test {
 
 class BatchNormOpTest : public OpsTestBase {};
 
+namespace {
 template<DeviceType D>
 void Simple() {
   OpsTestNet net;
@@ -71,6 +72,7 @@ void Simple() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
 }
+}  // namespace
 
 TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
 
diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc
index de613f0401c994ab620d9a7c7d9ae2da4f434fa3..5eaa2aee3791565f0fbb3e15404152c5de605967 100644
--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void BMBatchToSpace(
+void BMBatchToSpace(
     int iters, int batch, int channels, int height, int width, int arg) {
   mace::testing::StopTiming();
 
@@ -38,6 +39,7 @@ static void BMBatchToSpace(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE)             \
   static void                                                              \
diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc
index 375f78ac2e4b65a33079ea9cc05ef07cdd94d936..4926f5a0b23a803c5f8bdbfaedacb37a6425c4e9 100644
--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -11,8 +11,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void BiasAdd(int iters, int batch, int channels, int height, int width) {
+void BiasAdd(int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -51,6 +52,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE)                  \
   static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc
index eff5ace0b81a665800e4a2345c58bc7a3c441247..ccfcc9451ceb8d6483d5a11652026746fae5c6ec 100644
--- a/mace/ops/bias_add_test.cc
+++ b/mace/ops/bias_add_test.cc
@@ -11,6 +11,7 @@ namespace test {
 
 class BiasAddOpTest : public OpsTestBase {};
 
+namespace {
 template <DeviceType D>
 void BiasAddSimple() {
   OpsTestNet net;
@@ -54,6 +55,7 @@ void BiasAddSimple() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
 }
+}  // namespace
 
 TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); }
 
diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc
index 1d6d55ad181ae880f423b8444b49fb0e92255a8a..151abb8f4ceb4ae96c90c1d2c67258822396e377 100644
--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -9,6 +9,7 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
 void TestBidirectionTransform(const int type,
                               const std::vector<index_t> &input_shape) {
@@ -40,6 +41,7 @@ void TestBidirectionTransform(const int type,
   ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                       1e-5);
 }
+}  // namespace
 
 TEST(BufferToImageTest, ArgSmall) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
@@ -112,6 +114,7 @@ TEST(BufferToImageTest, Filter3x3Large) {
                                                       {3, 3, 128, 256});
 }
 
+namespace {
 template <DeviceType D, typename T>
 void TestDiffTypeBidirectionTransform(const int type,
                                       const std::vector<index_t> &input_shape) {
@@ -142,12 +145,14 @@ void TestDiffTypeBidirectionTransform(const int type,
   ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                           1e-2);
 }
+}  // namespace
 
 TEST(BufferToImageTest, ArgFloatToHalfSmall) {
   TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT,
                                                              {11});
 }
 
+namespace {
 template <DeviceType D, typename T>
 void TestStringHalfBidirectionTransform(const int type,
                                         const std::vector<index_t> &input_shape,
@@ -182,6 +187,7 @@ void TestStringHalfBidirectionTransform(const int type,
   ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                          1e-2);
 }
+}  // namespace
 
 TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
   const unsigned char input_data[] = {
diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc
index c547cce6637dac8dda1c768b1aa5f8327c6e2553..9c41970459d115ebe2901a704dd1ee0f58977b1e 100644
--- a/mace/ops/channel_shuffle_benchmark.cc
+++ b/mace/ops/channel_shuffle_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void ChannelShuffle(
+void ChannelShuffle(
     int iters, int batch, int channels, int height, int width, int group) {
   mace::testing::StopTiming();
 
@@ -48,6 +49,7 @@ static void ChannelShuffle(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE)             \
   static void                                                             \
diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc
index a8da83588cb62b49033083c2a0f85318f4bbf984..4b6b332c9a80fbd147aae60ed9208a385dec08ea 100644
--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void ConcatHelper(int iters, int concat_dim, int dim1) {
+void ConcatHelper(int iters, int concat_dim, int dim1) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -39,6 +40,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
     net.RunOp(D);
   }
 }
+}  // namespace
 
 #define BM_CONCAT_CPU_MACRO(DIM0, DIM1)                      \
   static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) {     \
@@ -51,11 +53,12 @@ BM_CONCAT_CPU_MACRO(0, 100000);
 BM_CONCAT_CPU_MACRO(1, 1000);
 BM_CONCAT_CPU_MACRO(1, 100000);
 
+namespace {
 template <typename T>
-static void OpenclConcatHelper(int iters,
-                               const std::vector<index_t> &shape0,
-                               const std::vector<index_t> &shape1,
-                               int concat_dim) {
+void OpenclConcatHelper(int iters,
+                        const std::vector<index_t> &shape0,
+                        const std::vector<index_t> &shape1,
+                        int concat_dim) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -91,6 +94,7 @@ static void OpenclConcatHelper(int iters,
     net.RunOp(DeviceType::OPENCL);
   }
 }
+}  // namespace
 
 #define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE)                           \
   static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc
index d45939577ec8b837cc4424159bc95891d3ee54f4..8a6e00d42de8fe105efed3c067420a7c9d222e4a 100644
--- a/mace/ops/concat_test.cc
+++ b/mace/ops/concat_test.cc
@@ -144,6 +144,7 @@ TEST_F(ConcatOpTest, CPURandom) {
   }
 }
 
+namespace {
 template <typename T>
 void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
                       const int axis) {
@@ -208,6 +209,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
     k++;
   }
 }
+}  // namespace
 
 TEST_F(ConcatOpTest, OPENCLAligned) {
   OpenclRandomTest<float>({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3);
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 98a5017dfed23b8c418c3d997fd9aa84bc7556dc..74e0bb42610c5372e58392b86a745db0125be7e9 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -13,18 +13,19 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void Conv2d(int iters,
-                   int batch,
-                   int channels,
-                   int height,
-                   int width,
-                   int kernel_h,
-                   int kernel_w,
-                   int stride,
-                   int dilation,
-                   Padding padding,
-                   int output_channels) {
+void Conv2d(int iters,
+            int batch,
+            int channels,
+            int height,
+            int width,
+            int kernel_h,
+            int kernel_w,
+            int stride,
+            int dilation,
+            Padding padding,
+            int output_channels) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -88,6 +89,7 @@ static void Conv2d(int iters,
     net.Sync();
   }
 }
+}  // namespace
 
 // In common network, there are usually more than 1 layers, this is used to
 // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 6bb541a9c35373ff92e8f0dc323555c584391f01..b56f924c255b58bae092620fb76169447365e67a 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -14,6 +14,7 @@ namespace test {
 
 class Conv2dOpTest : public OpsTestBase {};
 
+namespace {
 template<DeviceType D, typename T>
 void TestNHWCSimple3x3VALID() {
   OpsTestNet net;
@@ -129,6 +130,7 @@ void TestNHWCSimple3x3SAME() {
 
   ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+}  // namespace
 
 TEST_F(Conv2dOpTest, CPUSimple) {
   TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
@@ -140,6 +142,7 @@ TEST_F(Conv2dOpTest, OPENCLSimple) {
   TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
 }
 
+namespace {
 template<DeviceType D, typename T>
 void TestNHWCSimple3x3WithoutBias() {
   OpsTestNet net;
@@ -193,6 +196,7 @@ void TestNHWCSimple3x3WithoutBias() {
 
   ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+}  // namespace
 
 TEST_F(Conv2dOpTest, CPUWithoutBias) {
   TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
@@ -202,8 +206,9 @@ TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
   TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
 }
 
+namespace {
 template<DeviceType D, typename T>
-static void TestNHWCCombined3x3() {
+void TestNHWCCombined3x3() {
   // Construct graph
   OpsTestNet net;
 
@@ -263,6 +268,7 @@ static void TestNHWCCombined3x3() {
                    9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
   ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+}  // namespace
 
 TEST_F(Conv2dOpTest, CPUStride2) {
   TestNHWCCombined3x3<DeviceType::CPU, float>();
@@ -272,6 +278,7 @@ TEST_F(Conv2dOpTest, OPENCLStride2) {
   TestNHWCCombined3x3<DeviceType::OPENCL, float>();
 }
 
+namespace {
 template<DeviceType D>
 void TestConv1x1() {
   // Construct graph
@@ -340,14 +347,16 @@ void TestConv1x1() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+}  // namespace
 
 TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
 
 TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
 
+namespace {
 template<DeviceType D, typename T>
-static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
-                                  const int stride) {
+void TestComplexConvNxNS12(const std::vector<index_t> &shape,
+                           const int stride) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -414,6 +423,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
     func(kernel_size, kernel_size, stride, stride, SAME);
   }
 }
+}  // namespace
 
 TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
   TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 1);
@@ -430,10 +440,11 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
   TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17}, 4);
 }
 
+namespace {
 template<DeviceType D>
-static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
-                                      const std::vector<index_t> &filter_shape,
-                                      const std::vector<int> &dilations) {
+void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
+                               const std::vector<index_t> &filter_shape,
+                               const std::vector<int> &dilations) {
   testing::internal::LogToStderr();
   srand(time(NULL));
 
@@ -515,6 +526,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
     func(2, 2, SAME);
   }
 }
+}  // namespace
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
   TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 1, 32, 64},
@@ -566,9 +578,10 @@ TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
                                                 {4, 4});
 }
 
+namespace {
 template<DeviceType D, typename T>
-static void TestDilationConvNxN(const std::vector<index_t> &shape,
-                                const int dilation_rate) {
+void TestDilationConvNxN(const std::vector<index_t> &shape,
+                         const int dilation_rate) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -638,6 +651,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape,
     }
   }
 }
+}  // namespace
 
 TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
   TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, 2);
@@ -651,9 +665,10 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
   TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 4);
 }
 
+namespace {
 template<DeviceType D, typename T>
-static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
-                                    const std::vector<int> &paddings) {
+void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
+                             const std::vector<int> &paddings) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) {
     srand(time(NULL));
@@ -719,6 +734,7 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
     }
   }
 }
+}  // namespace
 
 TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
   TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, {1, 1});
diff --git a/mace/ops/cwise_benchmark.cc b/mace/ops/cwise_benchmark.cc
index 6ab6aa543321f5102b731a735762eb2aaa97ca39..632accc1f66841d1543d5fee4cc92d34a9f44bfb 100644
--- a/mace/ops/cwise_benchmark.cc
+++ b/mace/ops/cwise_benchmark.cc
@@ -11,9 +11,10 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void CWise(int iters, int batch, int channels,
-                       int height, int width, float x, int type) {
+void CWise(int iters, int batch, int channels,
+           int height, int width, float x, int type) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -51,6 +52,7 @@ static void CWise(int iters, int batch, int channels,
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_CWISE_MACRO(N, C, H, W, X, G, TYPE, DEVICE)              \
   static void                                                             \
diff --git a/mace/ops/cwise_test.cc b/mace/ops/cwise_test.cc
index 7bd934f8d55c67dd8da8f91678838fe8c5b84bf9..0ae24fd52c0233b2618c6548b90861f74e11af80 100644
--- a/mace/ops/cwise_test.cc
+++ b/mace/ops/cwise_test.cc
@@ -12,7 +12,7 @@ namespace test {
 
 class CWiseOpTest : public OpsTestBase {};
 
-
+namespace {
 template <DeviceType D>
 void Simple(const kernels::CWiseType type,
             const std::vector<index_t> &shape,
@@ -56,6 +56,7 @@ void Simple(const kernels::CWiseType type,
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3);
 }
+}  // namespace
 
 TEST_F(CWiseOpTest, CPUSimple) {
   Simple<DeviceType::CPU>(kernels::CWiseType::MUL, {1, 1, 2, 3},
@@ -97,6 +98,7 @@ TEST_F(CWiseOpTest, GPUSimple) {
                     {1, -2, -0.0001, 4, 5, 6}, 2.0, {1, 2, 0.0001, 4, 5, 6});
 }
 
+namespace {
 template <DeviceType D, typename T>
 void RandomTest(const kernels::CWiseType type,
                 const std::vector<index_t> &shape) {
@@ -144,6 +146,7 @@ void RandomTest(const kernels::CWiseType type,
                             *net.GetOutput("OPENCLOutput"), 1e-1);
   }
 }
+}  // namespace
 
 TEST_F(CWiseOpTest, OPENCLRandomFloat) {
   RandomTest<DeviceType::OPENCL, float>(kernels::CWiseType::MUL,
diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc
index c90a8bd81c278dc5dfc3a2470097234c6dbb39f6..39f251ef5d7255a2edc6b0b5f0ac77683d685269 100644
--- a/mace/ops/depth_to_space_benchmark.cc
+++ b/mace/ops/depth_to_space_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void DepthToSpace(
+void DepthToSpace(
     int iters, int batch, int channels, int height, int width, int block_size) {
   mace::testing::StopTiming();
 
@@ -48,6 +49,7 @@ static void DepthToSpace(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE)             \
   static void                                                            \
diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc
index 835e39b3cc6dada31ac72f89f1d756a6ea430ddd..93cfac5863da6a142c290d0239fa169c0b959399 100644
--- a/mace/ops/depth_to_space_test.cc
+++ b/mace/ops/depth_to_space_test.cc
@@ -11,6 +11,7 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D>
 void RunDepthToSpace(const bool d2s,
                      const std::vector<index_t> &input_shape,
@@ -49,6 +50,7 @@ void RunDepthToSpace(const bool d2s,
   auto expected = CreateTensor<float>(expected_shape, expected_data);
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+}  // namespace
 
 
 class SpaceToDepthOpTest : public OpsTestBase {};
@@ -149,6 +151,7 @@ TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
 }
 
 
+namespace {
 template <DeviceType D, typename T>
 void RandomTest(const bool d2s, const int block_size,
                 const std::vector<index_t> &shape) {
@@ -197,6 +200,7 @@ void RandomTest(const bool d2s, const int block_size,
                             *net.GetOutput("OPENCLOutput"), 1e-1);
   }
 }
+}  // namespace
 
 TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
   RandomTest<DeviceType::OPENCL, float>(true, 2, {1, 192, 192, 128});
diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc
index 5ce7ae3d14624fa44df5d2d1dd157b9dfc211659..66e4dc5d672f2811af09ae708a8de15f7f219fbf 100644
--- a/mace/ops/depthwise_conv2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -13,17 +13,18 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void DepthwiseConv2d(int iters,
-                            int batch,
-                            int input_channels,
-                            int height,
-                            int width,
-                            int kernel_h,
-                            int kernel_w,
-                            int stride,
-                            Padding padding,
-                            int multiplier) {
+void DepthwiseConv2d(int iters,
+                     int batch,
+                     int input_channels,
+                     int height,
+                     int width,
+                     int kernel_h,
+                     int kernel_w,
+                     int stride,
+                     Padding padding,
+                     int multiplier) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -87,6 +88,7 @@ static void DepthwiseConv2d(int iters,
     net.Sync();
   }
 }
+}  // namespace
 
 #define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE,    \
                                    DEVICE)                                    \
diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc
index c3bca21ff9c2e34728a922be3644199288298485..73bdceb5605d2e49f5063f6a966eed2c0d83f3a5 100644
--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -11,6 +11,7 @@ namespace test {
 
 class DepthwiseConv2dOpTest : public OpsTestBase {};
 
+namespace {
 template<DeviceType D, typename T>
 void SimpleValidTest() {
   testing::internal::LogToStderr();
@@ -69,6 +70,7 @@ void SimpleValidTest() {
 
   ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(DepthwiseConv2dOpTest, SimpleCPU) {
   SimpleValidTest<DeviceType::CPU, float>();
@@ -82,6 +84,7 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
   SimpleValidTest<DeviceType::OPENCL, half>();
 }
 
+namespace {
 template<DeviceType D, typename T>
 void ComplexValidTest() {
   testing::internal::LogToStderr();
@@ -188,6 +191,7 @@ void ComplexValidTest() {
 
   ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.2);
 }
+}  // namespace
 
 TEST_F(DepthwiseConv2dOpTest, ComplexCPU) {
   ComplexValidTest<DeviceType::CPU, float>();
@@ -201,6 +205,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
   ComplexValidTest<DeviceType::OPENCL, half>();
 }
 
+namespace {
 template<DeviceType D, typename T>
 void TestNxNS12(const index_t height, const index_t width) {
   testing::internal::LogToStderr();
@@ -287,6 +292,7 @@ void TestNxNS12(const index_t height, const index_t width) {
     }
   }
 }
+}  // namespace
 
 TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
   TestNxNS12<DeviceType::OPENCL, float>(4, 4);
@@ -314,6 +320,7 @@ TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) {
   TestNxNS12<DeviceType::OPENCL, half>(107, 113);
 }
 
+namespace {
 void TestNEONNxNS12(const index_t height,
                     const index_t width,
                     const index_t input_channels,
@@ -385,6 +392,7 @@ void TestNEONNxNS12(const index_t height,
     }
   }
 }
+}  // namespace
 
 TEST_F(DepthwiseConv2dOpTest, NEONTest) {
   TestNEONNxNS12(4, 4, 32, 1);
diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc
index 478db803b59e9dc6c6cd02e4719189f894cdf96b..7f2b046c45ef7cca0f0b32de42cf5fb06031c8c9 100644
--- a/mace/ops/eltwise_benchmark.cc
+++ b/mace/ops/eltwise_benchmark.cc
@@ -13,8 +13,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void EltwiseBenchmark(
+void EltwiseBenchmark(
     int iters, kernels::EltwiseType type, int n, int h, int w, int c) {
   mace::testing::StopTiming();
 
@@ -59,6 +60,7 @@ static void EltwiseBenchmark(
     net.Sync();
   }
 }
+}  // namespace
 
 #define BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE)             \
   static void                                                            \
diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc
index 8a0fbcd882cf68156afbf459cac32d0116478bec..36dea470f8b9e9f5a783a176cad43bda4494c4f4 100644
--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -12,6 +12,7 @@ namespace test {
 
 class EltwiseOpTest : public OpsTestBase {};
 
+namespace {
 template <DeviceType D>
 void Simple(const kernels::EltwiseType type,
             const std::vector<index_t> &shape,
@@ -61,6 +62,7 @@ void Simple(const kernels::EltwiseType type,
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3);
 }
+}  // namespace
 
 TEST_F(EltwiseOpTest, CPUSimple) {
   Simple<DeviceType::CPU>(kernels::EltwiseType::PROD, {1, 1, 2, 3},
@@ -98,6 +100,7 @@ TEST_F(EltwiseOpTest, GPUSimple) {
                              {1, 1, 3, 3, 5, 6});
 }
 
+namespace {
 template <DeviceType D, typename T>
 void RandomTest(const kernels::EltwiseType type,
                 const std::vector<index_t> &shape) {
@@ -149,6 +152,7 @@ void RandomTest(const kernels::EltwiseType type,
                             *net.GetOutput("OPENCLOutput"), 1e-1);
   }
 }
+}  // namespace
 
 TEST_F(EltwiseOpTest, OPENCLRandomFloat) {
   RandomTest<DeviceType::OPENCL, float>(kernels::EltwiseType::PROD,
diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc
index 4c13e08c38e58477a5dbbe00885166d13d5bcddf..d46e161570a922c8f51f9e06913f713a2e6b8f82 100644
--- a/mace/ops/folded_batch_norm_test.cc
+++ b/mace/ops/folded_batch_norm_test.cc
@@ -11,6 +11,7 @@ namespace test {
 
 class FoldedBatchNormOpTest : public OpsTestBase {};
 
+namespace {
 void CalculateScaleOffset(const std::vector<float> &gamma,
                           const std::vector<float> &beta,
                           const std::vector<float> &mean,
@@ -21,7 +22,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
   size_t size = gamma.size();
   for (int i = 0; i < size; ++i) {
     (*scale)[i] = gamma[i] / std::sqrt(var[i] + epsilon);
-    (*offset)[i] = (*offset)[i] - mean[i] * (*scale)[i];
+    (*offset)[i] = beta[i] - mean[i] * (*scale)[i];
   }
 }
 
@@ -76,6 +77,7 @@ void Simple() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
 }
+}  // namespace
 
 TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
 
diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc
index 2328ea8e2290f2915e825d8dbf68f25dc6bd7d49..a9a89860065567640d91b9433c04fed0be13d832 100644
--- a/mace/ops/fully_connected_benchmark.cc
+++ b/mace/ops/fully_connected_benchmark.cc
@@ -12,8 +12,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void FCBenchmark(
+void FCBenchmark(
     int iters, int batch, int height, int width, int channel, int out_channel) {
   mace::testing::StopTiming();
 
@@ -64,6 +65,7 @@ static void FCBenchmark(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE)                     \
   static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc
index e42a4c59bfaa56442194430a5d55fa69097b398b..349acc7f03c30aaa57faa76eaeacc621f1624f32 100644
--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -13,6 +13,7 @@ namespace test {
 
 class FullyConnectedOpTest : public OpsTestBase {};
 
+namespace {
 template<DeviceType D>
 void Simple(const std::vector<index_t> &input_shape,
             const std::vector<float> &input_value,
@@ -66,6 +67,7 @@ void Simple(const std::vector<index_t> &input_shape,
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(FullyConnectedOpTest, SimpleCPU) {
   Simple<DeviceType::CPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
@@ -107,6 +109,7 @@ TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
                              {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
 }
 
+namespace {
 template<typename T>
 void Complex(const index_t batch,
              const index_t height,
@@ -166,6 +169,7 @@ void Complex(const index_t batch,
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-3);
   }
 }
+}  // namespace
 
 TEST_F(FullyConnectedOpTest, OPENCLAlignedWithoutBatch) {
   Complex<float>(1, 16, 16, 32, 16);
@@ -189,6 +193,7 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) {
   Complex<half>(31, 21, 11, 23, 103);
 }
 
+namespace {
 template<typename T>
 void TestWXFormat(const index_t batch,
                   const index_t height,
@@ -247,6 +252,7 @@ void TestWXFormat(const index_t batch,
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
   }
 }
+}  // namespace
 
 TEST_F(FullyConnectedOpTest, OPENCLWidthFormatAligned) {
   TestWXFormat<float>(1, 7, 7, 32, 16);
@@ -266,11 +272,12 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfWidthFormatAligned) {
   TestWXFormat<half>(1, 16, 32, 32, 32);
 }
 
+namespace {
 void FullyConnectedTestNEON(const index_t batch,
-              const index_t height,
-              const index_t width,
-              const index_t channels,
-              const index_t out_channel) {
+                            const index_t height,
+                            const index_t width,
+                            const index_t channels,
+                            const index_t out_channel) {
   srand(time(NULL));
 
   // Construct graph
@@ -310,6 +317,7 @@ void FullyConnectedTestNEON(const index_t batch,
                           *net.GetOutput("OutputNeon"),
                           0.01);
 }
+}  // namespace
 
 TEST_F(FullyConnectedOpTest, TestNEON) {
   FullyConnectedTestNEON(1, 7, 7, 32, 16);
diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc
index 04e00ae05a8911cb3b4998e351e760ee1d1f37ba..e0fb62457254ead8b07dbcc45c74923f33b26525 100644
--- a/mace/ops/fused_conv_2d_test.cc
+++ b/mace/ops/fused_conv_2d_test.cc
@@ -13,6 +13,7 @@ namespace test {
 
 class FusedConv2dOpTest : public OpsTestBase {};
 
+namespace {
 template<DeviceType D, typename T>
 void TestNHWCSimple3x3VALID() {
   OpsTestNet net;
@@ -21,7 +22,7 @@ void TestNHWCSimple3x3VALID() {
     "Input", {1, 3, 3, 2},
     {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
   net.AddInputFromArray<D, T>(
-    "Filter", {3, 3, 2, 1},
+    "Filter", {3, 3, 1, 2},
     {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
   net.AddInputFromArray<D, T>("Bias", {1}, {-0.1f});
@@ -42,6 +43,7 @@ void TestNHWCSimple3x3VALID() {
       .AddIntArg("padding", Padding::VALID)
       .AddIntsArg("dilations", {1, 1})
       .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
       .Finalize(net.NewOperatorDef());
 
     net.RunOp(D);
@@ -60,6 +62,7 @@ void TestNHWCSimple3x3VALID() {
       .AddIntArg("padding", Padding::VALID)
       .AddIntsArg("dilations", {1, 1})
       .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
       .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
@@ -78,7 +81,7 @@ void TestNHWCSimple3x3SAME() {
     "Input", {1, 3, 3, 2},
     {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
   net.AddInputFromArray<D, T>(
-    "Filter", {3, 3, 2, 1},
+    "Filter", {3, 3, 1, 2},
     {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
   net.AddInputFromArray<D, T>("Bias", {1}, {-0.1f});
@@ -99,6 +102,7 @@ void TestNHWCSimple3x3SAME() {
       .AddIntArg("padding", Padding::SAME)
       .AddIntsArg("dilations", {1, 1})
       .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
       .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
@@ -117,6 +121,7 @@ void TestNHWCSimple3x3SAME() {
       .AddIntArg("padding", Padding::SAME)
       .AddIntsArg("dilations", {1, 1})
       .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
       .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
@@ -127,6 +132,7 @@ void TestNHWCSimple3x3SAME() {
 
   ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+}  // namespace
 
 TEST_F(FusedConv2dOpTest, CPUSimple) {
   TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
@@ -138,6 +144,7 @@ TEST_F(FusedConv2dOpTest, OPENCLSimple) {
   TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
 }
 
+namespace {
 template<DeviceType D, typename T>
 void TestNHWCSimple3x3WithoutBias() {
   OpsTestNet net;
@@ -147,7 +154,7 @@ void TestNHWCSimple3x3WithoutBias() {
     "Input", {1, 3, 3, 2},
     {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
   net.AddInputFromArray<D, T>(
-    "Filter", {3, 3, 2, 1},
+    "Filter", {3, 3, 1, 2},
     {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
 
@@ -165,6 +172,7 @@ void TestNHWCSimple3x3WithoutBias() {
       .AddIntArg("padding", Padding::VALID)
       .AddIntsArg("dilations", {1, 1})
       .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
       .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
@@ -180,6 +188,7 @@ void TestNHWCSimple3x3WithoutBias() {
       .AddIntArg("padding", Padding::VALID)
       .AddIntsArg("dilations", {1, 1})
       .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddStringArg("activation", "RELU")
       .Finalize(net.NewOperatorDef());
 
     // Run
@@ -191,6 +200,7 @@ void TestNHWCSimple3x3WithoutBias() {
 
   ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+}  // namespace
 
 TEST_F(FusedConv2dOpTest, CPUWithoutBias) {
   TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
@@ -200,6 +210,7 @@ TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) {
   TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
 }
 
+namespace {
 template<DeviceType D>
 void TestConv1x1() {
   // Construct graph
@@ -216,8 +227,8 @@ void TestConv1x1() {
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
   net.AddInputFromArray<D, float>(
-    "Filter", {1, 1, 5, 2},
-    {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
+    "Filter", {1, 1, 2, 5},
+    {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
 
   if (D == DeviceType::OPENCL) {
@@ -268,13 +279,15 @@ void TestConv1x1() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+}  // namespace
 
 TEST_F(FusedConv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
 
 TEST_F(FusedConv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
 
+namespace {
 template<DeviceType D, typename T>
-static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
+void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -343,13 +356,15 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     }
   }
 }
+}  // namespace
 
 TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) {
   TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
 }
 
+namespace {
 template<DeviceType D>
-static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
+void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -428,14 +443,16 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
     }
   }
 }
+}  // namespace
 
 TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
   TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64});
 }
 
+namespace {
 template<DeviceType D, typename T>
-static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
-                                  const std::vector<index_t> &filter_shape) {
+void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
+                           const std::vector<index_t> &filter_shape) {
   testing::internal::LogToStderr();
   auto func = [&](int stride_h, int stride_w, Padding type) {
     srand(time(NULL));
@@ -444,10 +461,10 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
     index_t batch = 1;
     index_t height = image_shape[0];
     index_t width = image_shape[1];
-    index_t input_channels = filter_shape[2];
-    index_t output_channels = filter_shape[3];
     index_t kernel_h = filter_shape[0];
     index_t kernel_w = filter_shape[1];
+    index_t output_channels = filter_shape[2];
+    index_t input_channels = filter_shape[3];
     // Construct graph
     OpsTestNet net;
     OpDefBuilder("FusedConv2D", "FusedConv2dTest")
@@ -504,18 +521,20 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
     func(stride, stride, SAME);
   }
 }
+}  // namespace
 
 TEST_F(FusedConv2dOpTest, OPENCL7X7ConvNxNS12) {
-  TestGeneralConvNxNS12<DeviceType::OPENCL, float>({32, 32}, {7, 7, 3, 64});
+  TestGeneralConvNxNS12<DeviceType::OPENCL, float>({32, 32}, {7, 7, 64, 3});
 }
 
 TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
-  TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 32, 64});
+  TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 64, 32});
 }
 
+namespace {
 template<DeviceType D, typename T>
-static void TestAtrousConvNxN(const std::vector<index_t> &shape,
-                              const int dilation) {
+void TestAtrousConvNxN(const std::vector<index_t> &shape,
+                       const int dilation) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -525,8 +544,8 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape,
     index_t batch = 1;
     index_t height = shape[0];
     index_t width = shape[1];
-    index_t input_channels = shape[2];
-    index_t output_channels = shape[3];
+    index_t output_channels = shape[2];
+    index_t input_channels = shape[3];
     // Construct graph
     OpsTestNet net;
     OpDefBuilder("FusedConv2D", "FusedConv2dTest")
@@ -585,6 +604,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape,
     }
   }
 }
+}  // namespace
 
 TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN2) {
   TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 2);
@@ -598,10 +618,11 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
   TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
 }
 
+namespace {
 template<DeviceType D>
-static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
-                                      const std::vector<index_t> &filter_shape,
-                                      const std::vector<int> &dilations) {
+void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
+                               const std::vector<index_t> &filter_shape,
+                               const std::vector<int> &dilations) {
   testing::internal::LogToStderr();
   auto func = [&](int stride_h, int stride_w, Padding type) {
     srand(time(NULL));
@@ -610,10 +631,10 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
     index_t batch = 1;
     index_t height = image_shape[0];
     index_t width = image_shape[1];
-    index_t input_channels = filter_shape[2];
-    index_t output_channels = filter_shape[3];
     index_t kernel_h = filter_shape[0];
     index_t kernel_w = filter_shape[1];
+    index_t output_channels = filter_shape[2];
+    index_t input_channels = filter_shape[3];
     // Construct graph
     OpsTestNet net;
     OpDefBuilder("FusedConv2D", "FusedConv2dTest")
@@ -668,9 +689,10 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
   func(1, 1, VALID);
   func(1, 1, SAME);
 }
+}  // namespace
 
 TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
-  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 16},
+  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 16, 3},
                                                 {2, 2});
 }
 
@@ -679,7 +701,8 @@ TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
                                                 {2, 2});
 }
 
-static void TestNEONGeneralConvNxNS12(
+namespace {
+void TestNEONGeneralConvNxNS12(
   const std::vector<index_t> &image_shape,
   const std::vector<index_t> &filter_shape) {
   testing::internal::LogToStderr();
@@ -690,10 +713,10 @@ static void TestNEONGeneralConvNxNS12(
     index_t batch = 1;
     index_t height = image_shape[0];
     index_t width = image_shape[1];
-    index_t input_channels = filter_shape[2];
-    index_t output_channels = filter_shape[3];
     index_t kernel_h = filter_shape[0];
     index_t kernel_w = filter_shape[1];
+    index_t output_channels = filter_shape[2];
+    index_t input_channels = filter_shape[3];
     // Construct graph
     OpsTestNet net;
     OpDefBuilder("FusedConv2D", "FusedConv2dTest")
@@ -748,9 +771,10 @@ static void TestNEONGeneralConvNxNS12(
     func(stride, stride, SAME);
   }
 }
+}  // namespace
 
 TEST_F(FusedConv2dOpTest, NEONTest) {
-  TestNEONGeneralConvNxNS12({32, 32}, {7, 7, 3, 64});
+  TestNEONGeneralConvNxNS12({32, 32}, {7, 7, 64, 3});
 }
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/global_avg_pooling_benchmark.cc b/mace/ops/global_avg_pooling_benchmark.cc
index 0e8126bc3704e728dfd935d38c59ed1ddcf3c1b2..b5a8e9c624da82b6dc541d23b4f41ce89260ad8a 100644
--- a/mace/ops/global_avg_pooling_benchmark.cc
+++ b/mace/ops/global_avg_pooling_benchmark.cc
@@ -11,8 +11,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D>
-static void GlobalAvgPooling(
+void GlobalAvgPooling(
     int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
@@ -36,6 +37,7 @@ static void GlobalAvgPooling(
     net.RunOp(D);
   }
 }
+}  // namespace
 
 #define BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, DEVICE)               \
   static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE( \
diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc
index 850acb184d404befd0c7897831175ff2cabe8e74..a547a25bc6fb7701877868b7767e2b00f21032d5 100644
--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -12,8 +12,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void MatMulBenchmark(
+void MatMulBenchmark(
     int iters, int batch, int height, int channels, int out_width) {
   mace::testing::StopTiming();
 
@@ -54,6 +55,7 @@ static void MatMulBenchmark(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                              \
   static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc
index 192cfc31f63b77faa791e632239b99b433f9a6ad..afd92df2333f1e378528ec640e01d5b91d91619c 100644
--- a/mace/ops/matmul_test.cc
+++ b/mace/ops/matmul_test.cc
@@ -13,6 +13,7 @@ namespace test {
 
 class MatMulOpTest : public OpsTestBase {};
 
+namespace {
 template <DeviceType D>
 void Simple(const std::vector<index_t> &A_shape,
             const std::vector<float> &A_value,
@@ -58,6 +59,7 @@ void Simple(const std::vector<index_t> &A_shape,
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+}  // namespace
 
 TEST_F(MatMulOpTest, SimpleCPU) {
   Simple<DeviceType::CPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
@@ -98,6 +100,7 @@ TEST_F(MatMulOpTest, SimpleGPUWithBatch) {
                           {2, 2, 2, 1}, {22, 28, 49, 64, 22, 28, 49, 64});
 }
 
+namespace {
 template <typename T>
 void Complex(const index_t batch,
              const index_t height,
@@ -150,6 +153,7 @@ void Complex(const index_t batch,
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4);
   }
 }
+}  // namespace
 
 TEST_F(MatMulOpTest, OPENCLAlignedWithoutBatch) {
   Complex<float>(1, 64, 128, 32);
diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc
index 947c7aa8d83e7dc55271f6985f8c1a38ddc2e050..52b87d77f9e9e8696d1fdfbfdf0660e5042796da 100644
--- a/mace/ops/pad_benchmark.cc
+++ b/mace/ops/pad_benchmark.cc
@@ -11,9 +11,10 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void Pad(int iters, int batch, int height,
-                int width, int channels, int pad) {
+void Pad(int iters, int batch, int height,
+         int width, int channels, int pad) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -52,6 +53,7 @@ static void Pad(int iters, int batch, int height,
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE)                  \
   static void BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc
index 4aac54bbc340fe409920097ef22a77c18d327d5b..14d30ca5950f84c2c22a995669a05cec10889375 100644
--- a/mace/ops/pad_test.cc
+++ b/mace/ops/pad_test.cc
@@ -11,6 +11,7 @@ namespace test {
 
 class PadTest : public OpsTestBase {};
 
+namespace {
 template <DeviceType D>
 void Simple() {
   // Construct graph
@@ -57,6 +58,7 @@ void Simple() {
                                       });
   ExpectTensorNear<float>(*expected, *output, 1e-5);
 }
+}  // namespace
 
 TEST_F(PadTest, SimpleCPU) {
   Simple<DeviceType::CPU>();
@@ -94,6 +96,7 @@ TEST_F(PadTest, ComplexCPU) {
   ExpectTensorNear<float>(*expected, *output, 1e-5);
 }
 
+namespace {
 template <typename T>
 void Complex(const std::vector<index_t> &input_shape,
              const std::vector<int> &paddings) {
@@ -139,6 +142,7 @@ void Complex(const std::vector<index_t> &input_shape,
     ExpectTensorNear<float>(expected, *output, 1e-5);
   }
 }
+}  // namespace
 
 TEST_F(PadTest, ComplexFloat) {
   Complex<float>({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0});
diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc
index dd5c57a9971291fd6510e5e8ba398b86273683b4..546c02efa8f51315e2ff2de20b69b9b6bb63b850 100644
--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -12,16 +12,17 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D>
-static void Pooling(int iters,
-                    int batch,
-                    int channels,
-                    int height,
-                    int width,
-                    int kernel,
-                    int stride,
-                    Padding padding,
-                    PoolingType pooling_type) {
+void Pooling(int iters,
+             int batch,
+             int channels,
+             int height,
+             int width,
+             int kernel,
+             int stride,
+             Padding padding,
+             PoolingType pooling_type) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -49,6 +50,7 @@ static void Pooling(int iters,
     net.RunOp(D);
   }
 }
+}  // namespace
 
 #define BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE)          \
   static void                                                             \
diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc
index d74b6fdf8eed926455a551f1b6254fc8e19c6d43..d65af7c611b9f4f6acb80c3e4901d0046efb1821 100644
--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -123,8 +123,9 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
 
+namespace {
 template<DeviceType D>
-static void SimpleMaxPooling3S2() {
+void SimpleMaxPooling3S2() {
   // Construct graph
   OpsTestNet net;
 
@@ -168,6 +169,7 @@ static void SimpleMaxPooling3S2() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+}  // namespace
 
 TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2<CPU>(); }
 
@@ -175,10 +177,11 @@ TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
   SimpleMaxPooling3S2<OPENCL>();
 }
 
+namespace {
 template<DeviceType D, typename T>
-static void MaxPooling3S2(const std::vector<index_t> &input_shape,
-                          const std::vector<int> strides,
-                          Padding padding) {
+void MaxPooling3S2(const std::vector<index_t> &input_shape,
+                   const std::vector<int> &strides,
+                   Padding padding) {
   // Construct graph
   OpsTestNet net;
   OpDefBuilder("Pooling", "PoolingTest")
@@ -218,6 +221,7 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape,
 
   ExpectTensorNear<T>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
 }
+}  // namespace
 
 // TODO(chenghui) : there is a bug.
 // TEST_F(PoolingOpTest, NEONAlignedMaxPooling3S2) {
@@ -275,8 +279,9 @@ TEST_F(PoolingOpTest, AVG_VALID) {
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
 
+namespace {
 template<DeviceType D>
-static void SimpleAvgPoolingTest() {
+void SimpleAvgPoolingTest() {
   // Construct graph
   OpsTestNet net;
 
@@ -306,16 +311,18 @@ static void SimpleAvgPoolingTest() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+}  // namespace
 
 TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
   SimpleAvgPoolingTest<OPENCL>();
 }
 
+namespace {
 template<DeviceType D, typename T>
-static void AvgPoolingTest(const std::vector<index_t> &shape,
-                           const std::vector<int> &kernels,
-                           const std::vector<int> &strides,
-                           Padding padding) {
+void AvgPoolingTest(const std::vector<index_t> &shape,
+                    const std::vector<int> &kernels,
+                    const std::vector<int> &strides,
+                    Padding padding) {
   // Construct graph
   OpsTestNet net;
   OpDefBuilder("Pooling", "PoolingTest")
@@ -354,6 +361,7 @@ static void AvgPoolingTest(const std::vector<index_t> &shape,
 
   ExpectTensorNear<float, T>(expected, *net.GetOutput("OPENCLOutput"), 0.01);
 }
+}  // namespace
 
 TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
   AvgPoolingTest<OPENCL, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
@@ -396,11 +404,12 @@ TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
                                 Padding::SAME);
 }
 
-static void AvgPoolingNEONTest(const std::vector<index_t> &shape,
-                               const std::vector<int> &kernels,
-                               const std::vector<int> &strides,
-                               Padding padding,
-                               PoolingType pooling_type) {
+namespace {
+void AvgPoolingNEONTest(const std::vector<index_t> &shape,
+                        const std::vector<int> &kernels,
+                        const std::vector<int> &strides,
+                        Padding padding,
+                        PoolingType pooling_type) {
   // Construct graph
   OpsTestNet net;
   OpDefBuilder("Pooling", "PoolingTest")
@@ -441,6 +450,7 @@ static void AvgPoolingNEONTest(const std::vector<index_t> &shape,
                           *net.GetOutput("OutputNeon"),
                           0.01);
 }
+}  // namespace
 
 TEST_F(PoolingOpTest, NEONTest) {
   AvgPoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
diff --git a/mace/ops/reorganize_test.cc b/mace/ops/reorganize_test.cc
index 68e0886718d8728371878eff2eaa2e2b505d22d6..2c89369b51f6b764eabf6772a4135629f4c22f30 100644
--- a/mace/ops/reorganize_test.cc
+++ b/mace/ops/reorganize_test.cc
@@ -12,6 +12,7 @@ namespace test {
 
 class ReOrganizeTest : public OpsTestBase {};
 
+namespace {
 void TestReOrganize(const std::vector<index_t> &input_shape,
                     const std::vector<float> &input_data,
                     const std::vector<index_t> &output_shape,
@@ -69,6 +70,7 @@ void TestReOrganize(const std::vector<index_t> &input_shape,
     ASSERT_EQ(input_data[i], output_ptr[i]) << "With Index " << i;
   }
 }
+}  // namespace
 
 TEST_F(ReOrganizeTest, Simple) {
   TestReOrganize({1, 1, 4, 6},
diff --git a/mace/ops/reshape_test.cc b/mace/ops/reshape_test.cc
index e8c363a61e255f28657b9a7cf0f4955e4dcd279b..56e6feb50c2e36a17b5eab47038de7f4bce4a787 100644
--- a/mace/ops/reshape_test.cc
+++ b/mace/ops/reshape_test.cc
@@ -12,6 +12,7 @@ namespace test {
 
 class ReshapeTest : public OpsTestBase {};
 
+namespace {
 void TestReshape(const std::vector<index_t> &org_shape,
                  const std::vector<int> &output_shape,
                  const std::vector<index_t> &res_shape) {
@@ -41,6 +42,7 @@ void TestReshape(const std::vector<index_t> &org_shape,
     ASSERT_EQ(input_ptr[i], output_ptr[i]);
   }
 }
+}  // namespace
 
 TEST_F(ReshapeTest, Simple) {
   TestReshape({1, 2, 3, 4}, {1, 2, -1, 4}, {1, 2, 3, 4});
diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc
index aa66d346c983db82f20074395e1155745cd0d18f..8aada066957d7ac6c7a69788b00e046ffeef8cb0 100644
--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -11,14 +11,15 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void ResizeBilinearBenchmark(int iters,
-                                    int batch,
-                                    int channels,
-                                    int input_height,
-                                    int input_width,
-                                    int output_height,
-                                    int output_width) {
+void ResizeBilinearBenchmark(int iters,
+                             int batch,
+                             int channels,
+                             int input_height,
+                             int input_width,
+                             int output_height,
+                             int output_width) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -59,6 +60,7 @@ static void ResizeBilinearBenchmark(int iters,
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE)        \
   static void                                                               \
diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc
index 896fe630842c524d77c5a2066792408bffca492f..cd1b81d0eec455c8554adaa5a46807501144cf15 100644
--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -63,6 +63,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
 
+namespace {
 template <DeviceType D>
 void TestRandomResizeBilinear() {
   testing::internal::LogToStderr();
@@ -115,6 +116,7 @@ void TestRandomResizeBilinear() {
     ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.001);
   }
 }
+}  // namespace
 
 /*
 TEST_F(ResizeBilinearTest, NEONRandomResizeBilinear) {
diff --git a/mace/ops/slice_benchmark.cc b/mace/ops/slice_benchmark.cc
index a38c995d7f6c46d493d511d5f10ef1b9a1790f09..2c56362197bb2b9114a7c3726f86728f2b4094f3 100644
--- a/mace/ops/slice_benchmark.cc
+++ b/mace/ops/slice_benchmark.cc
@@ -10,10 +10,11 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template<DeviceType D, typename T>
-static void BMSliceHelper(int iters,
-                          const std::vector<index_t> &input_shape,
-                          const index_t num_outputs) {
+void BMSliceHelper(int iters,
+                   const std::vector<index_t> &input_shape,
+                   const index_t num_outputs) {
   mace::testing::StopTiming();
 
   // Construct graph
@@ -60,6 +61,7 @@ static void BMSliceHelper(int iters,
     net.Sync();
   }
 }
+}  // namespace
 
 #define BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE)                         \
   static void                                                                \
diff --git a/mace/ops/slice_test.cc b/mace/ops/slice_test.cc
index ad507af380a29ee0a3d138e983d4fe34396ad75f..4264315f42df0c878433915d3496c81bbf73b48c 100644
--- a/mace/ops/slice_test.cc
+++ b/mace/ops/slice_test.cc
@@ -15,6 +15,7 @@ namespace test {
 
 class SliceOpTest : public OpsTestBase {};
 
+namespace {
 template<DeviceType D, typename T>
 void RandomTest(const int num_outputs, const int axis) {
   static unsigned int seed = time(NULL);
@@ -104,6 +105,7 @@ void RandomTest(const int num_outputs, const int axis) {
     }
   }
 }
+}  // namespace
 
 TEST_F(SliceOpTest, CPU) {
   RandomTest<DeviceType::CPU, float>(2, 3);
diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc
index fb6dc4ef27ae2ae6904d5a598a70484724fd24ef..1d010c2f318e788b5b8389a9f36ef0ec8c4af74a 100644
--- a/mace/ops/softmax_benchmark.cc
+++ b/mace/ops/softmax_benchmark.cc
@@ -12,8 +12,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void SoftmaxBenchmark(
+void SoftmaxBenchmark(
     int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
@@ -49,6 +50,7 @@ static void SoftmaxBenchmark(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                   \
   static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc
index 97afa33069ff4ac083b5ea8af1d2d8ac9de3333b..156b84d3ef302662fb419d933d9634f5d40f507f 100644
--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
@@ -11,6 +11,7 @@ namespace test {
 
 class SoftmaxOpTest : public OpsTestBase {};
 
+namespace {
 template<DeviceType D>
 void Simple() {
   // Construct graph
@@ -50,10 +51,12 @@ void Simple() {
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-7);
 }
+}  // namespace
 
 TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
 TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::OPENCL>(); }
 
+namespace {
 template<DeviceType D>
 void Complex(const std::vector<index_t> &logits_shape) {
   // Construct graph
@@ -88,6 +91,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
 
   ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
 }
+}  // namespace
 
 TEST_F(SoftmaxOpTest, OPENCLAligned) {
   Complex<DeviceType::OPENCL>({1, 256, 256, 3});
@@ -104,6 +108,7 @@ TEST_F(SoftmaxOpTest, OPENCLUnAligned) {
   Complex<DeviceType::OPENCL>({5, 211, 107, 1});
 }
 
+namespace {
 void SoftMaxNEONTest(const std::vector<index_t> &logits_shape) {
   // Construct graph
   OpsTestNet net;
@@ -135,6 +140,7 @@ void SoftMaxNEONTest(const std::vector<index_t> &logits_shape) {
                           *net.GetOutput("OutputNeon"),
                           0.01);
 }
+}  // namespace
 
 TEST_F(SoftmaxOpTest, NEONTest) {
   SoftMaxNEONTest({5, 64, 64, 3});
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index 62a4f7ddf5c51dd29266321f5e3ec779ed00edde..efdfa29ab42a1815137cc23f5e1ccf65c0e12daa 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void BMSpaceToBatch(
+void BMSpaceToBatch(
     int iters, int batch, int height, int width, int channels, int shape) {
   mace::testing::StopTiming();
 
@@ -39,6 +40,7 @@ static void BMSpaceToBatch(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE)             \
   static void                                                                \
diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc
index 452a9638c14134cfbbb37feb0d7236798d6980a8..a62b3493026615a8976b0d0dfcf23d69f5759ae2 100644
--- a/mace/ops/space_to_batch_test.cc
+++ b/mace/ops/space_to_batch_test.cc
@@ -11,6 +11,7 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D>
 void RunSpaceToBatch(const std::vector<index_t> &input_shape,
                      const std::vector<float> &input_data,
@@ -101,6 +102,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
   RunBatchToSpace<DeviceType::OPENCL>(batch_shape, batch_data, block_data,
                                       padding_data, space_tensor.get());
 }
+}  // namespace
 
 TEST(SpaceToBatchTest, SmallData) {
   TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2},
diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc
index c97028c4c85cd792769f4fd69fc19ffe9a1280c0..e5f9942168d96b0658a6491a6117750c8258bf08 100644
--- a/mace/ops/space_to_depth_benchmark.cc
+++ b/mace/ops/space_to_depth_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void SpaceToDepth(
+void SpaceToDepth(
     int iters, int batch, int channels, int height, int width, int block_size) {
   mace::testing::StopTiming();
 
@@ -48,6 +49,7 @@ static void SpaceToDepth(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE)             \
   static void                                                            \
diff --git a/mace/ops/transpose_test.cc b/mace/ops/transpose_test.cc
index ecc8d08d916b7df82da767d01ac5148bb56f1445..bd61e132b5659cbd865a63d9037dc9f1e05c0cbc 100644
--- a/mace/ops/transpose_test.cc
+++ b/mace/ops/transpose_test.cc
@@ -11,6 +11,7 @@ namespace test {
 
 class TransposeOpTest : public OpsTestBase {};
 
+namespace {
 void TransposeNCHWTest(const std::vector<index_t> &input_shape) {
   // Construct graph
   OpsTestNet net;
@@ -32,6 +33,7 @@ void TransposeNCHWTest(const std::vector<index_t> &input_shape) {
                           *net.GetOutput("Output"),
                           0.01);
 }
+}  // namespace
 
 TEST_F(TransposeOpTest, NCHW) {
   TransposeNCHWTest({3, 64, 64, 128});
diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc
index 1f335c6bd9642adc20039f16a1b2158ff97f9cab..ce9bcdca6b27f9ef34cb73130db79b50c1db7cc9 100644
--- a/mace/ops/winograd_convolution_test.cc
+++ b/mace/ops/winograd_convolution_test.cc
@@ -14,6 +14,7 @@ namespace test {
 
 class WinogradConvlutionTest : public OpsTestBase {};
 
+namespace {
 void TransposeFilter(const std::vector<float> &input,
                      const std::vector<index_t> &input_shape,
                      std::vector<float> *output) {
@@ -131,6 +132,7 @@ void WinogradConvolution(const index_t batch,
     ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-4);
   }
 }
+}  // namespace
 
 TEST_F(WinogradConvlutionTest, AlignedConvolution) {
   WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
@@ -153,6 +155,7 @@ TEST_F(WinogradConvlutionTest, BatchConvolution) {
                                                  Padding::SAME);
 }
 
+namespace {
 template <DeviceType D, typename T>
 void WinogradConvolutionWithPad(const index_t batch,
                                 const index_t height,
@@ -248,6 +251,7 @@ void WinogradConvolutionWithPad(const index_t batch,
     ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-3);
   }
 }
+}  // namespace
 
 }  // namespace test
 }  // namespace ops
diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc
index bd20ae9f28ac28f664944a735162569d4c0a61d4..c74b548d1abf859df6b71c2dcc33fdd65daab4bc 100644
--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -10,8 +10,9 @@ namespace mace {
 namespace ops {
 namespace test {
 
+namespace {
 template <DeviceType D, typename T>
-static void BMWinogradTransform(
+void BMWinogradTransform(
     int iters, int batch, int height, int width, int channels) {
   mace::testing::StopTiming();
 
@@ -38,6 +39,7 @@ static void BMWinogradTransform(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)                  \
   static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
@@ -56,8 +58,9 @@ BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
 BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
 BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
 
+namespace {
 template <DeviceType D, typename T>
-static void BMWinogradInverseTransform(
+void BMWinogradInverseTransform(
     int iters, int batch, int height, int width, int channels) {
   mace::testing::StopTiming();
 
@@ -88,6 +91,7 @@ static void BMWinogradInverseTransform(
   }
   net.Sync();
 }
+}  // namespace
 
 #define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)          \
   static void                                                                  \
diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc
index 066ecd3f871e5cc03459bb6945450707b091ecf7..281cbede5c50ab82b6aa0d8a41be1fc4940ff60b 100644
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -45,7 +45,8 @@ extern const std::string ModelChecksum();
 }  // namespace mace
 
 namespace mace {
-namespace examples {
+namespace tools {
+namespace validation {
 
 namespace str_util {
 
@@ -384,7 +385,11 @@ int Main(int argc, char **argv) {
   }
 }
 
-}  // namespace examples
+}  // namespace validation
+}  // namespace tools
 }  // namespace mace
 
-int main(int argc, char **argv) { mace::examples::Main(argc, argv); }
+// Hand Main()'s int result back as the process exit status.
+int main(int argc, char **argv) {
+  return mace::tools::validation::Main(argc, argv);
+}