diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index 9287feb5cf19c887e6cbd719b48f26577037fcc9..84312a03d2e59d10fd76eec93f9e4cff2199696a 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -13,16 +13,13 @@ namespace kernels {
 
 template <DeviceType D, typename T>
 struct BatchNormFunctor {
-  float variance_epsilon_;
-
-  BatchNormFunctor(const float variance_epsilon)
-      : variance_epsilon_(variance_epsilon) {}
 
   void operator()(const T* input,
                   const T* scale,
                   const T* offset,
                   const T* mean,
                   const T* var,
+                  const float variance_epsilon,
                   const index_t n,
                   const index_t channel,
                   const index_t sample_size,
@@ -37,7 +34,7 @@ struct BatchNormFunctor {
     // Y = new_scale * X + new_offset;
     T new_scale, new_offset;
     for (index_t c = 0; c < channel; ++c) {
-      new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_);
+      new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon);
       new_offset = offset[c] - mean[c] * new_scale;
       index_t pos = c * sample_size;
 
@@ -60,6 +57,7 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
     const float* offset,
     const float* mean,
     const float* var,
+    const float variance_epsilon,
     const index_t n,
     const index_t channel,
     const index_t sample_size,
diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc
index 9c99695bc0712f53b482ac2445500b6daef43eae..ca7b0f1a169fcb5e91f711be1ba7f24c1af3ce58 100644
--- a/mace/kernels/neon/batch_norm_neon.cc
+++ b/mace/kernels/neon/batch_norm_neon.cc
@@ -15,6 +15,7 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
     const float* offset,
     const float* mean,
     const float* var,
+    const float variance_epsilon,
     const index_t n,
     const index_t channel,
     const index_t sample_size,
@@ -31,7 +32,7 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
   index_t count = sample_size >> 2;
   index_t remain_count = sample_size - (count << 2);
   for (index_t c = 0; c < channel; ++c) {
-    new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_);
+    new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon);
     new_offset = offset[c] - mean[c] * new_scale;
     index_t pos = c * sample_size;
 
diff --git a/mace/kernels/neon/max_pooling_neon_3x3.cc b/mace/kernels/neon/max_pooling_neon_3x3.cc
index 5a8bf246c9d338b9e777df2caeabda81bf86c47b..0c7a74d0b0d1133d9367ceac158240e84aa49d83 100644
--- a/mace/kernels/neon/max_pooling_neon_3x3.cc
+++ b/mace/kernels/neon/max_pooling_neon_3x3.cc
@@ -3,7 +3,6 @@
 //
 
 #include <arm_neon.h>
-#include <float.h>
 #include <limits>
 
 #include "mace/core/common.h"
diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h
index a9b1f9f52b3087f1daebee25b6daedac964e4922..e92d9ebb69de1e88512808bfb674842b8a6346c8 100644
--- a/mace/ops/batch_norm.h
+++ b/mace/ops/batch_norm.h
@@ -15,8 +15,7 @@ class BatchNormOp : public Operator<D, T> {
  public:
   BatchNormOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<D, T>(operator_def, ws),
-        functor_(
-            OperatorBase::GetSingleArgument<float>("variance_epsilon", 1e-4)) {}
+        functor_() {}
 
   bool Run() override {
     const Tensor* input = this->Input(0);
@@ -24,6 +23,7 @@ class BatchNormOp : public Operator<D, T> {
     const Tensor* offset = this->Input(2);
     const Tensor* mean = this->Input(3);
     const Tensor* var = this->Input(4);
+    const Tensor* epsilon = this->Input(5);
 
     MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ",
                input->dim_size());
@@ -35,6 +35,8 @@ class BatchNormOp : public Operator<D, T> {
                mean->dim_size());
     MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ",
                var->dim_size());
+    MACE_CHECK(epsilon->dim_size() == 0, "epsilon must be 0-dimensional. ",
+               epsilon->dim_size());
 
     Tensor* output = this->Output(0);
     output->ResizeLike(input);
@@ -48,9 +50,10 @@ class BatchNormOp : public Operator<D, T> {
     const T* offset_ptr = offset->data<T>();
     const T* mean_ptr = mean->data<T>();
     const T* var_ptr = var->data<T>();
+    const T* epsilon_ptr = epsilon->data<T>();
     T* output_ptr = output->mutable_data<T>();
 
-    functor_(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr, n, channel,
+    functor_(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr, *epsilon_ptr, n, channel,
              sample_size, output_ptr);
     return true;
   }
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index ecd647d4394439a79b77197e5e8ea46718ae0efa..079ad6f1a15c82b98487ec3850b21ee29accb19e 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -19,6 +19,7 @@ static void BatchNorm(
       .Input("Offset")
       .Input("Mean")
       .Input("Var")
+      .Input("Epsilon")
       .Output("Output")
       .Finalize(net.operator_def());
 
@@ -28,6 +29,7 @@ static void BatchNorm(
   net.AddRandomInput<T>("Offset", {channels});
   net.AddRandomInput<T>("Mean", {channels});
   net.AddRandomInput<T>("Var", {channels}, true);
+  net.AddInputFromArray<float>("Epsilon", {}, {1e-3});
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc
index f963de217ef2527ad890ffffedb6f1f68eb7a2d0..fd503ed567115d8c0508b642ccac4b07402b7cb0 100644
--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
@@ -18,6 +18,7 @@ TEST_F(BatchNormOpTest, SimpleCPU) {
       .Input("Offset")
       .Input("Mean")
       .Input("Var")
+      .Input("Epsilon")
       .Output("Output")
       .Finalize(net.operator_def());
 
@@ -28,6 +29,7 @@ TEST_F(BatchNormOpTest, SimpleCPU) {
   net.AddInputFromArray<float>("Offset", {1}, {2.0});
   net.AddInputFromArray<float>("Mean", {1}, {10});
   net.AddInputFromArray<float>("Var", {1}, {11.67f});
+  net.AddInputFromArray<float>("Epsilon", {}, {1e-3});
 
   // Run
   net.RunOp();
@@ -46,8 +48,8 @@ TEST_F(BatchNormOpTest, SimpleNeon) {
   // generate random input
   index_t batch = 1 + rand() % 10;
   index_t channels = 3 + rand() % 50;
-  index_t height = 10 + rand() % 50;
-  index_t width = 10 + rand() % 50;
+  index_t height = 103;
+  index_t width = 113;
   // Construct graph
   auto& net = test_net();
   OpDefBuilder("BatchNorm", "BatchNormTest")
@@ -56,6 +58,7 @@ TEST_F(BatchNormOpTest, SimpleNeon) {
       .Input("Offset")
       .Input("Mean")
       .Input("Var")
+      .Input("Epsilon")
       .Output("Output")
       .Finalize(net.operator_def());
 
@@ -65,6 +68,7 @@ TEST_F(BatchNormOpTest, SimpleNeon) {
   net.AddRandomInput<float>("Offset", {channels});
   net.AddRandomInput<float>("Mean", {channels});
   net.AddRandomInput<float>("Var", {channels}, true);
+  net.AddInputFromArray<float>("Epsilon", {}, {1e-3});
 
   // run cpu
   net.RunOp();