Fix bugs of batch norm and graph transformation.

bc576436 · liuqi · 286d1e21 · bc576436 · bc576436 · bc576436
8 changed file
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -12,16 +12,26 @@
 namespace mace {
 namespace kernels {

+struct BatchNormFunctorBase {
+  BatchNormFunctorBase(bool folded_constant, bool fused_relu) :
+      folded_constant_(folded_constant),
+      fused_relu_(fused_relu){}
+
+  const bool folded_constant_;
+  const bool fused_relu_;
+};
+
 template <DeviceType D, typename T>
-struct BatchNormFunctor {
+struct BatchNormFunctor : BatchNormFunctorBase{
+  BatchNormFunctor(const bool folded_constant, const bool fused_relu) :
+      BatchNormFunctorBase(folded_constant, fused_relu) {}
+
  void operator()(const Tensor *input,
                  const Tensor *scale,
                  const Tensor *offset,
                  const Tensor *mean,
                  const Tensor *var,
                  const float epsilon,
-                  const bool folded_constant,
-                  const bool fused_relu,
                  Tensor *output,
                  StatsFuture *future) {
    // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
@@ -45,17 +55,17 @@ struct BatchNormFunctor {
    const T *input_ptr = input->data<T>();
    const T *scale_ptr = scale->data<T>();
    const T *offset_ptr = offset->data<T>();
-    const T *mean_ptr = mean->data<T>();
-    const T *var_ptr = var->data<T>();
    T *output_ptr = output->mutable_data<T>();

    vector<T> new_scale;
    vector<T> new_offset;
-    if (!folded_constant) {
+    if (!folded_constant_) {
      new_scale.resize(channels);
      new_offset.resize(channels);
      Tensor::MappingGuard mean_mapper(mean);
      Tensor::MappingGuard var_mapper(var);
+      const T *mean_ptr = mean->data<T>();
+      const T *var_ptr = var->data<T>();
 #pragma omp parallel for
      for (index_t c = 0; c < channels; ++c) {
        new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon);
@@ -70,12 +80,12 @@ struct BatchNormFunctor {
      for (index_t h = 0; h < height; ++h) {
        for (index_t w = 0; w < width; ++w) {
          for (index_t c = 0; c < channels; ++c) {
-            if (folded_constant) {
+            if (folded_constant_) {
              output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c];
            } else {
              output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
            }
-            if (fused_relu) {
+            if (fused_relu_) {
              output_ptr[pos] = std::max(output_ptr[pos], static_cast<T>(0));
            }
            ++pos;
@@ -94,21 +104,19 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
    const Tensor *mean,
    const Tensor *var,
    const float epsilon,
-    const bool folded_constant,
-    const bool fused_relu,
    Tensor *output,
    StatsFuture *future);

 template <typename T>
-struct BatchNormFunctor<DeviceType::OPENCL, T> {
+struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
+  BatchNormFunctor(const bool folded_constant, const bool fused_relu) :
+      BatchNormFunctorBase(folded_constant, fused_relu) {}
  void operator()(const Tensor *input,
                  const Tensor *scale,
                  const Tensor *offset,
                  const Tensor *mean,
                  const Tensor *var,
                  const float epsilon,
-                  const bool folded_constant,
-                  const bool fused_relu,
                  Tensor *output,
                  StatsFuture *future);
 };

--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -20,11 +20,9 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
    const Tensor *mean,
    const Tensor *var,
    const float epsilon,
-    const bool folded_constant,
-    const bool fused_relu,
    Tensor *output,
    StatsFuture *future) {
-  MACE_CHECK(folded_constant || (mean != nullptr && var != nullptr));
+  MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr));

  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
@@ -38,10 +36,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
  auto dt = DataTypeToEnum<T>::value;
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  if (folded_constant) {
+  if (folded_constant_) {
    built_options.emplace("-DFOLDED_CONSTANT");
  }
-  if (fused_relu) {
+  if (fused_relu_) {
    built_options.emplace("-DFUSED_RELU");
  }
  auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
@@ -50,7 +48,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
  bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(scale->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(offset->buffer())));
-  if (!folded_constant) {
+  if (!folded_constant_) {
    bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(mean->buffer())));
    bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(var->buffer())));
    bm_kernel.setArg(idx++, epsilon);
@@ -103,7 +101,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
     << output->dim(1) << "_"
     << output->dim(2) << "_"
     << output->dim(3) << "_"
-     << folded_constant;
+     << folded_constant_;
  OpenCLProfilingTimer timer(&event);
  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
                                                     lws,

--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -25,7 +25,7 @@ __kernel void batch_norm(__read_only image2d_t input,

  // native_rsqrt seems not faster than rsqrt
  DATA_TYPE4 bn_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon);
-  DATA_TYPE4 bn_offset = mad(0 - mean_value, new_scale, offset_value);
+  DATA_TYPE4 bn_offset = mad(0 - mean_value, bn_scale, offset_value);
 #endif

  const int pos = mad24(ch_blk, width, w);

--- a/mace/ops/batch_norm.h
+++ b/mace/ops/batch_norm.h
@@ -14,7 +14,7 @@ template <DeviceType D, class T>
 class BatchNormOp : public Operator<D, T> {
 public:
  BatchNormOp(const OperatorDef &operator_def, Workspace *ws)
-      : Operator<D, T>(operator_def, ws) {
+      : Operator<D, T>(operator_def, ws), functor_(false, false) {
    epsilon_ =
        OperatorBase::GetSingleArgument<float>("epsilon", static_cast<float>(1e-4));
  }
@@ -40,8 +40,7 @@ class BatchNormOp : public Operator<D, T> {
    Tensor *output = this->Output(OUTPUT);
    output->ResizeLike(input);

-    functor_(input, scale, offset, mean, var, epsilon_,
-             false, false, output, future);
+    functor_(input, scale, offset, mean, var, epsilon_, output, future);
    return true;
  }


--- a/mace/ops/folded_batch_norm.h
+++ b/mace/ops/folded_batch_norm.h
@@ -14,8 +14,8 @@ template <DeviceType D, class T>
 class FoldedBatchNormOp : public Operator<D, T> {
 public:
  FoldedBatchNormOp(const OperatorDef &operator_def, Workspace *ws)
-      : Operator<D, T>(operator_def, ws) {
-    fused_relu_ = OperatorBase::GetSingleArgument<bool>("fused_relu", false);
+      : Operator<D, T>(operator_def, ws),
+        functor_(true, OperatorBase::GetSingleArgument<bool>("fused_relu", false)) {
  }

  bool Run(StatsFuture *future) override {
@@ -33,13 +33,11 @@ class FoldedBatchNormOp : public Operator<D, T> {
    Tensor *output = this->Output(OUTPUT);
    output->ResizeLike(input);

-    functor_(input, scale, offset, nullptr, nullptr, 0,
-             true, fused_relu_, output, future);
+    functor_(input, scale, offset, nullptr, nullptr, 0, output, future);
    return true;
  }

 private:
-  bool fused_relu_;
  kernels::BatchNormFunctor<D, T> functor_;

 protected:

--- a/mace/ops/folded_batch_norm_test.cc
+++ b/mace/ops/folded_batch_norm_test.cc
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/operator.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+
+class FoldedBatchNormOpTest : public OpsTestBase {};
+
+void CalculateScaleOffset(const std::vector<float> &gamma,
+                          const std::vector<float> &beta,
+                          const std::vector<float> &mean,
+                          const std::vector<float> &var,
+                          const float epsilon,
+                          std::vector<float> &scale,
+                          std::vector<float> &offset) {
+  size_t size = gamma.size();
+  for (int i = 0 ; i < size; ++i) {
+    scale[i] = gamma[i] / std::sqrt(var[i] + epsilon);
+    offset[i] = offset[i] - mean[i] * scale[i];
+  }
+}
+
+template <DeviceType D>
+void Simple() {
+  OpsTestNet net;
+
+  // Add input data
+  net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
+                                  {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
+  std::vector<float> scale(1);
+  std::vector<float> offset(1);
+  CalculateScaleOffset({4.0f}, {2.0}, {10}, {11.67f}, 1e-3, scale, offset);
+  net.AddInputFromArray<D, float>("Scale", {1}, scale);
+  net.AddInputFromArray<D, float>("Offset", {1}, offset);
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage",
+                            kernels::BufferType::IN_OUT);
+    BufferToImage<D, float>(net, "Scale", "ScaleImage",
+                            kernels::BufferType::ARGUMENT);
+    BufferToImage<D, float>(net, "Offset", "OffsetImage",
+                            kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+        .Input("InputImage")
+        .Input("ScaleImage")
+        .Input("OffsetImage")
+        .Output("OutputImage")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output",
+                            kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+        .Input("Input")
+        .Input("Scale")
+        .Input("Offset")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
+
+  // Check
+  auto expected =
+      CreateTensor<float>({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
+                                         3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
+
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
+}
+
+TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
+
+/*
+TEST_F(FoldedBatchNormOpTest, SimpleNEON) {
+  Simple<DeviceType::NEON>();
+}
+*/
+
+TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }
+
+/*
+TEST_F(FoldedBatchNormOpTest, SimpleRandomNeon) {
+  srand(time(NULL));
+
+  // generate random input
+  index_t batch = 1 + rand() % 10;
+  index_t channels = 3 + rand() % 50;
+  index_t height = 64;
+  index_t width = 64;
+  // Construct graph
+  OpsTestNet net;
+  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Input("Mean")
+      .Input("Var")
+      .Input("Epsilon")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, channels, height,
+width});
+  net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
+  net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
+
+  // run cpu
+  net.RunOp();
+
+  // Check
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Run NEON
+  net.RunOp(DeviceType::NEON);
+
+  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
+}
+
+TEST_F(FoldedBatchNormOpTest, ComplexRandomNeon) {
+  srand(time(NULL));
+
+  // generate random input
+  index_t batch = 1 + rand() % 10;
+  index_t channels = 3 + rand() % 50;
+  index_t height = 103;
+  index_t width = 113;
+  // Construct graph
+  OpsTestNet net;
+  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Input("Mean")
+      .Input("Var")
+      .Input("Epsilon")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, channels, height,
+width});
+  net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
+  net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
+
+  // run cpu
+  net.RunOp();
+
+  // Check
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Run NEON
+  net.RunOp(DeviceType::NEON);
+
+  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
+}
+*/
+
+TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
+  srand(time(NULL));
+
+  // generate random input
+  index_t batch = 1 + rand() % 10;
+  index_t channels = 3 + rand() % 50;
+  index_t height = 64;
+  index_t width = 64;
+
+  // Construct graph
+  OpsTestNet net;
+  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::OPENCL, float>(
+      "Input", {batch, height, width, channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
+
+  // run cpu
+  net.RunOp();
+
+  // Check
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Run on opencl
+  BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage",
+                                           kernels::BufferType::IN_OUT);
+  BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage",
+                                           kernels::BufferType::ARGUMENT);
+  BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage",
+                                           kernels::BufferType::ARGUMENT);
+
+  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+      .Input("InputImage")
+      .Input("ScaleImage")
+      .Input("OffsetImage")
+      .Output("OutputImage")
+      .Finalize(net.NewOperatorDef());
+
+  // Run on opencl
+  net.RunOp(DeviceType::OPENCL);
+  net.Sync();
+
+  ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
+                                           kernels::BufferType::IN_OUT);
+  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
+}
+
+TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
+  srand(time(NULL));
+
+  // generate random input
+  index_t batch = 1 + rand() % 10;
+  index_t channels = 3 + rand() % 50;
+  index_t height = 64;
+  index_t width = 64;
+
+  // Construct graph
+  OpsTestNet net;
+  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::OPENCL, float>(
+      "Input", {batch, height, width, channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
+
+  // run cpu
+  net.RunOp();
+
+  // Check
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Run on opencl
+  BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage",
+                                          kernels::BufferType::IN_OUT);
+  BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage",
+                                          kernels::BufferType::ARGUMENT);
+  BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage",
+                                          kernels::BufferType::ARGUMENT);
+
+  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+      .Input("InputImage")
+      .Input("ScaleImage")
+      .Input("OffsetImage")
+      .Output("OutputImage")
+      .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
+      .Finalize(net.NewOperatorDef());
+
+  // Run on opencl
+  net.RunOp(DeviceType::OPENCL);
+  net.Sync();
+
+  ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
+                                           kernels::BufferType::IN_OUT);
+  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
+}
+
+TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
+  srand(time(NULL));
+
+  // generate random input
+  index_t batch = 1 + rand() % 10;
+  index_t channels = 3 + rand() % 50;
+  index_t height = 103;
+  index_t width = 113;
+
+  // Construct graph
+  OpsTestNet net;
+  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::OPENCL, float>(
+      "Input", {batch, height, width, channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
+
+  // run cpu
+  net.RunOp();
+
+  // Check
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Run on opencl
+  BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage",
+                                           kernels::BufferType::IN_OUT);
+  BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage",
+                                           kernels::BufferType::ARGUMENT);
+  BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage",
+                                           kernels::BufferType::ARGUMENT);
+
+  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+      .Input("InputImage")
+      .Input("ScaleImage")
+      .Input("OffsetImage")
+      .Output("OutputImage")
+      .Finalize(net.NewOperatorDef());
+
+  // Run on opencl
+  net.RunOp(DeviceType::OPENCL);
+
+  ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
+                                           kernels::BufferType::IN_OUT);
+  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
+}
+
+TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
+  srand(time(NULL));
+
+  // generate random input
+  index_t batch = 1 + rand() % 10;
+  index_t channels = 3 + rand() % 50;
+  index_t height = 103;
+  index_t width = 113;
+
+  // Construct graph
+  OpsTestNet net;
+  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::OPENCL, float>(
+      "Input", {batch, height, width, channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
+
+  // run cpu
+  net.RunOp();
+
+  // Check
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Run on opencl
+  BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage",
+                                          kernels::BufferType::IN_OUT);
+  BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage",
+                                          kernels::BufferType::ARGUMENT);
+  BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage",
+                                          kernels::BufferType::ARGUMENT);
+
+  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
+      .Input("InputImage")
+      .Input("ScaleImage")
+      .Input("OffsetImage")
+      .Output("OutputImage")
+      .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
+      .Finalize(net.NewOperatorDef());
+
+  // Run on opencl
+  net.RunOp(DeviceType::OPENCL);
+
+  ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
+                                           kernels::BufferType::IN_OUT);
+  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
+}
+}
--- a/mace/python/tools/tf_converter.py
+++ b/mace/python/tools/tf_converter.py
@@ -39,6 +39,13 @@ def main(unused_args):
      f.write(str(output_graph_def))
  print("Model conversion is completed.")

+def str2bool(v):
+  if v.lower() in ('yes', 'true', 't', 'y', '1'):
+    return True
+  elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+    return False
+  else:
+    raise argparse.ArgumentTypeError('Boolean value expected.')

 def parse_args():
  """Parses command line arguments."""
@@ -91,7 +98,9 @@ def parse_args():
    help="template path")
  parser.add_argument(
    "--confuse",
-    type=bool,
+    type=str2bool,
+    nargs='?',
+    const=False,
    default=False,
    help="confuse model names")
  parser.add_argument(

--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -228,10 +228,19 @@ class TFConverter(object):
    arg = op_def.arg.add()
    arg.name = 'T'
    arg.i = self.dt
+    data_format_arg = op_def.arg.add()
+    data_format_arg.name = 'data_format'
+    data_format_arg.s = 'NHWC'
    op_def.name = op.name
    op_def.type = 'FoldedBatchNorm'
+
    gamma_tensor = get_input_tensor(op, 1)
-    gamma_value = gamma_tensor.eval().astype(np.float32)
+    for i in range(1, 5):
+      input_tensor = get_input_tensor(op, i)
+      assert input_tensor.shape == gamma_tensor.shape
+      self.unused_tensor.add(input_tensor.name)
+
+    gamma_value = get_input_tensor(op, 1).eval().astype(np.float32)
    beta_value = get_input_tensor(op, 2).eval().astype(np.float32)
    mean_value = get_input_tensor(op, 3).eval().astype(np.float32)
    var_value = get_input_tensor(op, 4).eval().astype(np.float32)
@@ -241,9 +250,8 @@ class TFConverter(object):
      (1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
      gamma_value)
    offset_value = (-mean_value * scale_value) + beta_value
-    name_prefix = op.inputs[1].name
-    idx = name_prefix.rfind('/')
-    name_prefix = op.inputs[1].name[:idx] + '/'
+    idx = gamma_tensor.name.rfind('/')
+    name_prefix = gamma_tensor.name[:idx] + '/'
    input_names = [name_prefix+'scale:0', name_prefix+'offset:0']
    self.add_tensor(input_names[0], gamma_value.shape,
      gamma_tensor.dtype, scale_value)
@@ -259,8 +267,8 @@ class TFConverter(object):
      op_def.input.extend([input.name for input in input_names])

    self.resolved_ops[op.name] = 1
-
    final_op = op
+
    if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Relu':
      relu_op = self.tf_graph[op.name][0]
      final_op = relu_op
@@ -272,9 +280,6 @@ class TFConverter(object):
    op_def.output.extend([final_op.outputs[0].name])
    self.add_output_shape(final_op.outputs, op_def)

-    data_format_arg = op_def.arg.add()
-    data_format_arg.name = 'data_format'
-    data_format_arg.s = 'NHWC'
    self.net_def.op.extend([op_def])

  def convert_batchnorm(self, op):