diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 341493fcadfad04d099e183b426a0368df45d288..e759d89d6ec7d187ed50b688c84c6bd4dd8f1ddb 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -75,6 +75,7 @@ extern void Register_Pooling(OperatorRegistry *op_registry);
 extern void Register_Relu(OperatorRegistry *op_registry);
 extern void Register_ResizeBilinear(OperatorRegistry *op_registry);
 extern void Register_SpaceToBatchND(OperatorRegistry *op_registry);
+extern void Register_Softmax(OperatorRegistry *op_registry);
 
 OperatorRegistry::OperatorRegistry() {
   Register_AddN(this);
@@ -93,6 +94,7 @@ OperatorRegistry::OperatorRegistry() {
   Register_Relu(this);
   Register_ResizeBilinear(this);
   Register_SpaceToBatchND(this);
+  Register_Softmax(this);
 }
 
 } // namespace mace
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index 59ebfd8f6e6f54906d27c1587df2911b36fb0845..391ce05aace61c99573e9e1ac221c00b4efd7ed3 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -11,10 +11,9 @@ namespace mace {
 namespace kernels {
 
-struct AddNFunctorBase {};
 
 template <DeviceType D, typename T>
-struct AddNFunctor : AddNFunctorBase {
+struct AddNFunctor {
   void operator()(const std::vector<const Tensor *> &input_tensors,
                   Tensor *output_tensor, StatsFuture *future) {
     output_tensor->ResizeLike(input_tensors[0]);
@@ -24,10 +23,6 @@ struct AddNFunctor : AddNFunctorBase {
     memset(output_ptr, 0, size * sizeof(T));
     int n = input_tensors.size();
     for (int i = 0; i < n; ++i) {
-      MACE_CHECK(input_tensors[i]->dim(0) == output_tensor->dim(0));
-      MACE_CHECK(input_tensors[i]->dim(1) == output_tensor->dim(1));
-      MACE_CHECK(input_tensors[i]->dim(2) == output_tensor->dim(2));
-      MACE_CHECK(input_tensors[i]->dim(3) == output_tensor->dim(3));
       Tensor::MappingGuard input_map(input_tensors[i]);
       const T *input_ptr = input_tensors[i]->data<T>();
       for (index_t j = 0; j < size; ++j) {
@@ -44,7 +39,7 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
     StatsFuture *future);
 
 template <typename T>
-struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
+struct AddNFunctor<DeviceType::OPENCL, T> {
   void operator()(const std::vector<const Tensor *> &input_tensors,
                   Tensor *output_tensor, StatsFuture *future);
 };
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index d138aca3bd04248510859d6bd4b4ffc29f83f11d..514f0d2adc8f790074de789f34b12a7f8baef131 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -17,6 +17,8 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
   if (input_tensors.size() > 4) {
     MACE_NOT_IMPLEMENTED;
   }
+  output->ResizeLike(input_tensors[0]);
+
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -36,8 +38,8 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
 
   uint32_t idx = 0;
   for (auto input : input_tensors) {
-    addn_kernel.setArg(idx++,
-                     *(static_cast<const cl::Image2D *>(input->buffer())));
+    addn_kernel.setArg(idx++,
+                       *(static_cast<const cl::Image2D *>(input->buffer())));
   }
   addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
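The AddN change above replaces the per-dimension checks in the kernel with a rank-and-element-count contract enforced in the op, and resizes the output from the first input. A minimal sketch of the equivalent validation (hypothetical helper name; `Tensor::dim_size()` and `Tensor::size()` as used elsewhere in this diff):

    // Sketch: the shape contract AddNOp::Run enforces after this change.
    // Inputs need only agree on rank and element count; the output is
    // resized from inputs[0] instead of checked dimension-by-dimension.
    inline bool AddNShapesCompatible(const std::vector<const Tensor *> &inputs) {
      for (size_t i = 1; i < inputs.size(); ++i) {
        if (inputs[0]->dim_size() != inputs[i]->dim_size()) return false;
        if (inputs[0]->size() != inputs[i]->size()) return false;
      }
      return true;
    }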
diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl
new file mode 100644
index 0000000000000000000000000000000000000000..0188d7679153c18b347e7691a3b30d9e350e6ef5
--- /dev/null
+++ b/mace/kernels/opencl/cl/softmax.cl
@@ -0,0 +1,77 @@
+#include <common.h>
+
+__kernel void softmax(__read_only image2d_t input,
+                      __private const int channels,
+                      __private const int remain_channels,
+                      __write_only image2d_t output) {
+  const int chan_blk_idx = get_global_id(0);
+  const int width_idx = get_global_id(1);
+  const int hb_idx = get_global_id(2);
+  const int chan_blks = get_global_size(0) - 1;
+  const int width = get_global_size(1);
+
+  int pos = width_idx;
+  DATA_TYPE max_value = -FLT_MAX;
+  DATA_TYPE sum = 0.0;
+  DATA_TYPE4 data;
+  for (short i = 0; i < chan_blks; ++i) {
+    data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx));
+    max_value = max(max_value, data.x);
+    max_value = max(max_value, data.y);
+    max_value = max(max_value, data.z);
+    max_value = max(max_value, data.w);
+    pos += width;
+  }
+  data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx));
+  switch(remain_channels) {
+    case 0:
+      max_value = max(max_value, data.w);
+    case 1:
+      max_value = max(max_value, data.z);
+    case 2:
+      max_value = max(max_value, data.y);
+    case 3:
+      max_value = max(max_value, data.x);
+  }
+
+  pos = width_idx;
+  for (short i = 0; i < chan_blks; ++i) {
+    data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx));
+    data = native_exp(data - max_value);
+    sum += data.x;
+    sum += data.y;
+    sum += data.z;
+    sum += data.w;
+    pos += width;
+  }
+  data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx));
+  data -= max_value;
+  switch(remain_channels) {
+    case 0:
+      sum += native_exp(data.w);
+    case 1:
+      sum += native_exp(data.z);
+    case 2:
+      sum += native_exp(data.y);
+    case 3:
+      sum += native_exp(data.x);
+  }
+
+  pos = mad24(chan_blk_idx, width, width_idx);
+  data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx));
+  data -= max_value;
+  const int exceeded = mul24(chan_blk_idx, 4) - channels;
+  switch(exceeded) {
+    case 1:
+      data.z = native_exp(data.z) / sum;
+    case 2:
+      data.y = native_exp(data.y) / sum;
+    case 3:
+      data.x = native_exp(data.x) / sum;
+      break;
+    default:
+      data = native_exp(data) / sum;
+  }
+
+  WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
+}
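The kernel computes a numerically stable softmax over the channel dimension in three passes per (width, height * batch) position: a running max, a sum of exponentials of max-shifted values, then normalization of the 4-channel block each work-item owns; the fall-through switches handle the `remain_channels` padded lanes of the last block. A plain C++ sketch of the same scheme (hypothetical standalone function, channels laid out contiguously; not part of the patch):

    #include <algorithm>
    #include <cfloat>
    #include <cmath>

    // Sketch: the three-pass scheme softmax.cl applies at each
    // (width, height * batch) position, flattened to one channel run.
    void SoftmaxOverChannels(const float *in, float *out, int channels) {
      float max_value = -FLT_MAX;
      for (int c = 0; c < channels; ++c) {   // pass 1: running max
        max_value = std::max(max_value, in[c]);
      }
      float sum = 0.f;
      for (int c = 0; c < channels; ++c) {   // pass 2: sum of exps
        sum += std::exp(in[c] - max_value);
      }
      for (int c = 0; c < channels; ++c) {   // pass 3: normalize
        out[c] = std::exp(in[c] - max_value) / sum;
      }
    }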
diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc
index c9d5a7f18a03f419c0a1686dfa98b070f6ff7597..7561b1fa5a59d9b5f351f0f5c1a4403d401b8c1c 100644
--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -35,15 +35,15 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     relu_kernel = runtime->BuildKernel("relu", "relu", built_options);
 
     uint32_t idx = 0;
-  relu_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
-  relu_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    relu_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+    relu_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
   } else {
     relu_kernel = runtime->BuildKernel("relu", "relux", built_options);
 
     uint32_t idx = 0;
-  relu_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+    relu_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
     relu_kernel.setArg(idx++, max_limit_);
-  relu_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    relu_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
   }
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..147e53d54af4b155917b1b8bd065e7766bc324f0
--- /dev/null
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -0,0 +1,106 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/kernels/softmax.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
+
+namespace mace {
+namespace kernels {
+
+template <typename T>
+void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
+                                                       Tensor *output,
+                                                       StatsFuture *future) {
+  const index_t batch = logits->dim(0);
+  const index_t height = logits->dim(1);
+  const index_t width = logits->dim(2);
+  const index_t channels = logits->dim(3);
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const int remain_channels = channel_blocks * 4 - channels;
+
+  auto runtime = OpenCLRuntime::Global();
+
+  std::set<std::string> built_options;
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+  cl::Kernel softmax_kernel =
+      runtime->BuildKernel("softmax", "softmax", built_options);
+
+  uint32_t idx = 0;
+  softmax_kernel.setArg(idx++,
+                        *(static_cast<const cl::Image2D *>(logits->buffer())));
+  softmax_kernel.setArg(idx++, static_cast<int>(channels));
+  softmax_kernel.setArg(idx++, remain_channels);
+  softmax_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch,
+                                     kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8},  // SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  cl::Event event;
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        softmax_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        nullptr, &event);
+
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "softmax_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  OpenCLProfilingTimer timer(&event);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func,
+                                                     &timer);
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        runtime->GetCallStats(event, stats);
+      }
+    };
+  }
+}
+
+template
+struct SoftmaxFunctor<DeviceType::OPENCL, float>;
+template
+struct SoftmaxFunctor<DeviceType::OPENCL, half>;
+}  // namespace kernels
+}  // namespace mace
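As a worked example of the dispatch arithmetic above (shape taken from the unaligned test case later in this diff), a hedged sketch in comments:

    // For logits of shape {1, 113, 107, 13} (NHWC):
    //   channel_blocks  = RoundUpDiv4(13)  -> 4
    //   remain_channels = 4 * 4 - 13       -> 3
    //   gws = {4, 107, 113}  // {channel blocks, width, height * batch}
    // Inside the kernel, chan_blks = gws[0] - 1 = 3, so each work-item
    // sweeps three full 4-channel blocks and handles the fourth, partially
    // filled block through the remain_channels fall-through switches.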
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
new file mode 100644
index 0000000000000000000000000000000000000000..c686c60cec60098200e7da0db31472321dbe41ae
--- /dev/null
+++ b/mace/kernels/softmax.h
@@ -0,0 +1,63 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_SOFTMAX_H_
+#define MACE_KERNELS_SOFTMAX_H_
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <numeric>
+#include <vector>
+
+#include "mace/core/future.h"
+#include "mace/core/tensor.h"
+#include "mace/core/public/mace.h"
+
+namespace mace {
+namespace kernels {
+
+template <DeviceType D, typename T>
+struct SoftmaxFunctor {
+  void operator()(const Tensor *logits,
+                  Tensor *output,
+                  StatsFuture *future) {
+    Tensor::MappingGuard logits_guard(logits);
+    Tensor::MappingGuard output_guard(output);
+    const T *logits_ptr = logits->data<T>();
+    T *output_ptr = output->mutable_data<T>();
+    auto &logits_shape = logits->shape();
+    const index_t batch_size =
+        std::accumulate(logits_shape.begin(), logits_shape.end() - 1,
+                        1, std::multiplies<index_t>());
+    const index_t num_classes = logits_shape.back();
+#pragma omp parallel for
+    for (index_t i = 0; i < batch_size; ++i) {
+      // Index from the loop variable instead of bumping shared pointers,
+      // which would race under OpenMP.
+      const T *in_ptr = logits_ptr + i * num_classes;
+      T *out_ptr = output_ptr + i * num_classes;
+      T max_value = in_ptr[0];
+      for (index_t c = 1; c < num_classes; ++c) {
+        max_value = std::max(max_value, in_ptr[c]);
+      }
+      // TODO: check overflow?
+      T sum = 0;
+      std::vector<T> exp_data(num_classes);
+      for (index_t c = 0; c < num_classes; ++c) {
+        exp_data[c] = std::exp(in_ptr[c] - max_value);
+        sum += exp_data[c];
+      }
+      for (index_t c = 0; c < num_classes; ++c) {
+        out_ptr[c] = exp_data[c] / sum;
+      }
+    }
+  }
+};
+
+template <typename T>
+struct SoftmaxFunctor<DeviceType::OPENCL, T> {
+  void operator()(const Tensor *logits,
+                  Tensor *output,
+                  StatsFuture *future);
+};
+
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_SOFTMAX_H_
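The CPU functor above can be checked by hand against the expected values in softmax_test.cc later in this diff. A minimal standalone sketch (not part of the patch) reproducing the second row of the Simple() test:

    #include <cmath>
    #include <cstdio>

    // softmax({1, 2, 3, 4}) with max subtraction:
    //   exp(-3) + exp(-2) + exp(-1) + exp(0)
    //     ~= 0.049787 + 0.135335 + 0.367879 + 1.0 = 1.553002,
    // giving ~{0.0321, 0.0871, 0.2369, 0.6439}, which matches the test's
    // expected tensor {0.0320586, 0.08714432, 0.23688282, 0.64391426}.
    int main() {
      const float logits[4] = {1.f, 2.f, 3.f, 4.f};
      float sum = 0.f;
      for (float v : logits) sum += std::exp(v - 4.f);
      for (float v : logits) std::printf("%f\n", std::exp(v - 4.f) / sum);
      return 0;
    }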
diff --git a/mace/ops/addn.h b/mace/ops/addn.h
index fc984c3bd3efe27afe96418b84817682c1b6f61f..7bff94344891b6a48dc21fe9e4e1bf3a8d3f55fb 100644
--- a/mace/ops/addn.h
+++ b/mace/ops/addn.h
@@ -17,11 +17,14 @@ class AddNOp : public Operator<D, T> {
       : Operator<D, T>(operator_def, ws) {}
 
   bool Run(StatsFuture *future) override {
-    Tensor *output_tensor = this->outputs_[0];
+    Tensor *output_tensor = this->Output(0);
     int n = this->inputs_.size();
     vector<const Tensor *> inputs(n, nullptr);
-    for (int i = 0; i < n; ++i) {
-      inputs[i] = this->inputs_[i];
+    inputs[0] = this->Input(0);
+    for (int i = 1; i < n; ++i) {
+      inputs[i] = this->Input(i);
+      MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size());
+      MACE_CHECK(inputs[0]->size() == inputs[i]->size());
     }
 
     functor_(inputs, output_tensor, future);
diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac48b3d85901c7b99ca3abb9c0d185e4e5da2349
--- /dev/null
+++ b/mace/ops/softmax.cc
@@ -0,0 +1,29 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/softmax.h"
+
+namespace mace {
+
+void Register_Softmax(OperatorRegistry *op_registry) {
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
+                                     .Device(DeviceType::CPU)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    SoftmaxOp<DeviceType::CPU, float>);
+
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
+                                     .Device(DeviceType::OPENCL)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    SoftmaxOp<DeviceType::OPENCL, float>);
+
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
+                                     .Device(DeviceType::OPENCL)
+                                     .TypeConstraint<half>("T")
+                                     .Build(),
+                    SoftmaxOp<DeviceType::OPENCL, half>);
+}
+
+}  // namespace mace
diff --git a/mace/ops/softmax.h b/mace/ops/softmax.h
new file mode 100644
index 0000000000000000000000000000000000000000..26914c6932396b2ed28a6243df66bced6805ddf1
--- /dev/null
+++ b/mace/ops/softmax.h
@@ -0,0 +1,40 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_SOFTMAX_H_
+#define MACE_SOFTMAX_H_
+
+#include "mace/core/operator.h"
+#include "mace/kernels/softmax.h"
+
+namespace mace {
+
+template <DeviceType D, class T>
+class SoftmaxOp : public Operator<D, T> {
+ public:
+  SoftmaxOp(const OperatorDef &operator_def, Workspace *ws)
+      : Operator<D, T>(operator_def, ws) {}
+
+  bool Run(StatsFuture *future) override {
+    const Tensor *logits = this->Input(LOGITS);
+
+    Tensor *output = this->Output(OUTPUT);
+    output->ResizeLike(logits);
+
+    functor_(logits, output, future);
+    return true;
+  }
+
+ private:
+  kernels::SoftmaxFunctor<D, T> functor_;
+
+ protected:
+  OP_INPUT_TAGS(LOGITS);
+  OP_OUTPUT_TAGS(OUTPUT);
+};
+
+}  // namespace mace
+
+#endif  // MACE_SOFTMAX_H_
diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc
new file mode 100644
index 0000000000000000000000000000000000000000..030af807ca9d186551752116763c3ee7598ab9e6
--- /dev/null
+++ b/mace/ops/softmax_benchmark.cc
@@ -0,0 +1,67 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include <string>
+#include "mace/core/operator.h"
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+template <DeviceType D, typename T>
+static void SoftmaxBenchmark(
+    int iters, int batch, int channels, int height, int width) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+
+  // Add input data
+  net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage",
+                        kernels::BufferType::IN_OUT);
+
+    OpDefBuilder("Softmax", "SoftmaxBM")
+        .Input("InputImage")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("Softmax", "SoftmaxBM")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+  }
+
+  // Warm-up
+  for (int i = 0; i < 5; ++i) {
+    net.RunOp(D);
+  }
+  net.Sync();
+
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.RunOp(D);
+  }
+  net.Sync();
+}
+
+#define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                           \
+  static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(          \
+      int iters) {                                                           \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
+    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
+    SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                       \
+  }                                                                          \
+  BENCHMARK(BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+
+#define BM_SOFTMAX(N, C, H, W, TYPE)        \
+  BM_SOFTMAX_MACRO(N, C, H, W, TYPE, CPU);  \
+  BM_SOFTMAX_MACRO(N, C, H, W, TYPE, OPENCL);
+
+BM_SOFTMAX(1, 1, 512, 512, float);
+BM_SOFTMAX(1, 3, 128, 128, float);
+BM_SOFTMAX(1, 3, 512, 512, float);
+BM_SOFTMAX(1, 32, 112, 112, float);
+BM_SOFTMAX(1, 64, 256, 256, float);
+}  // namespace mace
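For clarity, each BM_SOFTMAX line expands through BM_SOFTMAX_MACRO into one registered benchmark per device. A sketch of the mechanical expansion for the first invocation on the CPU device (the OPENCL twin is identical apart from the device token):

    // Expansion sketch of BM_SOFTMAX(1, 1, 512, 512, float), CPU variant.
    static void BM_SOFTMAX_1_1_512_512_float_CPU(int iters) {
      const int64_t tot = static_cast<int64_t>(iters) * 1 * 1 * 512 * 512;
      mace::testing::ItemsProcessed(tot);
      mace::testing::BytesProcessed(tot * (sizeof(float)));
      SoftmaxBenchmark<CPU, float>(iters, 1, 1, 512, 512);
    }
    BENCHMARK(BM_SOFTMAX_1_1_512_512_float_CPU);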
diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4f321a6d86ec0c7b7db7633a7ad6d43ada0d916
--- /dev/null
+++ b/mace/ops/softmax_test.cc
@@ -0,0 +1,107 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/operator.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+
+class SoftmaxOpTest : public OpsTestBase {};
+
+template <DeviceType D>
+void Simple() {
+  // Construct graph
+  OpsTestNet net;
+  // Add input data
+  net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
+                                  {1, 1, 1, 1, 1, 2, 3, 4});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage",
+                            kernels::BufferType::IN_OUT);
+
+    OpDefBuilder("Softmax", "SoftmaxTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output",
+                            kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Softmax", "SoftmaxTest")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+  }
+
+  auto expected = CreateTensor<float>(
+      {1, 1, 2, 4},
+      {0.25, 0.25, 0.25, 0.25,
+       0.0320586, 0.08714432, 0.23688282, 0.64391426});
+
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-7);
+}
+
+TEST_F(SoftmaxOpTest, CPUSimple) {
+  Simple<DeviceType::CPU>();
+}
+TEST_F(SoftmaxOpTest, OPENCLSimple) {
+  Simple<DeviceType::OPENCL>();
+}
+
+template <DeviceType D>
+void Complex(const std::vector<index_t> &logits_shape) {
+  // Construct graph
+  OpsTestNet net;
+  // Add input data
+  net.AddRandomInput<D, float>("Input", logits_shape);
+
+  OpDefBuilder("Softmax", "SoftmaxTest")
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Run on cpu
+  net.RunOp();
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  BufferToImage<D, float>(net, "Input", "InputImage",
+                          kernels::BufferType::IN_OUT);
+
+  OpDefBuilder("Softmax", "SoftmaxTest")
+      .Input("InputImage")
+      .Output("OutputImage")
+      .Finalize(net.NewOperatorDef());
+
+  // Run on gpu
+  net.RunOp(D);
+
+  // Transfer output
+  ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
+                          kernels::BufferType::IN_OUT);
+
+  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
+}
+
+TEST_F(SoftmaxOpTest, OPENCLAligned) {
+  Complex<DeviceType::OPENCL>({1, 256, 256, 3});
+  Complex<DeviceType::OPENCL>({1, 128, 128, 16});
+}
+
+TEST_F(SoftmaxOpTest, OPENCLMulBatchAligned) {
+  Complex<DeviceType::OPENCL>({5, 64, 64, 3});
+  Complex<DeviceType::OPENCL>({8, 128, 128, 8});
+}
+
+TEST_F(SoftmaxOpTest, OPENCLUnAligned) {
+  Complex<DeviceType::OPENCL>({1, 113, 107, 13});
+  Complex<DeviceType::OPENCL>({5, 211, 107, 1});
+}
+
+}  // namespace mace
diff --git a/mace/proto/BUILD b/mace/proto/BUILD
index d46aa81266f71c5847080cf2b59369b3305bb467..5222b06bda6e1681b15ac7f60317376c5d34fa3d 100644
--- a/mace/proto/BUILD
+++ b/mace/proto/BUILD
@@ -18,3 +18,12 @@ py_proto_library(
     srcs_version = "PY2AND3",
     deps = ["@com_google_protobuf//:protobuf_python"],
 )
+
+py_proto_library(
+    name = "caffe_py",
+    srcs = ["caffe.proto"],
+    default_runtime = "@com_google_protobuf//:protobuf_python",
+    protoc = "@com_google_protobuf//:protoc",
+    srcs_version = "PY2AND3",
+    deps = ["@com_google_protobuf//:protobuf_python"],
+)
diff --git a/mace/proto/caffe.proto b/mace/proto/caffe.proto
new file mode 100644
index 0000000000000000000000000000000000000000..f1f99e5eba428ab9d7159e49b7ff6256323ea719
--- /dev/null
+++ b/mace/proto/caffe.proto
@@ -0,0 +1,1459 @@
+syntax = "proto2";
+
+package caffe;
+
+// Specifies the shape (dimensions) of a Blob.
+message BlobShape {
+  repeated int64 dim = 1 [packed = true];
+}
+
+message BlobProto {
+  optional BlobShape shape = 7;
+  repeated float data = 5 [packed = true];
+  repeated float diff = 6 [packed = true];
+  repeated double double_data = 8 [packed = true];
+  repeated double double_diff = 9 [packed = true];
+
+  // 4D dimensions -- deprecated. Use "shape" instead.
+ optional int32 num = 1 [default = 0]; + optional int32 channels = 2 [default = 0]; + optional int32 height = 3 [default = 0]; + optional int32 width = 4 [default = 0]; +} + +// The BlobProtoVector is simply a way to pass multiple blobproto instances +// around. +message BlobProtoVector { + repeated BlobProto blobs = 1; +} + +message Datum { + optional int32 channels = 1; + optional int32 height = 2; + optional int32 width = 3; + // the actual image data, in bytes + optional bytes data = 4; + optional int32 label = 5; + // Optionally, the datum could also hold float data. + repeated float float_data = 6; + // If true data contains an encoded image that need to be decoded + optional bool encoded = 7 [default = false]; +} + +message FillerParameter { + // The filler type. + optional string type = 1 [default = 'constant']; + optional float value = 2 [default = 0]; // the value in constant filler + optional float min = 3 [default = 0]; // the min value in uniform filler + optional float max = 4 [default = 1]; // the max value in uniform filler + optional float mean = 5 [default = 0]; // the mean value in Gaussian filler + optional float std = 6 [default = 1]; // the std value in Gaussian filler + // The expected number of non-zero output weights for a given input in + // Gaussian filler -- the default -1 means don't perform sparsification. + optional int32 sparse = 7 [default = -1]; + // Normalize the filler variance by fan_in, fan_out, or their average. + // Applies to 'xavier' and 'msra' fillers. + enum VarianceNorm { + FAN_IN = 0; + FAN_OUT = 1; + AVERAGE = 2; + } + optional VarianceNorm variance_norm = 8 [default = FAN_IN]; +} + +message NetParameter { + optional string name = 1; // consider giving the network a name + // DEPRECATED. See InputParameter. The input blobs to the network. + repeated string input = 3; + // DEPRECATED. See InputParameter. The shape of the input blobs. + repeated BlobShape input_shape = 8; + + // 4D input dimensions -- deprecated. Use "input_shape" instead. + // If specified, for each input blob there should be four + // values specifying the num, channels, height and width of the input blob. + // Thus, there should be a total of (4 * #input) numbers. + repeated int32 input_dim = 4; + + // Whether the network will force every layer to carry out backward operation. + // If set False, then whether to carry out backward is determined + // automatically according to the net structure and learning rates. + optional bool force_backward = 5 [default = false]; + // The current "state" of the network, including the phase, level, and stage. + // Some layers may be included/excluded depending on this state and the states + // specified in the layers' include and exclude fields. + optional NetState state = 6; + + // Print debugging information about results while running Net::Forward, + // Net::Backward, and Net::Update. + optional bool debug_info = 7 [default = false]; + + // The layers that make up the net. Each of their configurations, including + // connectivity and behavior, is specified as a LayerParameter. + repeated LayerParameter layer = 100; // ID 100 so layers are printed last. + + // DEPRECATED: use 'layer' instead. + repeated V1LayerParameter layers = 2; +} + +// NOTE +// Update the next available ID when you add a new SolverParameter field. 
+// +// SolverParameter next available ID: 41 (last added: type) +message SolverParameter { + ////////////////////////////////////////////////////////////////////////////// + // Specifying the train and test networks + // + // Exactly one train net must be specified using one of the following fields: + // train_net_param, train_net, net_param, net + // One or more test nets may be specified using any of the following fields: + // test_net_param, test_net, net_param, net + // If more than one test net field is specified (e.g., both net and + // test_net are specified), they will be evaluated in the field order given + // above: (1) test_net_param, (2) test_net, (3) net_param/net. + // A test_iter must be specified for each test_net. + // A test_level and/or a test_stage may also be specified for each test_net. + ////////////////////////////////////////////////////////////////////////////// + + // Proto filename for the train net, possibly combined with one or more + // test nets. + optional string net = 24; + // Inline train net param, possibly combined with one or more test nets. + optional NetParameter net_param = 25; + + optional string train_net = 1; // Proto filename for the train net. + repeated string test_net = 2; // Proto filenames for the test nets. + optional NetParameter train_net_param = 21; // Inline train net params. + repeated NetParameter test_net_param = 22; // Inline test net params. + + // The states for the train/test nets. Must be unspecified or + // specified once per net. + // + // By default, all states will have solver = true; + // train_state will have phase = TRAIN, + // and all test_state's will have phase = TEST. + // Other defaults are set according to the NetState defaults. + optional NetState train_state = 26; + repeated NetState test_state = 27; + + // The number of iterations for each test net. + repeated int32 test_iter = 3; + + // The number of iterations between two testing phases. + optional int32 test_interval = 4 [default = 0]; + optional bool test_compute_loss = 19 [default = false]; + // If true, run an initial test pass before the first iteration, + // ensuring memory availability and printing the starting value of the loss. + optional bool test_initialization = 32 [default = true]; + optional float base_lr = 5; // The base learning rate + // the number of iterations between displaying info. If display = 0, no info + // will be displayed. + optional int32 display = 6; + // Display the loss averaged over the last average_loss iterations + optional int32 average_loss = 33 [default = 1]; + optional int32 max_iter = 7; // the maximum number of iterations + // accumulate gradients over `iter_size` x `batch_size` instances + optional int32 iter_size = 36 [default = 1]; + + // The learning rate decay policy. The currently implemented learning rate + // policies are as follows: + // - fixed: always return base_lr. + // - step: return base_lr * gamma ^ (floor(iter / step)) + // - exp: return base_lr * gamma ^ iter + // - inv: return base_lr * (1 + gamma * iter) ^ (- power) + // - multistep: similar to step but it allows non uniform steps defined by + // stepvalue + // - poly: the effective learning rate follows a polynomial decay, to be + // zero by the max_iter. 
return base_lr (1 - iter/max_iter) ^ (power) + // - sigmoid: the effective learning rate follows a sigmod decay + // return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) + // + // where base_lr, max_iter, gamma, step, stepvalue and power are defined + // in the solver parameter protocol buffer, and iter is the current iteration. + optional string lr_policy = 8; + optional float gamma = 9; // The parameter to compute the learning rate. + optional float power = 10; // The parameter to compute the learning rate. + optional float momentum = 11; // The momentum value. + optional float weight_decay = 12; // The weight decay. + // regularization types supported: L1 and L2 + // controlled by weight_decay + optional string regularization_type = 29 [default = "L2"]; + // the stepsize for learning rate policy "step" + optional int32 stepsize = 13; + // the stepsize for learning rate policy "multistep" + repeated int32 stepvalue = 34; + + // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm, + // whenever their actual L2 norm is larger. + optional float clip_gradients = 35 [default = -1]; + + optional int32 snapshot = 14 [default = 0]; // The snapshot interval + optional string snapshot_prefix = 15; // The prefix for the snapshot. + // whether to snapshot diff in the results or not. Snapshotting diff will help + // debugging but the final protocol buffer size will be much larger. + optional bool snapshot_diff = 16 [default = false]; + enum SnapshotFormat { + HDF5 = 0; + BINARYPROTO = 1; + } + optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO]; + // the mode solver will use: 0 for CPU and 1 for GPU. Use GPU in default. + enum SolverMode { + CPU = 0; + GPU = 1; + } + optional SolverMode solver_mode = 17 [default = GPU]; + // the device_id will that be used in GPU mode. Use device_id = 0 in default. + optional int32 device_id = 18 [default = 0]; + // If non-negative, the seed with which the Solver will initialize the Caffe + // random number generator -- useful for reproducible results. Otherwise, + // (and by default) initialize using a seed derived from the system clock. + optional int64 random_seed = 20 [default = -1]; + + // type of the solver + optional string type = 40 [default = "SGD"]; + + // numerical stability for RMSProp, AdaGrad and AdaDelta and Adam + optional float delta = 31 [default = 1e-8]; + // parameters for the Adam solver + optional float momentum2 = 39 [default = 0.999]; + + // RMSProp decay value + // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) + optional float rms_decay = 38; + + // If true, print information about the state of the net that may help with + // debugging learning problems. + optional bool debug_info = 23 [default = false]; + + // If false, don't save a snapshot after training finishes. + optional bool snapshot_after_train = 28 [default = true]; + + // DEPRECATED: old solver enum types, use string instead + enum SolverType { + SGD = 0; + NESTEROV = 1; + ADAGRAD = 2; + RMSPROP = 3; + ADADELTA = 4; + ADAM = 5; + } + // DEPRECATED: use type instead of solver_type + optional SolverType solver_type = 30 [default = SGD]; +} + +// A message that stores the solver snapshots +message SolverState { + optional int32 iter = 1; // The current iteration + optional string learned_net = 2; // The file that stores the learned net. 
+ repeated BlobProto history = 3; // The history for sgd solvers + optional int32 current_step = 4 [default = 0]; // The current step for learning rate +} + +enum Phase { + TRAIN = 0; + TEST = 1; +} + +message NetState { + optional Phase phase = 1 [default = TEST]; + optional int32 level = 2 [default = 0]; + repeated string stage = 3; +} + +message NetStateRule { + // Set phase to require the NetState have a particular phase (TRAIN or TEST) + // to meet this rule. + optional Phase phase = 1; + + // Set the minimum and/or maximum levels in which the layer should be used. + // Leave undefined to meet the rule regardless of level. + optional int32 min_level = 2; + optional int32 max_level = 3; + + // Customizable sets of stages to include or exclude. + // The net must have ALL of the specified stages and NONE of the specified + // "not_stage"s to meet the rule. + // (Use multiple NetStateRules to specify conjunctions of stages.) + repeated string stage = 4; + repeated string not_stage = 5; +} + +// Specifies training parameters (multipliers on global learning constants, +// and the name and other settings used for weight sharing). +message ParamSpec { + // The names of the parameter blobs -- useful for sharing parameters among + // layers, but never required otherwise. To share a parameter between two + // layers, give it a (non-empty) name. + optional string name = 1; + + // Whether to require shared weights to have the same shape, or just the same + // count -- defaults to STRICT if unspecified. + optional DimCheckMode share_mode = 2; + enum DimCheckMode { + // STRICT (default) requires that num, channels, height, width each match. + STRICT = 0; + // PERMISSIVE requires only the count (num*channels*height*width) to match. + PERMISSIVE = 1; + } + + // The multiplier on the global learning rate for this parameter. + optional float lr_mult = 3 [default = 1.0]; + + // The multiplier on the global weight decay for this parameter. + optional float decay_mult = 4 [default = 1.0]; +} + +// NOTE +// Update the next available ID when you add a new LayerParameter field. +// +// LayerParameter next available layer-specific ID: 147 (last added: recurrent_param) +message LayerParameter { + optional string name = 1; // the layer name + optional string type = 2; // the layer type + repeated string bottom = 3; // the name of each bottom blob + repeated string top = 4; // the name of each top blob + + // The train / test phase for computation. + optional Phase phase = 10; + + // The amount of weight to assign each top blob in the objective. + // Each layer assigns a default value, usually of either 0 or 1, + // to each top blob. + repeated float loss_weight = 5; + + // Specifies training parameters (multipliers on global learning constants, + // and the name and other settings used for weight sharing). + repeated ParamSpec param = 6; + + // The blobs containing the numeric parameters of the layer. + repeated BlobProto blobs = 7; + + // Specifies whether to backpropagate to each bottom. If unspecified, + // Caffe will automatically infer whether each input needs backpropagation + // to compute parameter gradients. If set to true for some inputs, + // backpropagation to those inputs is forced; if set false for some inputs, + // backpropagation to those inputs is skipped. + // + // The size must be either 0 or equal to the number of bottoms. + repeated bool propagate_down = 11; + + // Rules controlling whether and when a layer is included in the network, + // based on the current NetState. 
You may specify a non-zero number of rules + // to include OR exclude, but not both. If no include or exclude rules are + // specified, the layer is always included. If the current NetState meets + // ANY (i.e., one or more) of the specified rules, the layer is + // included/excluded. + repeated NetStateRule include = 8; + repeated NetStateRule exclude = 9; + + // Parameters for data pre-processing. + optional TransformationParameter transform_param = 100; + + // Parameters shared by loss layers. + optional LossParameter loss_param = 101; + + // Layer type-specific parameters. + // + // Note: certain layers may have more than one computational engine + // for their implementation. These layers include an Engine type and + // engine parameter for selecting the implementation. + // The default for the engine is set by the ENGINE switch at compile-time. + optional AccuracyParameter accuracy_param = 102; + optional ArgMaxParameter argmax_param = 103; + optional BatchNormParameter batch_norm_param = 139; + optional BiasParameter bias_param = 141; + optional ConcatParameter concat_param = 104; + optional ContrastiveLossParameter contrastive_loss_param = 105; + optional ConvolutionParameter convolution_param = 106; + optional CropParameter crop_param = 144; + optional DataParameter data_param = 107; + optional DropoutParameter dropout_param = 108; + optional DummyDataParameter dummy_data_param = 109; + optional EltwiseParameter eltwise_param = 110; + optional ELUParameter elu_param = 140; + optional EmbedParameter embed_param = 137; + optional ExpParameter exp_param = 111; + optional FlattenParameter flatten_param = 135; + optional HDF5DataParameter hdf5_data_param = 112; + optional HDF5OutputParameter hdf5_output_param = 113; + optional HingeLossParameter hinge_loss_param = 114; + optional ImageDataParameter image_data_param = 115; + optional InfogainLossParameter infogain_loss_param = 116; + optional InnerProductParameter inner_product_param = 117; + optional InputParameter input_param = 143; + optional LogParameter log_param = 134; + optional LRNParameter lrn_param = 118; + optional MemoryDataParameter memory_data_param = 119; + optional MVNParameter mvn_param = 120; + optional ParameterParameter parameter_param = 145; + optional PoolingParameter pooling_param = 121; + optional PowerParameter power_param = 122; + optional PReLUParameter prelu_param = 131; + optional PSROIPoolingParameter psroi_pooling_param = 149; + optional PSROIAlignParameter psroi_align_param = 1490; + optional PythonParameter python_param = 130; + optional RecurrentParameter recurrent_param = 146; + optional ReductionParameter reduction_param = 136; + optional ReLUParameter relu_param = 123; + optional ReshapeParameter reshape_param = 133; + optional ROIPoolingParameter roi_pooling_param = 8266711; + optional ScaleParameter scale_param = 142; + optional ProposalParameter proposal_param = 8266713; + optional SigmoidParameter sigmoid_param = 124; + optional SoftmaxParameter softmax_param = 125; + optional SPPParameter spp_param = 132; + optional SliceParameter slice_param = 126; + optional TanHParameter tanh_param = 127; + optional ThresholdParameter threshold_param = 128; + optional TileParameter tile_param = 138; + optional WindowDataParameter window_data_param = 129; + + optional NNPACKConvolutionParameter nnpack_convolution_param = 204; +} + +// Message that stores parameters used to apply transformation +// to the data layer's data +message TransformationParameter { + // For data pre-processing, we can do simple scaling 
and subtracting the + // data mean, if provided. Note that the mean subtraction is always carried + // out before scaling. + optional float scale = 1 [default = 1]; + // Specify if we want to randomly mirror data. + optional bool mirror = 2 [default = false]; + // Specify if we would like to randomly crop an image. + optional uint32 crop_size = 3 [default = 0]; + // mean_file and mean_value cannot be specified at the same time + optional string mean_file = 4; + // if specified can be repeated once (would substract it from all the channels) + // or can be repeated the same number of times as channels + // (would subtract them from the corresponding channel) + repeated float mean_value = 5; + // Force the decoded image to have 3 color channels. + optional bool force_color = 6 [default = false]; + // Force the decoded image to have 1 color channels. + optional bool force_gray = 7 [default = false]; +} + +// Message that stores parameters shared by loss layers +message LossParameter { + // If specified, ignore instances with the given label. + optional int32 ignore_label = 1; + // How to normalize the loss for loss layers that aggregate across batches, + // spatial dimensions, or other dimensions. Currently only implemented in + // SoftmaxWithLoss layer. + enum NormalizationMode { + // Divide by the number of examples in the batch times spatial dimensions. + // Outputs that receive the ignore label will NOT be ignored in computing + // the normalization factor. + FULL = 0; + // Divide by the total number of output locations that do not take the + // ignore_label. If ignore_label is not set, this behaves like FULL. + VALID = 1; + // Divide by the batch size. + BATCH_SIZE = 2; + // Do not normalize the loss. + NONE = 3; + } + optional NormalizationMode normalization = 3 [default = VALID]; + // Deprecated. Ignored if normalization is specified. If normalization + // is not specified, then setting this to false will be equivalent to + // normalization = BATCH_SIZE to be consistent with previous behavior. + optional bool normalize = 2; +} + +// Messages that store parameters used by individual layer types follow, in +// alphabetical order. + +message AccuracyParameter { + // When computing accuracy, count as correct by comparing the true label to + // the top k scoring classes. By default, only compare to the top scoring + // class (i.e. argmax). + optional uint32 top_k = 1 [default = 1]; + + // The "label" axis of the prediction blob, whose argmax corresponds to the + // predicted label -- may be negative to index from the end (e.g., -1 for the + // last axis). For example, if axis == 1 and the predictions are + // (N x C x H x W), the label blob is expected to contain N*H*W ground truth + // labels with integer values in {0, 1, ..., C-1}. + optional int32 axis = 2 [default = 1]; + + // If specified, ignore instances with the given label. + optional int32 ignore_label = 3; +} + +message ArgMaxParameter { + // If true produce pairs (argmax, maxval) + optional bool out_max_val = 1 [default = false]; + optional uint32 top_k = 2 [default = 1]; + // The axis along which to maximise -- may be negative to index from the + // end (e.g., -1 for the last axis). + // By default ArgMaxLayer maximizes over the flattened trailing dimensions + // for each index of the first / num dimension. + optional int32 axis = 3; +} + +message ConcatParameter { + // The axis along which to concatenate -- may be negative to index from the + // end (e.g., -1 for the last axis). 
Other axes must have the + // same dimension for all the bottom blobs. + // By default, ConcatLayer concatenates blobs along the "channels" axis (1). + optional int32 axis = 2 [default = 1]; + + // DEPRECATED: alias for "axis" -- does not support negative indexing. + optional uint32 concat_dim = 1 [default = 1]; +} + +message BatchNormParameter { + // If false, accumulate global mean/variance values via a moving average. If + // true, use those accumulated values instead of computing mean/variance + // across the batch. + optional bool use_global_stats = 1; + // How much does the moving average decay each iteration? + optional float moving_average_fraction = 2 [default = .999]; + // Small value to add to the variance estimate so that we don't divide by + // zero. + optional float eps = 3 [default = 1e-5]; +} + +message BiasParameter { + // The first axis of bottom[0] (the first input Blob) along which to apply + // bottom[1] (the second input Blob). May be negative to index from the end + // (e.g., -1 for the last axis). + // + // For example, if bottom[0] is 4D with shape 100x3x40x60, the output + // top[0] will have the same shape, and bottom[1] may have any of the + // following shapes (for the given value of axis): + // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 + // (axis == 1 == -3) 3; 3x40; 3x40x60 + // (axis == 2 == -2) 40; 40x60 + // (axis == 3 == -1) 60 + // Furthermore, bottom[1] may have the empty shape (regardless of the value of + // "axis") -- a scalar bias. + optional int32 axis = 1 [default = 1]; + + // (num_axes is ignored unless just one bottom is given and the bias is + // a learned parameter of the layer. Otherwise, num_axes is determined by the + // number of axes by the second bottom.) + // The number of axes of the input (bottom[0]) covered by the bias + // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. + // Set num_axes := 0, to add a zero-axis Blob: a scalar. + optional int32 num_axes = 2 [default = 1]; + + // (filler is ignored unless just one bottom is given and the bias is + // a learned parameter of the layer.) + // The initialization for the learned bias parameter. + // Default is the zero (0) initialization, resulting in the BiasLayer + // initially performing the identity operation. + optional FillerParameter filler = 3; +} + +message ContrastiveLossParameter { + // margin for dissimilar pair + optional float margin = 1 [default = 1.0]; + // The first implementation of this cost did not exactly match the cost of + // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2. + // legacy_version = false (the default) uses (margin - d)^2 as proposed in the + // Hadsell paper. New models should probably use this version. + // legacy_version = true uses (margin - d^2). This is kept to support / + // reproduce existing models and results + optional bool legacy_version = 2 [default = false]; +} + +message ConvolutionParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + optional bool bias_term = 2 [default = true]; // whether to have bias terms + + // Pad, kernel size, and stride are all given as a single value for equal + // dimensions in all spatial dimensions, or once per spatial dimension. + repeated uint32 pad = 3; // The padding size; defaults to 0 + repeated uint32 kernel_size = 4; // The kernel size + repeated uint32 stride = 6; // The stride; defaults to 1 + // Factor used to dilate the kernel, (implicitly) zero-filling the resulting + // holes. 
(Kernel dilation is sometimes referred to by its use in the + // algorithme à trous from Holschneider et al. 1987.) + repeated uint32 dilation = 18; // The dilation; defaults to 1 + + // For 2D convolution only, the *_h and *_w versions may also be used to + // specify both spatial dimensions. + optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only) + optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only) + optional uint32 kernel_h = 11; // The kernel height (2D only) + optional uint32 kernel_w = 12; // The kernel width (2D only) + optional uint32 stride_h = 13; // The stride height (2D only) + optional uint32 stride_w = 14; // The stride width (2D only) + + optional uint32 group = 5 [default = 1]; // The group size for group conv + + optional FillerParameter weight_filler = 7; // The filler for the weight + optional FillerParameter bias_filler = 8; // The filler for the bias + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + NNPACK = 3; + } + optional Engine engine = 15 [default = DEFAULT]; + + // The axis to interpret as "channels" when performing convolution. + // Preceding dimensions are treated as independent inputs; + // succeeding dimensions are treated as "spatial". + // With (N, C, H, W) inputs, and axis == 1 (the default), we perform + // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for + // groups g>1) filters across the spatial axes (H, W) of the input. + // With (N, C, D, H, W) inputs, and axis == 1, we perform + // N independent 3D convolutions, sliding (C/g)-channels + // filters across the spatial axes (D, H, W) of the input. + optional int32 axis = 16 [default = 1]; + + // Whether to force use of the general ND convolution, even if a specific + // implementation for blobs of the appropriate number of spatial dimensions + // is available. (Currently, there is only a 2D-specific convolution + // implementation; for input blobs with num_axes != 2, this option is + // ignored and the ND implementation will be used.) + optional bool force_nd_im2col = 17 [default = false]; +} + +message CropParameter { + // To crop, elements of the first bottom are selected to fit the dimensions + // of the second, reference bottom. The crop is configured by + // - the crop `axis` to pick the dimensions for cropping + // - the crop `offset` to set the shift for all/each dimension + // to align the cropped bottom with the reference bottom. + // All dimensions up to but excluding `axis` are preserved, while + // the dimensions including and trailing `axis` are cropped. + // If only one `offset` is set, then all dimensions are offset by this amount. + // Otherwise, the number of offsets must equal the number of cropped axes to + // shift the crop in each dimension accordingly. + // Note: standard dimensions are N,C,H,W so the default is a spatial crop, + // and `axis` may be negative to index from the end (e.g., -1 for the last + // axis). + optional int32 axis = 1 [default = 2]; + repeated uint32 offset = 2; +} + +message DataParameter { + enum DB { + LEVELDB = 0; + LMDB = 1; + } + // Specify the data source. + optional string source = 1; + // Specify the batch size. + optional uint32 batch_size = 4; + // The rand_skip variable is for the data layer to skip a few data points + // to avoid all asynchronous sgd clients to start at the same point. The skip + // point would be set as rand_skip * rand(0,1). Note that rand_skip should not + // be larger than the number of keys in the database. + // DEPRECATED. 
Each solver accesses a different subset of the database. + optional uint32 rand_skip = 7 [default = 0]; + optional DB backend = 8 [default = LEVELDB]; + // DEPRECATED. See TransformationParameter. For data pre-processing, we can do + // simple scaling and subtracting the data mean, if provided. Note that the + // mean subtraction is always carried out before scaling. + optional float scale = 2 [default = 1]; + optional string mean_file = 3; + // DEPRECATED. See TransformationParameter. Specify if we would like to randomly + // crop an image. + optional uint32 crop_size = 5 [default = 0]; + // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror + // data. + optional bool mirror = 6 [default = false]; + // Force the encoded image to have 3 color channels + optional bool force_encoded_color = 9 [default = false]; + // Prefetch queue (Number of batches to prefetch to host memory, increase if + // data access bandwidth varies). + optional uint32 prefetch = 10 [default = 4]; +} + +message DropoutParameter { + optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio +} + +// DummyDataLayer fills any number of arbitrarily shaped blobs with random +// (or constant) data generated by "Fillers" (see "message FillerParameter"). +message DummyDataParameter { + // This layer produces N >= 1 top blobs. DummyDataParameter must specify 1 or N + // shape fields, and 0, 1 or N data_fillers. + // + // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used. + // If 1 data_filler is specified, it is applied to all top blobs. If N are + // specified, the ith is applied to the ith top blob. + repeated FillerParameter data_filler = 1; + repeated BlobShape shape = 6; + + // 4D dimensions -- deprecated. Use "shape" instead. + repeated uint32 num = 2; + repeated uint32 channels = 3; + repeated uint32 height = 4; + repeated uint32 width = 5; +} + +message EltwiseParameter { + enum EltwiseOp { + PROD = 0; + SUM = 1; + MAX = 2; + } + optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation + repeated float coeff = 2; // blob-wise coefficient for SUM operation + + // Whether to use an asymptotically slower (for >2 inputs) but stabler method + // of computing the gradient for the PROD operation. (No effect for SUM op.) + optional bool stable_prod_grad = 3 [default = true]; +} + +// Message that stores parameters used by ELULayer +message ELUParameter { + // Described in: + // Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate + // Deep Network Learning by Exponential Linear Units (ELUs). arXiv + optional float alpha = 1 [default = 1]; +} + +// Message that stores parameters used by EmbedLayer +message EmbedParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + // The input is given as integers to be interpreted as one-hot + // vector indices with dimension num_input. Hence num_input should be + // 1 greater than the maximum possible input value. + optional uint32 input_dim = 2; + + optional bool bias_term = 3 [default = true]; // Whether to use a bias term + optional FillerParameter weight_filler = 4; // The filler for the weight + optional FillerParameter bias_filler = 5; // The filler for the bias + +} + +// Message that stores parameters used by ExpLayer +message ExpParameter { + // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0. + // Or if base is set to the default (-1), base is set to e, + // so y = exp(shift + scale * x). 
+ optional float base = 1 [default = -1.0]; + optional float scale = 2 [default = 1.0]; + optional float shift = 3 [default = 0.0]; +} + +/// Message that stores parameters used by FlattenLayer +message FlattenParameter { + // The first axis to flatten: all preceding axes are retained in the output. + // May be negative to index from the end (e.g., -1 for the last axis). + optional int32 axis = 1 [default = 1]; + + // The last axis to flatten: all following axes are retained in the output. + // May be negative to index from the end (e.g., the default -1 for the last + // axis). + optional int32 end_axis = 2 [default = -1]; +} + +// Message that stores parameters used by HDF5DataLayer +message HDF5DataParameter { + // Specify the data source. + optional string source = 1; + // Specify the batch size. + optional uint32 batch_size = 2; + + // Specify whether to shuffle the data. + // If shuffle == true, the ordering of the HDF5 files is shuffled, + // and the ordering of data within any given HDF5 file is shuffled, + // but data between different files are not interleaved; all of a file's + // data are output (in a random order) before moving onto another file. + optional bool shuffle = 3 [default = false]; +} + +message HDF5OutputParameter { + optional string file_name = 1; +} + +message HingeLossParameter { + enum Norm { + L1 = 1; + L2 = 2; + } + // Specify the Norm to use L1 or L2 + optional Norm norm = 1 [default = L1]; +} + +message ImageDataParameter { + // Specify the data source. + optional string source = 1; + // Specify the batch size. + optional uint32 batch_size = 4 [default = 1]; + // The rand_skip variable is for the data layer to skip a few data points + // to avoid all asynchronous sgd clients to start at the same point. The skip + // point would be set as rand_skip * rand(0,1). Note that rand_skip should not + // be larger than the number of keys in the database. + optional uint32 rand_skip = 7 [default = 0]; + // Whether or not ImageLayer should shuffle the list of files at every epoch. + optional bool shuffle = 8 [default = false]; + // It will also resize images if new_height or new_width are not zero. + optional uint32 new_height = 9 [default = 0]; + optional uint32 new_width = 10 [default = 0]; + // Specify if the images are color or gray + optional bool is_color = 11 [default = true]; + // DEPRECATED. See TransformationParameter. For data pre-processing, we can do + // simple scaling and subtracting the data mean, if provided. Note that the + // mean subtraction is always carried out before scaling. + optional float scale = 2 [default = 1]; + optional string mean_file = 3; + // DEPRECATED. See TransformationParameter. Specify if we would like to randomly + // crop an image. + optional uint32 crop_size = 5 [default = 0]; + // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror + // data. + optional bool mirror = 6 [default = false]; + optional string root_folder = 12 [default = ""]; +} + +message InfogainLossParameter { + // Specify the infogain matrix source. + optional string source = 1; +} + +message InnerProductParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + optional bool bias_term = 2 [default = true]; // whether to have bias terms + optional FillerParameter weight_filler = 3; // The filler for the weight + optional FillerParameter bias_filler = 4; // The filler for the bias + + // The first axis to be lumped into a single inner product computation; + // all preceding axes are retained in the output. 
+ // May be negative to index from the end (e.g., -1 for the last axis). + optional int32 axis = 5 [default = 1]; + // Specify whether to transpose the weight matrix or not. + // If transpose == true, any operations will be performed on the transpose + // of the weight matrix. The weight matrix itself is not going to be transposed + // but rather the transfer flag of operations will be toggled accordingly. + optional bool transpose = 6 [default = false]; + + enum Engine { + DEFAULT = 0; + CAFFE = 1; + NNPACK = 2; + } + optional Engine engine = 7 [default = DEFAULT]; +} + +message InputParameter { + // This layer produces N >= 1 top blob(s) to be assigned manually. + // Define N shapes to set a shape for each top. + // Define 1 shape to set the same shape for every top. + // Define no shape to defer to reshaping manually. + repeated BlobShape shape = 1; +} + +// Message that stores parameters used by LogLayer +message LogParameter { + // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0. + // Or if base is set to the default (-1), base is set to e, + // so y = ln(shift + scale * x) = log_e(shift + scale * x) + optional float base = 1 [default = -1.0]; + optional float scale = 2 [default = 1.0]; + optional float shift = 3 [default = 0.0]; +} + +// Message that stores parameters used by LRNLayer +message LRNParameter { + optional uint32 local_size = 1 [default = 5]; + optional float alpha = 2 [default = 1.]; + optional float beta = 3 [default = 0.75]; + enum NormRegion { + ACROSS_CHANNELS = 0; + WITHIN_CHANNEL = 1; + } + optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS]; + optional float k = 5 [default = 1.]; + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 6 [default = DEFAULT]; +} + +message MemoryDataParameter { + optional uint32 batch_size = 1; + optional uint32 channels = 2; + optional uint32 height = 3; + optional uint32 width = 4; +} + +message MVNParameter { + // This parameter can be set to false to normalize mean only + optional bool normalize_variance = 1 [default = true]; + + // This parameter can be set to true to perform DNN-like MVN + optional bool across_channels = 2 [default = false]; + + // Epsilon for not dividing by zero while normalizing variance + optional float eps = 3 [default = 1e-9]; +} + +message ParameterParameter { + optional BlobShape shape = 1; +} + +message PoolingParameter { + enum PoolMethod { + MAX = 0; + AVE = 1; + STOCHASTIC = 2; + } + optional PoolMethod pool = 1 [default = MAX]; // The pooling method + // Pad, kernel size, and stride are all given as a single value for equal + // dimensions in height and width or as Y, X pairs. 
+ optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
+ optional uint32 pad_h = 9 [default = 0]; // The padding height
+ optional uint32 pad_w = 10 [default = 0]; // The padding width
+ optional uint32 kernel_size = 2; // The kernel size (square)
+ optional uint32 kernel_h = 5; // The kernel height
+ optional uint32 kernel_w = 6; // The kernel width
+ optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
+ optional uint32 stride_h = 7; // The stride height
+ optional uint32 stride_w = 8; // The stride width
+ enum Engine {
+ DEFAULT = 0;
+ CAFFE = 1;
+ CUDNN = 2;
+ NNPACK = 3;
+ }
+ optional Engine engine = 11 [default = DEFAULT];
+ // If global_pooling is true, the layer pools over the entire bottom by
+ // setting kernel_h = bottom->height and kernel_w = bottom->width.
+ optional bool global_pooling = 12 [default = false];
+}
+
+message PowerParameter {
+ // PowerLayer computes outputs y = (shift + scale * x) ^ power.
+ optional float power = 1 [default = 1.0];
+ optional float scale = 2 [default = 1.0];
+ optional float shift = 3 [default = 0.0];
+}
+
+message PSROIPoolingParameter {
+ required float spatial_scale = 1;
+ required int32 output_dim = 2; // output channel number
+ required int32 group_size = 3; // number of groups to encode position-sensitive score maps
+}
+
+message PSROIAlignParameter {
+ required float spatial_scale = 1;
+ required int32 output_dim = 2; // output channel number
+ required int32 group_size = 3; // number of groups to encode position-sensitive score maps
+}
+
+message PythonParameter {
+ optional string module = 1;
+ optional string layer = 2;
+ // This value is set to the attribute `param_str` of the `PythonLayer` object
+ // in Python before calling the `setup()` method. This could be a number,
+ // string, dictionary in Python dict format, JSON, etc. You may parse this
+ // string in the `setup` method and use it in `forward` and `backward`.
+ optional string param_str = 3 [default = ''];
+ // Whether this PythonLayer is shared among worker solvers during data parallelism.
+ // If true, each worker solver sequentially runs forward from this layer.
+ // This value should be set to true if you are using it as a data layer.
+ optional bool share_in_parallel = 4 [default = false];
+}
+
+// Message that stores parameters used by RecurrentLayer
+message RecurrentParameter {
+ // The dimension of the output (and usually hidden state) representation --
+ // must be explicitly set to non-zero.
+ optional uint32 num_output = 1 [default = 0];
+
+ optional FillerParameter weight_filler = 2; // The filler for the weight
+ optional FillerParameter bias_filler = 3; // The filler for the bias
+
+ // Whether to enable displaying debug_info in the unrolled recurrent net.
+ optional bool debug_info = 4 [default = false];
+
+ // Whether to add as additional inputs (bottoms) the initial hidden state
+ // blobs, and add as additional outputs (tops) the final timestep hidden state
+ // blobs. The number of additional bottom/top blobs required depends on the
+ // recurrent architecture -- e.g., 1 for RNNs, 2 for LSTMs.
+ optional bool expose_hidden = 5 [default = false];
+}
+
+// Message that stores parameters used by ReductionLayer
+message ReductionParameter {
+ enum ReductionOp {
+ SUM = 1;
+ ASUM = 2;
+ SUMSQ = 3;
+ MEAN = 4;
+ }
+
+ optional ReductionOp operation = 1 [default = SUM]; // reduction operation
+
+ // The first axis to reduce to a scalar -- may be negative to index from the
+ // end (e.g., -1 for the last axis).
+ // (Currently, only reduction along ALL "tail" axes is supported; reduction
+ // of axis M through N, where N < num_axes - 1, is unsupported.)
+ // Suppose we have an n-axis bottom Blob with shape:
+ // (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)).
+ // If axis == m, the output Blob will have shape
+ // (d0, d1, d2, ..., d(m-1)),
+ // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1))
+ // times, each including (dm * d(m+1) * ... * d(n-1)) individual data.
+ // If axis == 0 (the default), the output Blob always has the empty shape
+ // (count 1), performing reduction across the entire input --
+ // often useful for creating new loss functions.
+ optional int32 axis = 2 [default = 0];
+
+ optional float coeff = 3 [default = 1.0]; // coefficient for output
+}
+
+// Message that stores parameters used by ReLULayer
+message ReLUParameter {
+ // Allow non-zero slope for negative inputs to speed up optimization
+ // Described in:
+ // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
+ // improve neural network acoustic models. In ICML Workshop on Deep Learning
+ // for Audio, Speech, and Language Processing.
+ optional float negative_slope = 1 [default = 0];
+ enum Engine {
+ DEFAULT = 0;
+ CAFFE = 1;
+ CUDNN = 2;
+ }
+ optional Engine engine = 2 [default = DEFAULT];
+}
+
+message ReshapeParameter {
+ // Specify the output dimensions. If some of the dimensions are set to 0,
+ // the corresponding dimension from the bottom layer is used (unchanged).
+ // Exactly one dimension may be set to -1, in which case its value is
+ // inferred from the count of the bottom blob and the remaining dimensions.
+ // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8:
+ //
+ // layer {
+ //   type: "Reshape" bottom: "input" top: "output"
+ //   reshape_param { ... }
+ // }
+ //
+ // If "input" is 2D with shape 2 x 8, then the following reshape_param
+ // specifications are all equivalent, producing a 3D blob "output" with shape
+ // 2 x 2 x 4:
+ //
+ // reshape_param { shape { dim: 2 dim: 2 dim: 4 } }
+ // reshape_param { shape { dim: 0 dim: 2 dim: 4 } }
+ // reshape_param { shape { dim: 0 dim: 2 dim: -1 } }
+ // reshape_param { shape { dim: 0 dim: -1 dim: 4 } }
+ //
+ optional BlobShape shape = 1;
+
+ // axis and num_axes control the portion of the bottom blob's shape that is
+ // replaced by (included in) the reshape. By default (axis == 0 and
+ // num_axes == -1), the entire bottom blob shape is included in the reshape,
+ // and hence the shape field must specify the entire output shape.
+ //
+ // axis may be non-zero to retain some portion of the beginning of the input
+ // shape (and may be negative to index from the end; e.g., -1 to begin the
+ // reshape after the last axis, including nothing in the reshape,
+ // -2 to include only the last axis, etc.).
+ //
+ // For example, suppose "input" is a 2D blob with shape 2 x 8.
+ // Then the following ReshapeLayer specifications are all equivalent,
+ // producing a blob "output" with shape 2 x 2 x 4:
+ //
+ // reshape_param { shape { dim: 2 dim: 2 dim: 4 } }
+ // reshape_param { shape { dim: 2 dim: 4 } axis: 1 }
+ // reshape_param { shape { dim: 2 dim: 4 } axis: -3 }
+ //
+ // num_axes specifies the extent of the reshape.
+ // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on
+ // input axes in the range [axis, axis+num_axes].
+ // num_axes may also be -1, the default, to include all remaining axes
+ // (starting from axis).
+ //
+ // For example, suppose "input" is a 2D blob with shape 2 x 8.
+ // Then the following ReshapeLayer specifications are equivalent,
+ // producing a blob "output" with shape 1 x 2 x 8:
+ //
+ // reshape_param { shape { dim: 1 dim: 2 dim: 8 } }
+ // reshape_param { shape { dim: 1 dim: 2 } num_axes: 1 }
+ // reshape_param { shape { dim: 1 } num_axes: 0 }
+ //
+ // On the other hand, these would produce output blob shape 2 x 1 x 8:
+ //
+ // reshape_param { shape { dim: 2 dim: 1 dim: 8 } }
+ // reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 }
+ //
+ optional int32 axis = 2 [default = 0];
+ optional int32 num_axes = 3 [default = -1];
+}
+
+// Message that stores parameters used by ROIPoolingLayer
+message ROIPoolingParameter {
+ // Pad, kernel size, and stride are all given as a single value for equal
+ // dimensions in height and width or as Y, X pairs.
+ optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
+ optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
+ // Multiplicative spatial scale factor to translate ROI coords from their
+ // input scale to the scale used when pooling
+ optional float spatial_scale = 3 [default = 1];
+}
+
+message ScaleParameter {
+ // The first axis of bottom[0] (the first input Blob) along which to apply
+ // bottom[1] (the second input Blob). May be negative to index from the end
+ // (e.g., -1 for the last axis).
+ //
+ // For example, if bottom[0] is 4D with shape 100x3x40x60, the output
+ // top[0] will have the same shape, and bottom[1] may have any of the
+ // following shapes (for the given value of axis):
+ // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60
+ // (axis == 1 == -3) 3; 3x40; 3x40x60
+ // (axis == 2 == -2) 40; 40x60
+ // (axis == 3 == -1) 60
+ // Furthermore, bottom[1] may have the empty shape (regardless of the value of
+ // "axis") -- a scalar multiplier.
+ optional int32 axis = 1 [default = 1];
+
+ // (num_axes is ignored unless just one bottom is given and the scale is
+ // a learned parameter of the layer. Otherwise, num_axes is determined by the
+ // number of axes of the second bottom.)
+ // The number of axes of the input (bottom[0]) covered by the scale
+ // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
+ // Set num_axes := 0 to multiply with a zero-axis Blob (a scalar).
+ optional int32 num_axes = 2 [default = 1];
+
+ // (filler is ignored unless just one bottom is given and the scale is
+ // a learned parameter of the layer.)
+ // The initialization for the learned scale parameter.
+ // Default is the unit (1) initialization, resulting in the ScaleLayer
+ // initially performing the identity operation.
+ optional FillerParameter filler = 3;
+
+ // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but
+ // may be more efficient). Initialized with bias_filler (defaults to 0).
+ optional bool bias_term = 4 [default = false];
+ optional FillerParameter bias_filler = 5;
+}
+
+// Message that stores parameters used by ProposalLayer
+message ProposalParameter {
+ optional uint32 feat_stride = 1 [default = 16];
+ repeated uint32 scales = 2;
+ repeated float ratios = 3;
+}
+
+message SigmoidParameter {
+ enum Engine {
+ DEFAULT = 0;
+ CAFFE = 1;
+ CUDNN = 2;
+ }
+ optional Engine engine = 1 [default = DEFAULT];
+}
+
+message SliceParameter {
+ // The axis along which to slice -- may be negative to index from the end
+ // (e.g., -1 for the last axis).
+ // By default, SliceLayer slices blobs along the "channels" axis (1).
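+ // Example (illustrative): with axis 1, slice_point 2 splits a 1x6xHxW
+ // bottom into 1x2xHxW and 1x4xHxW tops:
+ //   slice_param { axis: 1 slice_point: 2 }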
+ optional int32 axis = 3 [default = 1]; + repeated uint32 slice_point = 2; + + // DEPRECATED: alias for "axis" -- does not support negative indexing. + optional uint32 slice_dim = 1 [default = 1]; +} + +// Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer +message SoftmaxParameter { + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 1 [default = DEFAULT]; + + // The axis along which to perform the softmax -- may be negative to index + // from the end (e.g., -1 for the last axis). + // Any other axes will be evaluated as independent softmaxes. + optional int32 axis = 2 [default = 1]; +} + +message TanHParameter { + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 1 [default = DEFAULT]; +} + +// Message that stores parameters used by TileLayer +message TileParameter { + // The index of the axis to tile. + optional int32 axis = 1 [default = 1]; + + // The number of copies (tiles) of the blob to output. + optional int32 tiles = 2; +} + +// Message that stores parameters used by ThresholdLayer +message ThresholdParameter { + optional float threshold = 1 [default = 0]; // Strictly positive values +} + +message WindowDataParameter { + // Specify the data source. + optional string source = 1; + // For data pre-processing, we can do simple scaling and subtracting the + // data mean, if provided. Note that the mean subtraction is always carried + // out before scaling. + optional float scale = 2 [default = 1]; + optional string mean_file = 3; + // Specify the batch size. + optional uint32 batch_size = 4; + // Specify if we would like to randomly crop an image. + optional uint32 crop_size = 5 [default = 0]; + // Specify if we want to randomly mirror data. + optional bool mirror = 6 [default = false]; + // Foreground (object) overlap threshold + optional float fg_threshold = 7 [default = 0.5]; + // Background (non-object) overlap threshold + optional float bg_threshold = 8 [default = 0.5]; + // Fraction of batch that should be foreground objects + optional float fg_fraction = 9 [default = 0.25]; + // Amount of contextual padding to add around a window + // (used only by the window_data_layer) + optional uint32 context_pad = 10 [default = 0]; + // Mode for cropping out a detection window + // warp: cropped window is warped to a fixed size and aspect ratio + // square: the tightest square around the window is cropped + optional string crop_mode = 11 [default = "warp"]; + // cache_images: will load all images in memory for faster access + optional bool cache_images = 12 [default = false]; + // append root_folder to locate images + optional string root_folder = 13 [default = ""]; +} + +message SPPParameter { + enum PoolMethod { + MAX = 0; + AVE = 1; + STOCHASTIC = 2; + } + optional uint32 pyramid_height = 1; + optional PoolMethod pool = 2 [default = MAX]; // The pooling method + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 6 [default = DEFAULT]; +} + +// DEPRECATED: use LayerParameter. 
+message V1LayerParameter { + repeated string bottom = 2; + repeated string top = 3; + optional string name = 4; + repeated NetStateRule include = 32; + repeated NetStateRule exclude = 33; + enum LayerType { + NONE = 0; + ABSVAL = 35; + ACCURACY = 1; + ARGMAX = 30; + BNLL = 2; + CONCAT = 3; + CONTRASTIVE_LOSS = 37; + CONVOLUTION = 4; + DATA = 5; + DECONVOLUTION = 39; + DROPOUT = 6; + DUMMY_DATA = 32; + EUCLIDEAN_LOSS = 7; + ELTWISE = 25; + EXP = 38; + FLATTEN = 8; + HDF5_DATA = 9; + HDF5_OUTPUT = 10; + HINGE_LOSS = 28; + IM2COL = 11; + IMAGE_DATA = 12; + INFOGAIN_LOSS = 13; + INNER_PRODUCT = 14; + LRN = 15; + MEMORY_DATA = 29; + MULTINOMIAL_LOGISTIC_LOSS = 16; + MVN = 34; + POOLING = 17; + POWER = 26; + RELU = 18; + SIGMOID = 19; + SIGMOID_CROSS_ENTROPY_LOSS = 27; + SILENCE = 36; + SOFTMAX = 20; + SOFTMAX_LOSS = 21; + SPLIT = 22; + SLICE = 33; + TANH = 23; + WINDOW_DATA = 24; + THRESHOLD = 31; + } + optional LayerType type = 5; + repeated BlobProto blobs = 6; + repeated string param = 1001; + repeated DimCheckMode blob_share_mode = 1002; + enum DimCheckMode { + STRICT = 0; + PERMISSIVE = 1; + } + repeated float blobs_lr = 7; + repeated float weight_decay = 8; + repeated float loss_weight = 35; + optional AccuracyParameter accuracy_param = 27; + optional ArgMaxParameter argmax_param = 23; + optional ConcatParameter concat_param = 9; + optional ContrastiveLossParameter contrastive_loss_param = 40; + optional ConvolutionParameter convolution_param = 10; + optional DataParameter data_param = 11; + optional DropoutParameter dropout_param = 12; + optional DummyDataParameter dummy_data_param = 26; + optional EltwiseParameter eltwise_param = 24; + optional ExpParameter exp_param = 41; + optional HDF5DataParameter hdf5_data_param = 13; + optional HDF5OutputParameter hdf5_output_param = 14; + optional HingeLossParameter hinge_loss_param = 29; + optional ImageDataParameter image_data_param = 15; + optional InfogainLossParameter infogain_loss_param = 16; + optional InnerProductParameter inner_product_param = 17; + optional LRNParameter lrn_param = 18; + optional MemoryDataParameter memory_data_param = 22; + optional MVNParameter mvn_param = 34; + optional PoolingParameter pooling_param = 19; + optional PowerParameter power_param = 21; + optional ReLUParameter relu_param = 30; + optional SigmoidParameter sigmoid_param = 38; + optional SoftmaxParameter softmax_param = 39; + optional SliceParameter slice_param = 31; + optional TanHParameter tanh_param = 37; + optional ThresholdParameter threshold_param = 25; + optional WindowDataParameter window_data_param = 20; + optional TransformationParameter transform_param = 36; + optional LossParameter loss_param = 42; + optional V0LayerParameter layer = 1; +} + +// DEPRECATED: V0LayerParameter is the old way of specifying layer parameters +// in Caffe. We keep this message type around for legacy support. +message V0LayerParameter { + optional string name = 1; // the layer name + optional string type = 2; // the string to specify the layer type + + // Parameters to specify layers with inner products. 
+ optional uint32 num_output = 3; // The number of outputs for the layer
+ optional bool biasterm = 4 [default = true]; // whether to have bias terms
+ optional FillerParameter weight_filler = 5; // The filler for the weight
+ optional FillerParameter bias_filler = 6; // The filler for the bias
+
+ optional uint32 pad = 7 [default = 0]; // The padding size
+ optional uint32 kernelsize = 8; // The kernel size
+ optional uint32 group = 9 [default = 1]; // The group size for group conv
+ optional uint32 stride = 10 [default = 1]; // The stride
+ enum PoolMethod {
+ MAX = 0;
+ AVE = 1;
+ STOCHASTIC = 2;
+ }
+ optional PoolMethod pool = 11 [default = MAX]; // The pooling method
+ optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio
+
+ optional uint32 local_size = 13 [default = 5]; // for local response norm
+ optional float alpha = 14 [default = 1.]; // for local response norm
+ optional float beta = 15 [default = 0.75]; // for local response norm
+ optional float k = 22 [default = 1.];
+
+ // For data layers, specify the data source
+ optional string source = 16;
+ // For data pre-processing, we can do simple scaling and subtracting the
+ // data mean, if provided. Note that the mean subtraction is always carried
+ // out before scaling.
+ optional float scale = 17 [default = 1];
+ optional string meanfile = 18;
+ // For data layers, specify the batch size.
+ optional uint32 batchsize = 19;
+ // For data layers, specify if we would like to randomly crop an image.
+ optional uint32 cropsize = 20 [default = 0];
+ // For data layers, specify if we want to randomly mirror data.
+ optional bool mirror = 21 [default = false];
+
+ // The blobs containing the numeric parameters of the layer
+ repeated BlobProto blobs = 50;
+ // The ratio that is multiplied by the global learning rate. If you want to
+ // set the learning ratio for one blob, you need to set it for all blobs.
+ repeated float blobs_lr = 51;
+ // The weight decay that is multiplied by the global weight decay.
+ repeated float weight_decay = 52;
+
+ // The rand_skip variable is for the data layer to skip a few data points
+ // so that asynchronous SGD clients do not all start at the same point.
+ // The skip point is set as rand_skip * rand(0,1). Note that rand_skip
+ // should not be larger than the number of keys in the database.
+ optional uint32 rand_skip = 53 [default = 0];
+
+ // Fields related to detection (det_*)
+ // foreground (object) overlap threshold
+ optional float det_fg_threshold = 54 [default = 0.5];
+ // background (non-object) overlap threshold
+ optional float det_bg_threshold = 55 [default = 0.5];
+ // Fraction of batch that should be foreground objects
+ optional float det_fg_fraction = 56 [default = 0.25];
+
+ // optional bool OBSOLETE_can_clobber = 57 [default = true];
+
+ // Amount of contextual padding to add around a window
+ // (used only by the window_data_layer)
+ optional uint32 det_context_pad = 58 [default = 0];
+
+ // Mode for cropping out a detection window
+ // warp: cropped window is warped to a fixed size and aspect ratio
+ // square: the tightest square around the window is cropped
+ optional string det_crop_mode = 59 [default = "warp"];
+
+ // For ReshapeLayer, one needs to specify the new dimensions.
+ optional int32 new_num = 60 [default = 0];
+ optional int32 new_channels = 61 [default = 0];
+ optional int32 new_height = 62 [default = 0];
+ optional int32 new_width = 63 [default = 0];
+
+ // Whether or not ImageLayer should shuffle the list of files at every epoch.
+ // It will also resize images if new_height or new_width are not zero.
+ optional bool shuffle_images = 64 [default = false];
+
+ // For ConcatLayer, one needs to specify the dimension for concatenation, and
+ // the other dimensions must be the same for all the bottom blobs.
+ // By default it will concatenate blobs along the channels dimension.
+ optional uint32 concat_dim = 65 [default = 1];
+
+ optional HDF5OutputParameter hdf5_output_param = 1001;
+}
+
+message PReLUParameter {
+ // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
+ // Surpassing Human-Level Performance on ImageNet Classification, 2015.
+
+ // Initial value of a_i. Default is a_i=0.25 for all i.
+ optional FillerParameter filler = 1;
+ // Whether or not slope parameters are shared across channels.
+ optional bool channel_shared = 2 [default = false];
+}
+
+message NNPACKConvolutionParameter {
+ enum Algorithm {
+ AUTO = 0;
+ WINOGRAD = 1;
+ FFT_16x16 = 2;
+ FFT_8x8 = 3;
+ }
+ optional Algorithm algorithm = 1 [default=AUTO];
+ enum KernelTransformStrategy {
+ RECOMPUTE = 0;
+ REUSE = 1;
+ }
+ optional KernelTransformStrategy kernel_transform_strategy = 2 [default=RECOMPUTE];
+}
+
diff --git a/mace/python/tools/BUILD b/mace/python/tools/BUILD
index f3a75c21f9b82789567a70bbb2cb137d349b0e63..ad3944b92fa826eacc18b8840f853c37f15b3f41 100644
--- a/mace/python/tools/BUILD
+++ b/mace/python/tools/BUILD
@@ -43,3 +43,12 @@ py_binary(
         "//mace/proto:mace_py",
     ],
 )
+
+py_binary(
+    name = "caffe_ops_stats",
+    srcs = ["caffe_ops_stats.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//mace/proto:caffe_py",
+    ],
+)
diff --git a/mace/python/tools/caffe_ops_stats.py b/mace/python/tools/caffe_ops_stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c3bb7c45e44bb5973f910127a89b7b1963143f7
--- /dev/null
+++ b/mace/python/tools/caffe_ops_stats.py
@@ -0,0 +1,44 @@
+"""Counts the occurrences of each layer type in a Caffe NetParameter
+prototxt file, to see which ops a model requires.
+
+Usage: python caffe_ops_stats.py --input model.prototxt
+"""
+from __future__ import print_function
+from mace.proto import caffe_pb2
+import google.protobuf.text_format
+import operator
+import argparse
+import sys
+
+FLAGS = None
+
+def main(unused_args):
+  net = caffe_pb2.NetParameter()
+  with open(FLAGS.input) as f:
+    google.protobuf.text_format.Merge(str(f.read()), net)
+
+  # Tally how many layers of each type appear in the net.
+  ops = {}
+  for layer in net.layer:
+    if layer.type not in ops:
+      ops[layer.type] = 1
+    else:
+      ops[layer.type] += 1
+
+  # Report layer types, least frequent first.
+  for key, value in sorted(ops.items(), key=operator.itemgetter(1)):
+    print(key, ":", value)
+
+def parse_args():
+  '''Parses command line arguments.'''
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--input',
+      type=str,
+      default='',
+      help='Caffe NetParameter prototxt file to load.')
+  return parser.parse_known_args()
+
+if __name__ == '__main__':
+  FLAGS, unparsed = parse_args()
+  main(unused_args=[sys.argv[0]] + unparsed)
diff --git a/mace/python/tools/mace_pb2.py b/mace/python/tools/mace_pb2.py
deleted file mode 100755
index dd3d25b6e564aa0adcb6b66555d33db4051ae5f3..0000000000000000000000000000000000000000
--- a/mace/python/tools/mace_pb2.py
+++ /dev/null
@@ -1,896 +0,0 @@
-# Generated by the protocol buffer compiler. DO NOT EDIT!
-# source: mace/proto/mace.proto - -import sys -_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) -from google.protobuf.internal import enum_type_wrapper -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database -from google.protobuf import descriptor_pb2 -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - - - -DESCRIPTOR = _descriptor.FileDescriptor( - name='mace/proto/mace.proto', - package='mace', - syntax='proto2', - serialized_pb=_b('\n\x15mace/proto/mace.proto\x12\x04mace\"\xf0\x01\n\x0bTensorProto\x12\x0c\n\x04\x64ims\x18\x01 \x03(\x03\x12+\n\tdata_type\x18\x02 \x01(\x0e\x32\x0e.mace.DataType:\x08\x44T_FLOAT\x12\x16\n\nfloat_data\x18\x03 \x03(\x02\x42\x02\x10\x01\x12\x16\n\nint32_data\x18\x04 \x03(\x05\x42\x02\x10\x01\x12\x11\n\tbyte_data\x18\x05 \x01(\x0c\x12\x13\n\x0bstring_data\x18\x06 \x03(\x0c\x12\x17\n\x0b\x64ouble_data\x18\t \x03(\x01\x42\x02\x10\x01\x12\x16\n\nint64_data\x18\n \x03(\x03\x42\x02\x10\x01\x12\x0c\n\x04name\x18\x07 \x01(\t\x12\x0f\n\x07node_id\x18\x64 \x01(\r\"h\n\x08\x41rgument\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\t\n\x01\x66\x18\x02 \x01(\x02\x12\t\n\x01i\x18\x03 \x01(\x03\x12\t\n\x01s\x18\x04 \x01(\x0c\x12\x0e\n\x06\x66loats\x18\x05 \x03(\x02\x12\x0c\n\x04ints\x18\x06 \x03(\x03\x12\x0f\n\x07strings\x18\x07 \x03(\x0c\"1\n\tNodeInput\x12\x0f\n\x07node_id\x18\x01 \x01(\x05\x12\x13\n\x0boutput_port\x18\x02 \x01(\x05\"\x1b\n\x0bOutputShape\x12\x0c\n\x04\x64ims\x18\x01 \x03(\x03\"\xb8\x02\n\x0bOperatorDef\x12\r\n\x05input\x18\x01 \x03(\t\x12\x0e\n\x06output\x18\x02 \x03(\t\x12\x0c\n\x04name\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x1b\n\x03\x61rg\x18\x05 \x03(\x0b\x32\x0e.mace.Argument\x12\'\n\x0coutput_shape\x18\x06 \x03(\x0b\x32\x11.mace.OutputShape\x12#\n\x0boutput_type\x18\x07 \x03(\x0e\x32\x0e.mace.DataType\x12\x12\n\x06mem_id\x18\n \x01(\x05:\x02-1\x12\x0f\n\x07node_id\x18\x64 \x01(\r\x12\r\n\x05op_id\x18\x65 \x01(\r\x12\x0f\n\x07padding\x18\x66 \x01(\r\x12#\n\nnode_input\x18g \x03(\x0b\x32\x0f.mace.NodeInput\x12\x19\n\x11out_max_byte_size\x18h \x03(\x05\"3\n\x0bMemoryBlock\x12\x0e\n\x06mem_id\x18\x01 \x01(\x05\x12\t\n\x01x\x18\x02 \x01(\r\x12\t\n\x01y\x18\x03 \x01(\r\"3\n\x0bMemoryArena\x12$\n\tmem_block\x18\x01 \x03(\x0b\x32\x11.mace.MemoryBlock\"|\n\tInputInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07node_id\x18\x02 \x01(\x05\x12\x0c\n\x04\x64ims\x18\x03 \x03(\x05\x12\x15\n\rmax_byte_size\x18\x04 \x01(\x05\x12+\n\tdata_type\x18\x05 \x01(\x0e\x32\x0e.mace.DataType:\x08\x44T_FLOAT\"}\n\nOutputInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07node_id\x18\x02 \x01(\x05\x12\x0c\n\x04\x64ims\x18\x03 \x03(\x05\x12\x15\n\rmax_byte_size\x18\x04 \x01(\x05\x12+\n\tdata_type\x18\x05 \x01(\x0e\x32\x0e.mace.DataType:\x08\x44T_FLOAT\"\xf9\x01\n\x06NetDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x02op\x18\x02 \x03(\x0b\x32\x11.mace.OperatorDef\x12\x0f\n\x07version\x18\x03 \x01(\t\x12\x1b\n\x03\x61rg\x18\x04 \x03(\x0b\x32\x0e.mace.Argument\x12\"\n\x07tensors\x18\x05 \x03(\x0b\x32\x11.mace.TensorProto\x12$\n\tmem_arena\x18\n \x01(\x0b\x32\x11.mace.MemoryArena\x12#\n\ninput_info\x18\x64 \x03(\x0b\x32\x0f.mace.InputInfo\x12%\n\x0boutput_info\x18\x65 
\x03(\x0b\x32\x10.mace.OutputInfo*\x1f\n\x07NetMode\x12\x08\n\x04INIT\x10\x00\x12\n\n\x06NORMAL\x10\x01*+\n\nDeviceType\x12\x07\n\x03\x43PU\x10\x00\x12\x08\n\x04NEON\x10\x01\x12\n\n\x06OPENCL\x10\x02*\xc3\x01\n\x08\x44\x61taType\x12\x0e\n\nDT_INVALID\x10\x00\x12\x0c\n\x08\x44T_FLOAT\x10\x01\x12\r\n\tDT_DOUBLE\x10\x02\x12\x0c\n\x08\x44T_INT32\x10\x03\x12\x0c\n\x08\x44T_UINT8\x10\x04\x12\x0c\n\x08\x44T_INT16\x10\x05\x12\x0b\n\x07\x44T_INT8\x10\x06\x12\r\n\tDT_STRING\x10\x07\x12\x0c\n\x08\x44T_INT64\x10\x08\x12\r\n\tDT_UINT16\x10\t\x12\x0b\n\x07\x44T_BOOL\x10\n\x12\x0b\n\x07\x44T_HALF\x10\x13\x12\r\n\tDT_UINT32\x10\x16') -) - -_NETMODE = _descriptor.EnumDescriptor( - name='NetMode', - full_name='mace.NetMode', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='INIT', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='NORMAL', index=1, number=1, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=1386, - serialized_end=1417, -) -_sym_db.RegisterEnumDescriptor(_NETMODE) - -NetMode = enum_type_wrapper.EnumTypeWrapper(_NETMODE) -_DEVICETYPE = _descriptor.EnumDescriptor( - name='DeviceType', - full_name='mace.DeviceType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='CPU', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='NEON', index=1, number=1, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='OPENCL', index=2, number=2, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=1419, - serialized_end=1462, -) -_sym_db.RegisterEnumDescriptor(_DEVICETYPE) - -DeviceType = enum_type_wrapper.EnumTypeWrapper(_DEVICETYPE) -_DATATYPE = _descriptor.EnumDescriptor( - name='DataType', - full_name='mace.DataType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='DT_INVALID', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_FLOAT', index=1, number=1, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_DOUBLE', index=2, number=2, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_INT32', index=3, number=3, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_UINT8', index=4, number=4, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_INT16', index=5, number=5, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_INT8', index=6, number=6, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_STRING', index=7, number=7, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_INT64', index=8, number=8, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_UINT16', index=9, number=9, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_BOOL', index=10, number=10, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_HALF', index=11, number=19, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='DT_UINT32', index=12, number=22, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=1465, - serialized_end=1660, -) -_sym_db.RegisterEnumDescriptor(_DATATYPE) - -DataType = enum_type_wrapper.EnumTypeWrapper(_DATATYPE) -INIT = 0 -NORMAL = 1 -CPU = 0 -NEON = 1 -OPENCL = 2 -DT_INVALID = 0 
-DT_FLOAT = 1 -DT_DOUBLE = 2 -DT_INT32 = 3 -DT_UINT8 = 4 -DT_INT16 = 5 -DT_INT8 = 6 -DT_STRING = 7 -DT_INT64 = 8 -DT_UINT16 = 9 -DT_BOOL = 10 -DT_HALF = 19 -DT_UINT32 = 22 - - - -_TENSORPROTO = _descriptor.Descriptor( - name='TensorProto', - full_name='mace.TensorProto', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='dims', full_name='mace.TensorProto.dims', index=0, - number=1, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='data_type', full_name='mace.TensorProto.data_type', index=1, - number=2, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=1, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='float_data', full_name='mace.TensorProto.float_data', index=2, - number=3, type=2, cpp_type=6, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), - _descriptor.FieldDescriptor( - name='int32_data', full_name='mace.TensorProto.int32_data', index=3, - number=4, type=5, cpp_type=1, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), - _descriptor.FieldDescriptor( - name='byte_data', full_name='mace.TensorProto.byte_data', index=4, - number=5, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='string_data', full_name='mace.TensorProto.string_data', index=5, - number=6, type=12, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='double_data', full_name='mace.TensorProto.double_data', index=6, - number=9, type=1, cpp_type=5, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), - _descriptor.FieldDescriptor( - name='int64_data', full_name='mace.TensorProto.int64_data', index=7, - number=10, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), - _descriptor.FieldDescriptor( - name='name', full_name='mace.TensorProto.name', index=8, - number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='node_id', full_name='mace.TensorProto.node_id', index=9, - number=100, type=13, cpp_type=3, label=1, - 
has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=32, - serialized_end=272, -) - - -_ARGUMENT = _descriptor.Descriptor( - name='Argument', - full_name='mace.Argument', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='mace.Argument.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='f', full_name='mace.Argument.f', index=1, - number=2, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='i', full_name='mace.Argument.i', index=2, - number=3, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='s', full_name='mace.Argument.s', index=3, - number=4, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='floats', full_name='mace.Argument.floats', index=4, - number=5, type=2, cpp_type=6, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='ints', full_name='mace.Argument.ints', index=5, - number=6, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='strings', full_name='mace.Argument.strings', index=6, - number=7, type=12, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=274, - serialized_end=378, -) - - -_NODEINPUT = _descriptor.Descriptor( - name='NodeInput', - full_name='mace.NodeInput', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='node_id', full_name='mace.NodeInput.node_id', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='output_port', full_name='mace.NodeInput.output_port', index=1, - number=2, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, 
extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=380, - serialized_end=429, -) - - -_OUTPUTSHAPE = _descriptor.Descriptor( - name='OutputShape', - full_name='mace.OutputShape', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='dims', full_name='mace.OutputShape.dims', index=0, - number=1, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=431, - serialized_end=458, -) - - -_OPERATORDEF = _descriptor.Descriptor( - name='OperatorDef', - full_name='mace.OperatorDef', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='input', full_name='mace.OperatorDef.input', index=0, - number=1, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='output', full_name='mace.OperatorDef.output', index=1, - number=2, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='name', full_name='mace.OperatorDef.name', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='type', full_name='mace.OperatorDef.type', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='arg', full_name='mace.OperatorDef.arg', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='output_shape', full_name='mace.OperatorDef.output_shape', index=5, - number=6, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='output_type', full_name='mace.OperatorDef.output_type', index=6, - number=7, type=14, cpp_type=8, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='mem_id', full_name='mace.OperatorDef.mem_id', index=7, - number=10, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=-1, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - 
_descriptor.FieldDescriptor( - name='node_id', full_name='mace.OperatorDef.node_id', index=8, - number=100, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='op_id', full_name='mace.OperatorDef.op_id', index=9, - number=101, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='padding', full_name='mace.OperatorDef.padding', index=10, - number=102, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='node_input', full_name='mace.OperatorDef.node_input', index=11, - number=103, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='out_max_byte_size', full_name='mace.OperatorDef.out_max_byte_size', index=12, - number=104, type=5, cpp_type=1, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=461, - serialized_end=773, -) - - -_MEMORYBLOCK = _descriptor.Descriptor( - name='MemoryBlock', - full_name='mace.MemoryBlock', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='mem_id', full_name='mace.MemoryBlock.mem_id', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='x', full_name='mace.MemoryBlock.x', index=1, - number=2, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='y', full_name='mace.MemoryBlock.y', index=2, - number=3, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=775, - serialized_end=826, -) - - -_MEMORYARENA = _descriptor.Descriptor( - name='MemoryArena', - full_name='mace.MemoryArena', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='mem_block', full_name='mace.MemoryArena.mem_block', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], 
- options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=828, - serialized_end=879, -) - - -_INPUTINFO = _descriptor.Descriptor( - name='InputInfo', - full_name='mace.InputInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='mace.InputInfo.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='node_id', full_name='mace.InputInfo.node_id', index=1, - number=2, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='dims', full_name='mace.InputInfo.dims', index=2, - number=3, type=5, cpp_type=1, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='max_byte_size', full_name='mace.InputInfo.max_byte_size', index=3, - number=4, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='data_type', full_name='mace.InputInfo.data_type', index=4, - number=5, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=1, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=881, - serialized_end=1005, -) - - -_OUTPUTINFO = _descriptor.Descriptor( - name='OutputInfo', - full_name='mace.OutputInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='mace.OutputInfo.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='node_id', full_name='mace.OutputInfo.node_id', index=1, - number=2, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='dims', full_name='mace.OutputInfo.dims', index=2, - number=3, type=5, cpp_type=1, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='max_byte_size', full_name='mace.OutputInfo.max_byte_size', index=3, - number=4, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='data_type', full_name='mace.OutputInfo.data_type', index=4, - number=5, type=14, 
cpp_type=8, label=1, - has_default_value=True, default_value=1, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1007, - serialized_end=1132, -) - - -_NETDEF = _descriptor.Descriptor( - name='NetDef', - full_name='mace.NetDef', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='mace.NetDef.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='op', full_name='mace.NetDef.op', index=1, - number=2, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='version', full_name='mace.NetDef.version', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='arg', full_name='mace.NetDef.arg', index=3, - number=4, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='tensors', full_name='mace.NetDef.tensors', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='mem_arena', full_name='mace.NetDef.mem_arena', index=5, - number=10, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='input_info', full_name='mace.NetDef.input_info', index=6, - number=100, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='output_info', full_name='mace.NetDef.output_info', index=7, - number=101, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1135, - serialized_end=1384, -) - -_TENSORPROTO.fields_by_name['data_type'].enum_type = _DATATYPE -_OPERATORDEF.fields_by_name['arg'].message_type = _ARGUMENT -_OPERATORDEF.fields_by_name['output_shape'].message_type = _OUTPUTSHAPE -_OPERATORDEF.fields_by_name['output_type'].enum_type = _DATATYPE -_OPERATORDEF.fields_by_name['node_input'].message_type = _NODEINPUT 
-_MEMORYARENA.fields_by_name['mem_block'].message_type = _MEMORYBLOCK -_INPUTINFO.fields_by_name['data_type'].enum_type = _DATATYPE -_OUTPUTINFO.fields_by_name['data_type'].enum_type = _DATATYPE -_NETDEF.fields_by_name['op'].message_type = _OPERATORDEF -_NETDEF.fields_by_name['arg'].message_type = _ARGUMENT -_NETDEF.fields_by_name['tensors'].message_type = _TENSORPROTO -_NETDEF.fields_by_name['mem_arena'].message_type = _MEMORYARENA -_NETDEF.fields_by_name['input_info'].message_type = _INPUTINFO -_NETDEF.fields_by_name['output_info'].message_type = _OUTPUTINFO -DESCRIPTOR.message_types_by_name['TensorProto'] = _TENSORPROTO -DESCRIPTOR.message_types_by_name['Argument'] = _ARGUMENT -DESCRIPTOR.message_types_by_name['NodeInput'] = _NODEINPUT -DESCRIPTOR.message_types_by_name['OutputShape'] = _OUTPUTSHAPE -DESCRIPTOR.message_types_by_name['OperatorDef'] = _OPERATORDEF -DESCRIPTOR.message_types_by_name['MemoryBlock'] = _MEMORYBLOCK -DESCRIPTOR.message_types_by_name['MemoryArena'] = _MEMORYARENA -DESCRIPTOR.message_types_by_name['InputInfo'] = _INPUTINFO -DESCRIPTOR.message_types_by_name['OutputInfo'] = _OUTPUTINFO -DESCRIPTOR.message_types_by_name['NetDef'] = _NETDEF -DESCRIPTOR.enum_types_by_name['NetMode'] = _NETMODE -DESCRIPTOR.enum_types_by_name['DeviceType'] = _DEVICETYPE -DESCRIPTOR.enum_types_by_name['DataType'] = _DATATYPE -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -TensorProto = _reflection.GeneratedProtocolMessageType('TensorProto', (_message.Message,), dict( - DESCRIPTOR = _TENSORPROTO, - __module__ = 'mace.proto.mace_pb2' - # @@protoc_insertion_point(class_scope:mace.TensorProto) - )) -_sym_db.RegisterMessage(TensorProto) - -Argument = _reflection.GeneratedProtocolMessageType('Argument', (_message.Message,), dict( - DESCRIPTOR = _ARGUMENT, - __module__ = 'mace.proto.mace_pb2' - # @@protoc_insertion_point(class_scope:mace.Argument) - )) -_sym_db.RegisterMessage(Argument) - -NodeInput = _reflection.GeneratedProtocolMessageType('NodeInput', (_message.Message,), dict( - DESCRIPTOR = _NODEINPUT, - __module__ = 'mace.proto.mace_pb2' - # @@protoc_insertion_point(class_scope:mace.NodeInput) - )) -_sym_db.RegisterMessage(NodeInput) - -OutputShape = _reflection.GeneratedProtocolMessageType('OutputShape', (_message.Message,), dict( - DESCRIPTOR = _OUTPUTSHAPE, - __module__ = 'mace.proto.mace_pb2' - # @@protoc_insertion_point(class_scope:mace.OutputShape) - )) -_sym_db.RegisterMessage(OutputShape) - -OperatorDef = _reflection.GeneratedProtocolMessageType('OperatorDef', (_message.Message,), dict( - DESCRIPTOR = _OPERATORDEF, - __module__ = 'mace.proto.mace_pb2' - # @@protoc_insertion_point(class_scope:mace.OperatorDef) - )) -_sym_db.RegisterMessage(OperatorDef) - -MemoryBlock = _reflection.GeneratedProtocolMessageType('MemoryBlock', (_message.Message,), dict( - DESCRIPTOR = _MEMORYBLOCK, - __module__ = 'mace.proto.mace_pb2' - # @@protoc_insertion_point(class_scope:mace.MemoryBlock) - )) -_sym_db.RegisterMessage(MemoryBlock) - -MemoryArena = _reflection.GeneratedProtocolMessageType('MemoryArena', (_message.Message,), dict( - DESCRIPTOR = _MEMORYARENA, - __module__ = 'mace.proto.mace_pb2' - # @@protoc_insertion_point(class_scope:mace.MemoryArena) - )) -_sym_db.RegisterMessage(MemoryArena) - -InputInfo = _reflection.GeneratedProtocolMessageType('InputInfo', (_message.Message,), dict( - DESCRIPTOR = _INPUTINFO, - __module__ = 'mace.proto.mace_pb2' - # @@protoc_insertion_point(class_scope:mace.InputInfo) - )) -_sym_db.RegisterMessage(InputInfo) - -OutputInfo = 
_reflection.GeneratedProtocolMessageType('OutputInfo', (_message.Message,), dict( - DESCRIPTOR = _OUTPUTINFO, - __module__ = 'mace.proto.mace_pb2' - # @@protoc_insertion_point(class_scope:mace.OutputInfo) - )) -_sym_db.RegisterMessage(OutputInfo) - -NetDef = _reflection.GeneratedProtocolMessageType('NetDef', (_message.Message,), dict( - DESCRIPTOR = _NETDEF, - __module__ = 'mace.proto.mace_pb2' - # @@protoc_insertion_point(class_scope:mace.NetDef) - )) -_sym_db.RegisterMessage(NetDef) - - -_TENSORPROTO.fields_by_name['float_data'].has_options = True -_TENSORPROTO.fields_by_name['float_data']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) -_TENSORPROTO.fields_by_name['int32_data'].has_options = True -_TENSORPROTO.fields_by_name['int32_data']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) -_TENSORPROTO.fields_by_name['double_data'].has_options = True -_TENSORPROTO.fields_by_name['double_data']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) -_TENSORPROTO.fields_by_name['int64_data'].has_options = True -_TENSORPROTO.fields_by_name['int64_data']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) -# @@protoc_insertion_point(module_scope) diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py index e224b6112234464df3f4b5411303d9cff0c36579..006d41fe8851637a731e15e22c0b9cc36e69be13 100644 --- a/mace/python/tools/tf_converter_lib.py +++ b/mace/python/tools/tf_converter_lib.py @@ -43,8 +43,13 @@ class TFConverter(object): self.dt = dt self.device = device self.tf_graph = {} + self.tf_parents = {} self.resolved_ops = {} self.unused_tensor = set() + self.ops = {} + + for op in tf_ops: + self.ops[op.name] = op for op in tf_ops: self.resolved_ops[op.name] = 0 @@ -53,6 +58,9 @@ class TFConverter(object): if input_name not in self.tf_graph: self.tf_graph[input_name] = [] self.tf_graph[input_name].append(op) + if op.name not in self.tf_parents: + self.tf_parents[op.name] = [] + self.tf_parents[op.name].append(self.ops[input_name]) def add_buffer_to_image(self, input_name, input_type): output_name = input_name[:-2] + "_b2i" + input_name[-2:] @@ -465,6 +473,8 @@ class TFConverter(object): and self.tf_graph[final_op.name][0].type == 'BatchToSpaceND': final_op = self.tf_graph[final_op.name][0] self.resolved_ops[final_op.name] = 1 + self.unused_tensor.add(get_input_tensor(final_op, 1).name) + self.unused_tensor.add(get_input_tensor(final_op, 2).name) else: raise Exception('Convert atrous conv error: no BatchToSpaceND op') @@ -479,6 +489,36 @@ class TFConverter(object): self.add_output_shape(final_op.outputs, op_def) self.net_def.op.extend([op_def]) + def is_softmax(self, op): + return op.type == 'Softmax' and \ + len(self.tf_parents[op.name]) == 1 and self.tf_parents[op.name][0].type == 'Reshape' and \ + len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Reshape' + + def convert_softmax(self, softmax_op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + # deal with first Reshape op + parent_reshape_op = self.tf_parents[softmax_op.name][0] + op_def.input.extend([parent_reshape_op.inputs[0].name]) + self.unused_tensor.add(get_input_tensor(parent_reshape_op, 1).name) + self.resolved_ops[parent_reshape_op.name] = 1 + + # deal with Softmax op + op_def.name = softmax_op.name + op_def.type = softmax_op.type + self.resolved_ops[softmax_op.name] = 1 + + # deal with last 
Reshape op + reshape_op = self.tf_graph[softmax_op.name][0] + self.unused_tensor.add(get_input_tensor(reshape_op, 1).name) + + op_def.output.extend([output.name for output in reshape_op.outputs]) + self.add_output_shape(reshape_op.outputs, op_def) + self.resolved_ops[reshape_op.name] = 1 + def convert_normal_op(self, op): op_def = self.net_def.op.add() arg = op_def.arg.add() @@ -527,12 +567,13 @@ class TFConverter(object): self.convert_space_to_batch(op, False) elif op.type == 'BatchToSpaceND': self.convert_space_to_batch(op, True) + elif self.is_softmax(op): + self.convert_softmax(op) elif op.type in ['Relu']: self.convert_normal_op(op) else: raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) - for op in self.tf_ops: if self.resolved_ops[op.name] == 1: continue diff --git a/tools/side_gcn.config b/tools/side_gcn.config index d22d730bac70cce3f5c665b5c83c56334f1de319..c7e23e97c0b77fcdc359f347529b06409e77acf4 100644 --- a/tools/side_gcn.config +++ b/tools/side_gcn.config @@ -1,2 +1,2 @@ TF_INPUT_NODE=input_node -TF_OUTPUT_NODE=GCN/br_result_x/fcn_br \ No newline at end of file +TF_OUTPUT_NODE=softmax/Reshape_1 \ No newline at end of file
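
Note on the softmax conversion above: TensorFlow typically emits a softmax over a 4D tensor as Reshape -> Softmax -> Reshape, and convert_softmax folds that triple into a single MACE Softmax op that reads the first Reshape's input and produces the second Reshape's output. The sketch below illustrates the same pattern match on a toy graph; it is a minimal, standalone Python illustration, not MACE code, and Op, find_softmax_triples, and the node names are hypothetical stand-ins for the TF graph structures the converter actually walks.

    from collections import namedtuple

    # Hypothetical stand-in for a TF node: name, type, and input op names.
    Op = namedtuple('Op', ['name', 'type', 'inputs'])

    def find_softmax_triples(ops):
        """Yield (parent_reshape, softmax, child_reshape) triples that can be
        folded into one Softmax, mirroring is_softmax()'s conditions: the
        Softmax has exactly one Reshape parent and one Reshape consumer."""
        by_name = {op.name: op for op in ops}
        consumers = {}
        for op in ops:
            for inp in op.inputs:
                consumers.setdefault(inp, []).append(op)
        for op in ops:
            if op.type != 'Softmax':
                continue
            parents = [by_name[i] for i in op.inputs if i in by_name]
            children = consumers.get(op.name, [])
            if (len(parents) == 1 and parents[0].type == 'Reshape' and
                    len(children) == 1 and children[0].type == 'Reshape'):
                yield parents[0], op, children[0]

    if __name__ == '__main__':
        # Toy graph: conv -> Reshape -> Softmax -> Reshape.
        graph = [
            Op('conv', 'Conv2D', []),
            Op('softmax/Reshape', 'Reshape', ['conv']),
            Op('softmax/softmax', 'Softmax', ['softmax/Reshape']),
            Op('softmax/Reshape_1', 'Reshape', ['softmax/softmax']),
        ]
        for parent, softmax, child in find_softmax_triples(graph):
            # The folded op consumes the parent Reshape's input and emits the
            # child Reshape's output, so both Reshapes disappear.
            print('fold: %s consumes %s, outputs as %s'
                  % (softmax.name, parent.inputs[0], child.name))

Beyond the pattern match, the converter in the diff also marks each Reshape's shape tensor as unused and flags all three ops as resolved, so they are not converted again on the normal path.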