Commit 607a1f48 authored by: liuqi

Refactor: move all files in kernels directory to ops and remove kernels.

1. Move all files in kernels to ops
2. Unify the op_def_registry and op_registry.
3. Support op clipping: do not link an op's code when the op is not registered.
Parent d5679c53
......@@ -67,7 +67,6 @@ extra_tests:
stage: extra_tests
script:
- if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
- python tools/bazel_adb_run.py --target="//mace/kernels:kernels_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
platform_compatible_tests:
......
......@@ -94,7 +94,6 @@ jobs:
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=False --target_abis=armeabi-v7a || exit 1
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=False --target_abis=armeabi-v7a || exit 1
- echo 'Extra Test'
- python tools/bazel_adb_run.py --target="//mace/kernels:kernels_test" --run_target=False --target_abis=armeabi-v7a || exit 1
- python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=False --target_abis=armeabi-v7a || exit 1
env: TYPE=Extra-Test-ARMEABI-v7a
os: linux
......@@ -106,7 +105,6 @@ jobs:
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=False --target_abis=arm64-v8a || exit 1
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=False --target_abis=arm64-v8a || exit 1
- echo 'Extra Test on ARM64'
- python tools/bazel_adb_run.py --target="//mace/kernels:kernels_test" --run_target=False --target_abis=arm64-v8a || exit 1
- python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=False --target_abis=arm64-v8a || exit 1
env: TYPE=Extra-Test-ARM64-v8a
os: linux
......
......@@ -5,46 +5,24 @@ You can create a custom op if it is not supported yet.
To add a custom op, you need to follow these steps:
Register the new OpDef information
----------------------------------
Register the OpDef information, which declares the devices the operation can run on.
The registry file is `mace/ops/ops_def_register.cc`.
```c++
#include "mace/ops/ops_def_register.h"
namespace mace {
namespace ops {
void RegisterOpDefs(OpDefRegistryBase *op_def_registry) {
MACE_REGISTER_OP_DEF(
op_def_registry,
OpRegistrationBuilder("MyCustomOp")
.SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
return {DeviceType::CPU, DeviceType::GPU};
}));
......
}
} // namespace ops
} // namespace mace
```
Implement the Operation
-----------------------
The best way is to refer to the implementation of another operator (e.g. `/mace/kernels/activation.cc`).
Define the new Op class in `mace/kernels/my_custom_op.cc`.
1. CPU code: just write the code in `mace/kernels/my_custom_op.cc`.
2. GPU code: Kernel API is defined in `mace/kernels/my_custom_op.h`,
Kernel based on Image is realized in `mace/kernels/opencl/image/my_custom_op.cc`,
Kernel based on Buffer is realized in `mace/kernels/opencl/buffer/my_custom_op.cc`.
The best way is to refer to the implementation of another operator (e.g. `/mace/ops/activation.cc`).
Define the new Op class in `mace/ops/my_custom_op.cc`.
1. ARM kernels: the NEON kernel is located at `mace/ops/arm/my_custom_op.cc`.
2. GPU kernels: the OpenCL kernel API is defined in `mace/ops/opencl/my_custom_op.h` (a rough interface sketch follows the code block below),
* the Image-based kernel is implemented in `mace/ops/opencl/image/my_custom_op.cc`,
* the Buffer-based kernel is implemented in `mace/ops/opencl/buffer/my_custom_op.cc`,
* the OpenCL kernel file itself lives in `mace/ops/opencl/cl/my_custom_op.cl`,
* and the path of the OpenCL kernel file must be added to `mace/repository/opencl-kernel/opencl_kernel_configure.bzl`.
The structure of an Op is shown in the following code.
```c++
#include "mace/core/operator.h"
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class MyCustomOp;
......@@ -56,43 +34,34 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ActivationOp<DeviceType::GPU, T> : public Operation {
class MyCustomOp<DeviceType::GPU, T> : public Operation {
...
};
#endif // MACE_ENABLE_OPENCL
} // namespace ops
} // namespace mace
```
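For the GPU path, the header `mace/ops/opencl/my_custom_op.h` mentioned above typically declares an abstract kernel interface that the Image- and Buffer-based implementations derive from. The sketch below is only a rough, hypothetical illustration of such an interface; the class name and the `Compute` signature are assumptions for this example, not the actual MACE declarations.
```c++
// Hypothetical sketch only -- not the actual MACE header.
// Assumes the MACE core types (MaceStatus, OpContext, Tensor) are available.
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"

namespace mace {
namespace ops {

class OpenCLMyCustomOpKernel {  // illustrative name
 public:
  virtual ~OpenCLMyCustomOpKernel() = default;
  // Each backend (image / buffer) provides its own Compute implementation.
  virtual MaceStatus Compute(OpContext *context,
                             const Tensor *input,
                             Tensor *output) = 0;
};

}  // namespace ops
}  // namespace mace
```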
Register the Operation
-----------------------
1. Add the registration function in `mace/kernels/my_custom_op.cc`
```c++
#include "mace/core/operator.h"
namespace mace {
namespace kernels {
void RegisterMyCustomOp(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp,
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp,
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp,
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
} // namespace mace
```
2. Then register the new Op in `mace/kernels/ops_register.cc`.
Register the Operation
-----------------------
Register the new Op in `mace/ops/ops_register.cc`.
```
#include "mace/kernels/ops_register.h"
#include "mace/ops/ops_register.h"
namespace mace {
namespace ops {
......
......@@ -112,7 +112,8 @@ There are two common advanced use cases:
- converting model to C++ code.
- tuning GPU kernels for a specific SoC.
* **Convert model(s) to C++ code**
Convert model(s) to C++ code
--------------------------------
* **1. Change the model deployment file(.yml)**
......@@ -204,7 +205,8 @@ There are two common advanced use cases:
// ... Same with the code in basic usage
* **Tuning for specific SoC's GPU**
Tuning for specific SoC's GPU
---------------------------------
If you want to use the GPU of a specific device, you can just specify ``target_socs`` in your YAML file and
then tune the MACE lib (OpenCL kernels) for it, which may yield a 1~10% performance improvement.
......@@ -375,25 +377,52 @@ Use ``-h`` to get detailed help.
Reduce Library Size
-------------------
* Build for your own usage purpose.
* **dynamic library**
The dynamic library generated by the script ``tools/build-standalone-lib.sh`` is about ``1.6M`` for
``armeabi-v7a`` and ``2.1M`` for ``arm64-v8a``. It can be reduced by modifying some build options.
- If the models don't need to run on the ``dsp`` device, change the build option ``--define hexagon=true``
to ``false``; this reduces the library size by about ``100KB``.
- Furthermore, if only the ``cpu`` device is needed, change ``--define opencl=true`` to ``false``. This
roughly halves the library size, to about ``700KB`` for ``armeabi-v7a`` and ``1000KB`` for ``arm64-v8a``.
- About ``300KB`` can be saved by adding the ``--config symbol_hidden`` build option. It changes the
visibility of internal APIs in libmace.so and leads to linking errors when loading model(s) in ``code``
mode, but has no effect on ``file`` mode.
* **static library**
- The methods for the dynamic library are useful for the static library too. In addition, the static
library may also contain the model graph and model data if the configs ``model_graph_format`` and
``model_data_format`` in the deployment file are set to ``code``.
- It is recommended to use the ``version script`` and ``strip`` features when linking the MACE static library. The effect is remarkable.
* Remove the unused ops.
Remove the registration of the ops that are not used by your models in ``mace/ops/ops_register.cc``,
which will reduce the library size significantly; the final binary only links the registered ops' code.
```
#include "mace/ops/ops_register.h"
namespace mace {
namespace ops {
// Just leave the ops used in your models
...
} // namespace ops
OpRegistry::OpRegistry() : OpRegistryBase() {
// Just leave the ops used in your models
...
ops::RegisterMyCustomOp(this);
...
}
} // namespace mace
```
......@@ -23,9 +23,10 @@
#include <cstring>
#include "mace/core/macros.h"
#include "mace/core/registry.h"
#include "mace/core/types.h"
#include "mace/core/runtime_failure_mock.h"
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
namespace mace {
......
......@@ -27,8 +27,7 @@
namespace mace {
SerialNet::SerialNet(OpDefRegistryBase *op_def_registry,
const OpRegistryBase *op_registry,
SerialNet::SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def,
Workspace *ws,
Device *target_device,
......@@ -41,15 +40,7 @@ SerialNet::SerialNet(OpDefRegistryBase *op_def_registry,
target_device->cpu_runtime()->policy(),
target_device->cpu_runtime()->use_gemmlowp())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// Register Operations
MaceStatus status;
for (int idx = 0; idx < net_def->op_types_size(); ++idx) {
status = op_def_registry->Register(net_def->op_types(idx));
MACE_CHECK(status == MaceStatus::MACE_SUCCESS, status.information());
}
// Create Operations
operators_.clear();
const OpRegistrationInfo *info;
DeviceType target_device_type = target_device_->device_type();
OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) {
......@@ -59,16 +50,13 @@ SerialNet::SerialNet(OpDefRegistryBase *op_def_registry,
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(target_device_type));
if (op_device == target_device_type) {
// Find op registration information
status = op_def_registry->Find(operator_def.type(), &info);
MACE_CHECK(status == MaceStatus::MACE_SUCCESS, status.information());
// Get available devices (sorted based on priority)
OperatorDef temp_def(operator_def);
auto available_devices = info->device_place_func_();
auto available_devices = op_registry->AvailableDevices(temp_def.type());
// Find the device type to run the op.
// If the target_device_type in available devices, use target_device_type,
// otherwise, fallback to the first device (top priority).
DeviceType device_type = available_devices[0];
// otherwise, fallback to CPU device.
DeviceType device_type = DeviceType::CPU;
construct_context.set_device(cpu_device_);
for (auto device : available_devices) {
if (device == target_device_type) {
......
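Here is a minimal standalone sketch of the device-selection fallback introduced above. The function name and the enumerator values are illustrative; in MACE, `AvailableDevices` returns a `std::set<DeviceType>` as declared in `mace/core/operator.h`.
```c++
// Minimal sketch of the new device-selection fallback; not the exact MACE code.
#include <set>

enum DeviceType { CPU, GPU, HEXAGON };  // enumerator values illustrative only

DeviceType ChooseDeviceType(const std::set<DeviceType> &available_devices,
                            DeviceType target_device_type) {
  // Default to CPU; only switch when the target device is actually
  // supported by the op (mirrors the loop in SerialNet's constructor).
  DeviceType device_type = CPU;
  for (auto device : available_devices) {
    if (device == target_device_type) {
      device_type = target_device_type;
      break;
    }
  }
  return device_type;
}
```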
......@@ -21,8 +21,6 @@
#include <unordered_map>
#include <sstream>
#include "mace/core/op_def_registry.h"
#include "mace/core/operator.h"
namespace mace {
......@@ -45,8 +43,7 @@ class NetBase {
class SerialNet : public NetBase {
public:
SerialNet(OpDefRegistryBase *op_def_registry,
const OpRegistryBase *op_registry,
SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def,
Workspace *ws,
Device *target_device,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/utils/logging.h"
namespace mace {
void AddOpRegistrar(OpDefRegistryBase *registry,
const OpRegistrationBuilder &builder) {
registry->AddRegistrar(
builder.name(),
[builder](OpRegistrationInfo *info){
builder.Finalize(info);
});
}
OpRegistrationBuilder::OpRegistrationBuilder(const std::string name)
: name_(name) {}
const std::string OpRegistrationBuilder::name() const { return name_; }
OpRegistrationBuilder &OpRegistrationBuilder::SetDevicePlaceFunc(
std::vector<DeviceType> (*func)()) {
info_.device_place_func_ = func;
return *this;
}
void OpRegistrationBuilder::Finalize(OpRegistrationInfo *info) const {
*info = info_;
}
void OpDefRegistryBase::AddRegistrar(const std::string name,
const OpRegistrar &registrar) {
registrar_.emplace(name, registrar);
}
MaceStatus OpDefRegistryBase::Register(const std::string &name) {
VLOG(3) << "Registering operation definition: " << name;
if (registry_.find(name) != registry_.end()) {
return MaceStatus::MACE_SUCCESS;
}
auto iter = registrar_.find(name);
if (iter == registrar_.end()) {
return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
"MACE do not support the operation: " + name);
}
registry_.emplace(
name, std::unique_ptr<OpRegistrationInfo>(new OpRegistrationInfo()));
iter->second(registry_[name].get());
return MaceStatus::MACE_SUCCESS;
}
MaceStatus OpDefRegistryBase::Find(const std::string &name,
const OpRegistrationInfo **info) {
auto iter = registry_.find(name);
if (iter == registry_.end()) {
*info = nullptr;
return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
"Mace do not support the operation: " + name);
}
*info = iter->second.get();
return MaceStatus::MACE_SUCCESS;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OP_DEF_REGISTRY_H_
#define MACE_CORE_OP_DEF_REGISTRY_H_
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
namespace mace {
// Device placement function
typedef std::function<std::vector<DeviceType>()> DevicePlaceFunc;
struct OpRegistrationInfo {
OpRegistrationInfo() = default;
explicit OpRegistrationInfo(const DevicePlaceFunc &func)
: device_place_func_(func) {}
DevicePlaceFunc device_place_func_;
};
class OpRegistrationBuilder {
public:
explicit OpRegistrationBuilder(const std::string name);
const std::string name() const;
OpRegistrationBuilder &SetDevicePlaceFunc(
std::vector<DeviceType> (*func)());
void Finalize(OpRegistrationInfo *info) const;
private:
std::string name_;
OpRegistrationInfo info_;
};
class OpDefRegistryBase {
public:
typedef std::function<void(OpRegistrationInfo *)> OpRegistrar;
OpDefRegistryBase() = default;
virtual ~OpDefRegistryBase() = default;
void AddRegistrar(const std::string name, const OpRegistrar &registrar);
MaceStatus Register(const std::string &name);
MaceStatus Find(const std::string &name, const OpRegistrationInfo **info);
private:
std::unordered_map<std::string, OpRegistrar> registrar_;
std::unordered_map<
std::string,
std::unique_ptr<OpRegistrationInfo>> registry_;
MACE_DISABLE_COPY_AND_ASSIGN(OpDefRegistryBase);
};
void AddOpRegistrar(OpDefRegistryBase *registry,
const OpRegistrationBuilder &builder);
#define MACE_REGISTER_OP_DEF(op_def_registry, builder) \
AddOpRegistrar(op_def_registry, builder)
} // namespace mace
#endif // MACE_CORE_OP_DEF_REGISTRY_H_
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <sstream>
#include <map>
#include <memory>
#include <vector>
......@@ -79,7 +80,26 @@ MaceStatus Operation::Init(OpInitContext *context) {
return MaceStatus::MACE_SUCCESS;
}
OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
// op registry
namespace {
class OpKeyBuilder {
public:
explicit OpKeyBuilder(const std::string &op_name);
OpKeyBuilder &Device(DeviceType device);
OpKeyBuilder &TypeConstraint(const char *attr_name,
DataType allowed);
const std::string Build();
private:
std::string op_name_;
DeviceType device_type_;
std::map<std::string, DataType> type_constraint_;
};
OpKeyBuilder::OpKeyBuilder(const std::string &op_name) : op_name_(op_name) {}
OpKeyBuilder &OpKeyBuilder::Device(DeviceType device) {
device_type_ = device;
......@@ -103,16 +123,53 @@ const std::string OpKeyBuilder::Build() {
return ss.str();
}
} // namespace
void OpRegistrationInfo::AddDevice(mace::DeviceType device) {
devices.insert(device);
}
void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) {
VLOG(3) << "Registering: " << key;
MACE_CHECK(creators.count(key) == 0, "Key already registered: ", key);
creators[key] = creator;
}
MaceStatus OpRegistryBase::Register(const std::string &op_type,
const mace::DeviceType device_type,
const mace::DataType dt,
mace::OpRegistrationInfo::OpCreator creator) {
if (registry_.count(op_type) == 0) {
registry_[op_type] = std::unique_ptr<OpRegistrationInfo>(
new OpRegistrationInfo);
}
registry_[op_type]->AddDevice(device_type);
std::string op_key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dt)
.Build();
registry_.at(op_type)->Register(op_key, creator);
return MaceStatus::MACE_SUCCESS;
}
const std::set<DeviceType> OpRegistryBase::AvailableDevices(
const std::string &op_type) const {
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
return registry_.at(op_type)->devices;
}
OpRegistryBase::~OpRegistryBase() = default;
std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context,
DeviceType device_type,
const NetMode mode) const {
OperatorDef *operator_def = context->operator_def();
const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT));
const DataType dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT)));
const int op_mode_i = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "mode", static_cast<int>(NetMode::NORMAL));
const NetMode op_mode = static_cast<NetMode>(op_mode_i);
......@@ -120,15 +177,20 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
<< operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type;
if (op_mode == mode) {
return registry_.Create(
OpKeyBuilder(operator_def->type().data())
.Device(device_type)
.TypeConstraint("T", static_cast<DataType>(dtype))
.Build(),
context);
const std::string op_type = context->operator_def()->type();
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
}
return registry_.at(op_type)->creators.at(key)(context);
} else {
return nullptr;
}
}
} // namespace mace
......@@ -16,13 +16,13 @@
#define MACE_CORE_OPERATOR_H_
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include <map>
#include "mace/core/arg_helper.h"
#include "mace/core/op_context.h"
#include "mace/core/registry.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/proto/mace.pb.h"
......@@ -160,62 +160,57 @@ class Operation {
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ }
class OpKeyBuilder {
public:
explicit OpKeyBuilder(const char *op_name);
OpKeyBuilder &Device(DeviceType device);
struct OpRegistrationInfo {
public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
OpCreator;
OpKeyBuilder &TypeConstraint(const char *attr_name,
DataType allowed);
OpRegistrationInfo() = default;
template <typename T>
OpKeyBuilder &TypeConstraint(const char *attr_name);
void AddDevice(DeviceType);
const std::string Build();
void Register(const std::string &key, OpCreator creator);
private:
std::string op_name_;
DeviceType device_type_;
std::map<std::string, DataType> type_constraint_;
std::set<DeviceType> devices;
std::unordered_map<std::string, OpCreator> creators;
};
template <typename T>
OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
return this->TypeConstraint(attr_name, DataTypeToEnum<T>::value);
}
class OpRegistryBase {
public:
typedef Registry<std::string,
Operation,
OpConstructContext *>
RegistryType;
OpRegistryBase() = default;
virtual ~OpRegistryBase();
RegistryType *registry() { return &registry_; }
virtual ~OpRegistryBase() = default;
MaceStatus Register(const std::string &op_type,
const DeviceType device_type,
const DataType dt,
OpRegistrationInfo::OpCreator creator);
const std::set<DeviceType> AvailableDevices(
const std::string &op_type) const;
std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context,
DeviceType device_type,
const NetMode mode) const;
template <class DerivedType>
static std::unique_ptr<Operation> DefaultCreator(
OpConstructContext *context) {
return std::unique_ptr<Operation>(new DerivedType(context));
}
private:
RegistryType registry_;
std::unordered_map<
std::string,
std::unique_ptr<OpRegistrationInfo>> registry_;
MACE_DISABLE_COPY_AND_ASSIGN(OpRegistryBase);
};
MACE_DECLARE_REGISTRY(OpRegistry,
Operation,
OpConstructContext *);
#define MACE_REGISTER_OP(op_registry, op_type, class_name, device, dt) \
MACE_REGISTER_CLASS(OpRegistry, \
op_registry->registry(), \
OpKeyBuilder(op_type) \
.Device(device) \
.TypeConstraint<dt>("T") \
.Build(), \
class_name<device, dt>)
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name<device, dt>>)
} // namespace mace
......
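For reference, the new `MACE_REGISTER_OP` macro no longer goes through the generic `Registry`/`Registerer` machinery; it calls `OpRegistryBase::Register` directly. Given the macro definition above, a registration line like the one in the custom-op documentation expands roughly as follows (illustrative expansion, assuming a `MyCustomOp` class template as in that documentation; not standalone code):
```c++
// MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp, DeviceType::CPU, float);
// expands to:
op_registry->Register(
    "MyCustomOp",
    DeviceType::CPU,
    DataTypeToEnum<float>::value,
    OpRegistryBase::DefaultCreator<MyCustomOp<DeviceType::CPU, float>>);
```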
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_REGISTRY_H_
#define MACE_CORE_REGISTRY_H_
#include <functional>
#include <map>
#include <memory>
#include <mutex> // NOLINT(build/c++11)
#include <string>
#include <vector>
#include "mace/utils/logging.h"
namespace mace {
template <class SrcType, class ObjectType, class... Args>
class Registry {
public:
typedef std::function<std::unique_ptr<ObjectType>(Args...)> Creator;
Registry() : registry_() {}
void Register(const SrcType &key, Creator creator) {
VLOG(3) << "Registering: " << key;
std::lock_guard<std::mutex> lock(register_mutex_);
MACE_CHECK(registry_.count(key) == 0, "Key already registered: ", key);
registry_[key] = creator;
}
std::unique_ptr<ObjectType> Create(const SrcType &key, Args... args) const {
if (registry_.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
}
return registry_.at(key)(args...);
}
private:
std::map<SrcType, Creator> registry_;
std::mutex register_mutex_;
MACE_DISABLE_COPY_AND_ASSIGN(Registry);
};
template <class SrcType, class ObjectType, class... Args>
class Registerer {
public:
Registerer(const SrcType &key,
Registry<SrcType, ObjectType, Args...> *registry,
typename Registry<SrcType, ObjectType, Args...>::Creator creator) {
registry->Register(key, creator);
}
template <class DerivedType>
static std::unique_ptr<ObjectType> DefaultCreator(Args... args) {
return std::unique_ptr<ObjectType>(new DerivedType(args...));
}
};
#define MACE_CONCATENATE_IMPL(s1, s2) s1##s2
#define MACE_CONCATENATE(s1, s2) MACE_CONCATENATE_IMPL(s1, s2)
#ifdef __COUNTER__
#define MACE_ANONYMOUS_VARIABLE(str) MACE_CONCATENATE(str, __COUNTER__)
#else
#define MACE_ANONYMOUS_VARIABLE(str) MACE_CONCATENATE(str, __LINE__)
#endif
#define MACE_DECLARE_TYPED_REGISTRY(RegistryName, SrcType, ObjectType, ...) \
typedef Registerer<SrcType, ObjectType, ##__VA_ARGS__> \
Registerer##RegistryName;
#define MACE_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
MACE_DECLARE_TYPED_REGISTRY(RegistryName, std::string, ObjectType, \
##__VA_ARGS__)
#define MACE_REGISTER_TYPED_CLASS(RegistryName, registry, key, ...) \
Registerer##RegistryName MACE_ANONYMOUS_VARIABLE(RegistryName)( \
key, registry, Registerer##RegistryName::DefaultCreator<__VA_ARGS__>);
#define MACE_REGISTER_CLASS(RegistryName, registry, key, ...) \
MACE_REGISTER_TYPED_CLASS(RegistryName, registry, key, __VA_ARGS__)
} // namespace mace
#endif // MACE_CORE_REGISTRY_H_
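As a usage note for the generic `Registry`/`Registerer` templates above: any key type and any set of factory arguments can be plugged in. The sketch below uses made-up `Shape`/`Circle` types purely for illustration and assumes the `mace/core/registry.h` shown above is on the include path; it is not part of MACE.
```c++
#include <iostream>
#include <memory>
#include <string>

#include "mace/core/registry.h"  // the template shown above

// Toy hierarchy, for illustration only.
struct Shape {
  virtual ~Shape() = default;
  virtual double Area() const = 0;
};
struct Circle : Shape {
  explicit Circle(double r) : r_(r) {}
  double Area() const override { return 3.14159 * r_ * r_; }
  double r_;
};

int main() {
  // Key type: std::string, object type: Shape, creator argument: double.
  mace::Registry<std::string, Shape, double> registry;
  registry.Register(
      "Circle",
      mace::Registerer<std::string, Shape, double>::DefaultCreator<Circle>);
  std::unique_ptr<Shape> circle = registry.Create("Circle", 2.0);
  std::cout << circle->Area() << std::endl;  // prints ~12.566
  return 0;
}
```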
......@@ -15,10 +15,11 @@
#ifndef MACE_CORE_TENSOR_H_
#define MACE_CORE_TENSOR_H_
#include <algorithm>
#include <functional>
#include <numeric>
#include <string>
#include <vector>
#include <functional>
#include <algorithm>
#include "mace/core/buffer.h"
#include "mace/core/preallocated_pooled_allocator.h"
......
# Description:
# Mace neon kernels.
#
package(
default_visibility = ["//visibility:public"],
)
licenses(["notice"]) # Apache 2.0
load(
"//mace:mace.bzl",
"if_android",
"if_neon_enabled",
"if_openmp_enabled",
"if_android_armv7",
"if_hexagon_enabled",
"if_opencl_enabled",
)
cc_library(
name = "kernels",
srcs = glob(
[
"*.cc",
"arm/*.cc",
],
exclude = [
"*_test.cc",
"*_benchmark.cc",
"arm/*_test.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
],
) + if_opencl_enabled(glob(
[
"opencl/*.cc",
"opencl/image/*.cc",
"opencl/buffer/*.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
],
exclude = [
"opencl/*_test.cc",
],
)),
hdrs = glob(
[
"*.h",
"arm/*.h",
],
) + if_opencl_enabled(glob([
"opencl/*.h",
"opencl/image/*.h",
"opencl/buffer/*.h",
])),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = if_android(["-lm"]),
deps = [
"//mace/core",
"@gemmlowp",
"@tflite",
],
)
cc_test(
name = "kernels_test",
testonly = 1,
srcs = glob(
[
"*_test.cc",
"arm/*_test.cc",
"opencl/*_test.cc",
],
),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
":kernels",
"//mace/ops",
"@gtest",
"@gtest//:gtest_main",
],
)
cc_test(
name = "kernels_benchmark",
testonly = 1,
srcs = glob(["*_benchmark.cc"]),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
":kernels",
"//mace/core:test_benchmark_main",
"//mace/ops",
"//third_party/eigen3",
"@gemmlowp",
],
)
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <Eigen/Dense>
#include <algorithm>
#include <string>
#include <tuple>
#include <vector>
#include "public/gemmlowp.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/gemm.h"
#include "mace/kernels/sgemm.h"
#include "mace/ops/ops_test_util.h"
namespace gemmlowp {
template<typename tScalar, MapOrder tOrder>
class Matrix : public MatrixMap<tScalar, tOrder> {
public:
typedef MatrixMap<tScalar, tOrder> Map;
typedef MatrixMap<const tScalar, tOrder> ConstMap;
typedef typename Map::Scalar Scalar;
static const MapOrder Order = tOrder;
using Map::cols_;
using Map::data_;
using Map::kOrder;
using Map::rows_;
using Map::stride_;
public:
Matrix() : Map(nullptr, 0, 0, 0) {}
Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }
Matrix(const Matrix &other) : Map(nullptr, 0, 0, 0) { *this = other; }
Matrix &operator=(const Matrix &other) {
Resize(other.rows_, other.cols_);
std::memcpy(data_, other.data_, size() * sizeof(Scalar));
return *this;
}
friend bool operator==(const Matrix &a, const Matrix &b) {
return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
!std::memcmp(a.data_, b.data_, a.size());
}
void Resize(int rows, int cols) {
rows_ = rows;
cols_ = cols;
stride_ = kOrder == gemmlowp::MapOrder::ColMajor ? rows : cols;
storage.resize(size());
data_ = storage.data();
}
int size() const { return rows_ * cols_; }
Map &map() { return *static_cast<Map *>(this); }
ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }
protected:
std::vector<Scalar> storage;
};
template<typename MatrixType>
void MakeZero(MatrixType *m) {
for (int c = 0; c < m->cols(); c++) {
for (int r = 0; r < m->rows(); r++) {
(*m)(r, c) = 128;
}
}
}
} // namespace gemmlowp
namespace mace {
namespace kernels {
namespace test {
// Test the speed of different access order of a NHWC buffer
namespace {
// Matmul with (m, k) x (k, n)
void MatmulBenchmark_Mace(int iters, int m, int k, int n) {
mace::testing::StopTiming();
std::vector<float> lhs(m * k);
std::vector<float> rhs(k * n);
std::vector<float> result(m * n);
// warm up
Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data());
mace::testing::StartTiming();
while (iters--) {
Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data());
}
}
void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) {
mace::testing::StopTiming();
std::vector<float> lhs(m * k);
std::vector<float> rhs(k * n);
std::vector<float> result(m * n);
kernels::MatrixMap<const float> matrix_lhs(1, m, k, RowMajor, lhs.data(),
true);
kernels::MatrixMap<const float> matrix_rhs(1, k, n, RowMajor, rhs.data(),
true);
kernels::MatrixMap<float> matrix_result(1, m, n, RowMajor, result.data());
kernels::SGemm sgemm;
sgemm(matrix_lhs, matrix_rhs, &matrix_result);
mace::testing::StartTiming();
while (iters--) {
sgemm(matrix_lhs, matrix_rhs, &matrix_result);
}
}
void MatmulBenchmark_Eigen(int iters, int m, int k, int n) {
mace::testing::StopTiming();
Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(m, k);
Eigen::MatrixXf rhs = Eigen::MatrixXf::Random(k, n);
Eigen::MatrixXf result = Eigen::MatrixXf::Zero(m, n);
// warm up
result = lhs * rhs;
mace::testing::StartTiming();
while (iters--) {
result = lhs * rhs;
}
}
void MatmulBenchmark_gemmlowp_uint8(int iters, int rows, int depth, int cols) {
mace::testing::StopTiming();
gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::RowMajor> lhs;
gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::ColMajor> rhs;
gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::ColMajor> result;
lhs.Resize(rows, depth);
rhs.Resize(depth, cols);
result.Resize(rows, cols);
gemmlowp::MakeZero(&lhs);
gemmlowp::MakeZero(&rhs);
gemmlowp::MakeZero(&result);
gemmlowp::OutputStageQuantizeDownInt32ByFixedPoint quantize_down_stage;
quantize_down_stage.result_offset_after_shift = 128;
quantize_down_stage.result_fixedpoint_multiplier = 1234567890;
quantize_down_stage.result_shift = 16;
gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
const auto output_pipeline =
std::make_tuple(quantize_down_stage, saturating_cast_stage);
auto gemm_context =
mace::ops::test::OpTestContext::Get()
->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-128, output_pipeline);
mace::testing::StartTiming();
while (iters--) {
gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
BitDepthParams>(
gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-128, output_pipeline);
}
}
void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
mace::testing::StopTiming();
gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::RowMajor> lhs;
gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::ColMajor> rhs;
gemmlowp::Matrix<std::int32_t, gemmlowp::MapOrder::ColMajor> result;
lhs.Resize(rows, depth);
rhs.Resize(depth, cols);
result.Resize(rows, cols);
gemmlowp::MakeZero(&lhs);
gemmlowp::MakeZero(&rhs);
gemmlowp::MakeZero(&result);
const auto output_pipeline = std::make_tuple();
auto gemm_context =
mace::ops::test::OpTestContext::Get()
->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-128, output_pipeline);
mace::testing::StartTiming();
while (iters--) {
gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
BitDepthParams>(
gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-128, output_pipeline);
}
}
} // namespace
#define MACE_BM_MATMUL_FUNC(M, K, N, FUNC, TYPE) \
static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * M * K * N; \
const int64_t tot = static_cast<int64_t>(iters) * (M + N) * K; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot * sizeof(TYPE)); \
MatmulBenchmark_##FUNC(iters, M, K, N); \
} \
MACE_BENCHMARK(MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC)
#define MACE_BM_MATMUL(M, K, N) \
MACE_BM_MATMUL_FUNC(M, K, N, Mace, float); \
MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float); \
MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float); \
MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \
MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_int32, uint8_t);
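// Note: each MACE_BM_MATMUL(M, K, N) line below expands, via
// MACE_BM_MATMUL_FUNC, into one benchmark per backend. For example,
// MACE_BM_MATMUL(7, 384, 384) defines MACE_BM_MATMUL_7_384_384_Mace,
// ..._Mace_SGemm, ..._Eigen, ..._gemmlowp_uint8 and ..._gemmlowp_int32, each
// reporting macc = iters * M * K * N and bytes = iters * (M + N) * K *
// sizeof(TYPE) before timing the corresponding MatmulBenchmark_* function.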
// Embedding size 384
MACE_BM_MATMUL(7, 384, 384);
MACE_BM_MATMUL(7, 384, 1536);
MACE_BM_MATMUL(7, 1536, 384);
MACE_BM_MATMUL(15, 384, 384);
MACE_BM_MATMUL(15, 384, 1536);
MACE_BM_MATMUL(15, 1536, 384);
MACE_BM_MATMUL(1, 256, 256);
MACE_BM_MATMUL(1, 256, 1536);
MACE_BM_MATMUL(1, 1536, 256);
MACE_BM_MATMUL(256, 256, 1);
MACE_BM_MATMUL(1536, 256, 1);
MACE_BM_MATMUL(256, 1536, 1);
MACE_BM_MATMUL(29792, 256, 1);
MACE_BM_MATMUL(1, 256, 29792);
MACE_BM_MATMUL(2, 256, 256);
MACE_BM_MATMUL(2, 256, 1536);
MACE_BM_MATMUL(2, 1536, 256);
MACE_BM_MATMUL(3, 256, 256);
MACE_BM_MATMUL(3, 256, 1536);
MACE_BM_MATMUL(3, 1536, 256);
MACE_BM_MATMUL(4, 256, 256);
MACE_BM_MATMUL(4, 256, 1536);
MACE_BM_MATMUL(4, 1536, 256);
MACE_BM_MATMUL(8, 256, 256);
MACE_BM_MATMUL(8, 256, 1536);
MACE_BM_MATMUL(8, 1536, 256);
MACE_BM_MATMUL(10, 256, 256);
MACE_BM_MATMUL(10, 256, 1536);
MACE_BM_MATMUL(10, 1536, 256);
MACE_BM_MATMUL(15, 256, 256);
MACE_BM_MATMUL(15, 256, 1536);
MACE_BM_MATMUL(15, 1536, 256);
// Embedding size 128
MACE_BM_MATMUL(1, 128, 1536);
MACE_BM_MATMUL(1, 128, 44678);
// MobileNet
MACE_BM_MATMUL(128, 128, 3136);
MACE_BM_MATMUL(256, 256, 784);
MACE_BM_MATMUL(512, 512, 196);
MACE_BM_MATMUL(1024, 1024, 49);
} // namespace test
} // namespace kernels
} // namespace mace
......@@ -40,7 +40,6 @@ cc_library(
deps = [
"//mace/public",
"//mace/ops",
"//mace/kernels",
],
alwayslink = 1,
)
......@@ -79,7 +78,7 @@ genrule(
srcs = [
"//mace/codegen:generated_version",
"//mace/core",
"//mace/kernels",
"//mace/ops:internal_ops",
"//mace/ops",
"//mace/libmace",
"//mace/utils",
......@@ -93,7 +92,7 @@ genrule(
"mri_stream=$$(python $(location //mace/python/tools:archive_static_lib) " +
"$(locations //mace/codegen:generated_version) " +
"$(locations //mace/core:core) " +
"$(locations //mace/kernels:kernels) " +
"$(locations //mace/ops:internal_ops) " +
"$(locations //mace/ops:ops) " +
"$(locations //mace/libmace:libmace) " +
"$(locations //mace/utils:utils) " +
......
......@@ -22,8 +22,7 @@
#include "mace/core/net.h"
#include "mace/core/device_context.h"
#include "mace/kernels/ops_register.h"
#include "mace/ops/ops_def_register.h"
#include "mace/ops/ops_registry.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -359,7 +358,6 @@ class MaceEngine::Impl {
private:
const unsigned char *model_data_;
size_t model_data_size_;
std::unique_ptr<OpDefRegistryBase> op_def_registry_;
std::unique_ptr<OpRegistryBase> op_registry_;
DeviceType device_type_;
std::unique_ptr<Device> device_;
......@@ -377,7 +375,6 @@ class MaceEngine::Impl {
MaceEngine::Impl::Impl(const MaceEngineConfig &config)
: model_data_(nullptr),
model_data_size_(0),
op_def_registry_(new OpDefRegistry()),
op_registry_(new OpRegistry),
device_type_(config.impl_->device_type()),
device_(nullptr),
......@@ -466,7 +463,6 @@ MaceStatus MaceEngine::Impl::Init(
// Init model
auto net = std::unique_ptr<NetBase>(new SerialNet(
op_def_registry_.get(),
op_registry_.get(),
net_def,
ws_.get(),
......@@ -474,8 +470,7 @@ MaceStatus MaceEngine::Impl::Init(
NetMode::INIT));
MACE_RETURN_IF_ERROR(net->Init());
MACE_RETURN_IF_ERROR(net->Run());
net_ = std::unique_ptr<NetBase>(new SerialNet(op_def_registry_.get(),
op_registry_.get(),
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
net_def,
ws_.get(),
device_.get()));
......
# Description:
# Mace operators.
#
package(
default_visibility = ["//visibility:public"],
......@@ -18,18 +17,58 @@ load(
)
cc_library(
name = "test",
testonly = 1,
hdrs = glob([
"*_test_util.h",
]),
srcs = [
"ops_test_util.cc",
],
name = "internal_ops",
srcs = glob(
[
"*.cc",
"arm/*.cc",
],
exclude = [
"*_test.cc",
"*_benchmark.cc",
"arm/*_test.cc",
"ops_registry.cc",
"ops_test_util.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
],
) + if_opencl_enabled(glob(
[
"opencl/*.cc",
"opencl/image/*.cc",
"opencl/buffer/*.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
],
exclude = [
"opencl/*_test.cc",
],
)),
hdrs = glob(
[
"*.h",
"arm/*.h",
],
exclude = [
"ops_registry.h",
"ops_test_util.h",
]
) + if_opencl_enabled(glob([
"opencl/*.h",
"opencl/image/*.h",
"opencl/buffer/*.h",
])),
copts = [
"-Werror",
"-Wextra",
] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
......@@ -40,20 +79,54 @@ cc_library(
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = if_android(["-lm"]),
deps = [
"ops",
"//mace/kernels",
"@gtest",
"//mace/core",
"@gemmlowp",
"@tflite",
],
)
cc_library(
name = "ops",
srcs = [
"ops_def_register.cc",
"ops_registry.cc"
],
hdrs = [
"ops_def_register.h",
"ops_registry.h",
],
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
linkopts = if_android(["-lm"]),
deps = [
"internal_ops",
],
)
cc_library(
name = "test",
testonly = 1,
hdrs = glob([
"*_test_util.h",
]),
srcs = [
"ops_test_util.cc",
],
copts = [
"-Werror",
......@@ -70,7 +143,8 @@ cc_library(
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
"//mace/core",
"ops",
"@gtest",
],
)
......@@ -78,16 +152,22 @@ cc_test(
name = "ops_test",
testonly = 1,
srcs = glob(
["*_test.cc"],
[
"*_test.cc",
"arm/*_test.cc",
"opencl/*_test.cc",
],
),
copts = [
"-Werror",
"-Wextra",
] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -97,8 +177,7 @@ cc_test(
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
":ops",
":test",
"test",
"@gtest//:gtest_main",
],
)
......@@ -110,11 +189,13 @@ cc_test(
copts = [
"-Werror",
"-Wextra",
] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -124,8 +205,9 @@ cc_test(
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
":ops",
":test",
"test",
"//mace/core:test_benchmark_main",
"//third_party/eigen3",
"@gemmlowp",
],
)
......@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/activation.h"
#include "mace/ops/activation.h"
#include <memory>
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/activation.h"
#include "mace/ops/opencl/image/activation.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class ActivationOp;
......@@ -33,7 +33,7 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
public:
explicit ActivationOp(OpConstructContext *context)
: Operation(context),
activation_(kernels::StringToActivationType(
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit",
......@@ -74,7 +74,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
public:
explicit ActivationOp(OpConstructContext *context)
: Operation(context) {
ActivationType type = kernels::StringToActivationType(
ActivationType type = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"));
auto relux_max_limit = static_cast<T>(
......@@ -114,5 +114,5 @@ void RegisterActivation(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,19 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ACTIVATION_H_
#define MACE_KERNELS_ACTIVATION_H_
#ifndef MACE_OPS_ACTIVATION_H_
#define MACE_OPS_ACTIVATION_H_
#include <algorithm>
#include <cmath>
#include <string>
#include "mace/core/types.h"
#include "mace/kernels/arm/activation_neon.h"
#include "mace/ops/arm/activation_neon.h"
#include "mace/utils/logging.h"
namespace mace {
namespace kernels {
namespace ops {
enum ActivationType {
NOOP = 0,
......@@ -149,7 +149,7 @@ void PReLUActivation(const T *input_ptr,
}
}
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_ACTIVATION_H_
#endif // MACE_OPS_ACTIVATION_H_
......@@ -14,7 +14,6 @@
#include <string>
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -46,7 +45,7 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluBM")
.Input("InputImage")
......@@ -108,7 +107,7 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxBM")
.Input("InputImage")
......@@ -186,9 +185,9 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluBM")
.Input("InputImage")
......@@ -251,7 +250,7 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhBM")
.Input("InputImage")
......@@ -318,7 +317,7 @@ void SigmoidBenchmark(
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidBM")
.Input("InputImage")
......
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -33,7 +32,7 @@ void TestSimpleRelu() {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest")
.Input("InputImage")
......@@ -46,7 +45,7 @@ void TestSimpleRelu() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
......@@ -81,7 +80,7 @@ void TestUnalignedSimpleRelu() {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest")
.Input("InputImage")
......@@ -94,7 +93,7 @@ void TestUnalignedSimpleRelu() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
......@@ -132,7 +131,7 @@ void TestSimpleRelux() {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage")
......@@ -146,7 +145,7 @@ void TestSimpleRelux() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
......@@ -182,7 +181,7 @@ void TestSimpleReluRelux() {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage")
......@@ -196,7 +195,7 @@ void TestSimpleReluRelux() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
......@@ -237,9 +236,9 @@ void TestSimplePrelu() {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluTest")
.Input("InputImage")
......@@ -253,7 +252,7 @@ void TestSimplePrelu() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "PreluTest")
.Input("Input")
......@@ -293,7 +292,7 @@ void TestSimpleTanh() {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhTest")
.Input("InputImage")
......@@ -306,7 +305,7 @@ void TestSimpleTanh() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "TanhTest")
.Input("Input")
......@@ -346,7 +345,7 @@ void TestSimpleSigmoid() {
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidTest")
.Input("InputImage")
......@@ -359,7 +358,7 @@ void TestSimpleSigmoid() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "SigmoidTest")
.Input("Input")
......
......@@ -22,11 +22,11 @@
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/addn.h"
#include "mace/ops/opencl/image/addn.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
static constexpr int kCostPerGroup = 1024;
......@@ -142,5 +142,5 @@ void RegisterAddN(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -14,7 +14,6 @@
#include <string>
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -37,7 +36,7 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(&net, MakeString("Input", i).c_str(),
MakeString("InputImage", i).c_str(),
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
......
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -69,7 +68,7 @@ void SimpleAdd3() {
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
......@@ -84,7 +83,7 @@ void SimpleAdd3() {
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("AddN", "AddNTest")
.Input("Input0")
......@@ -143,7 +142,7 @@ void RandomTest() {
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
......@@ -158,7 +157,7 @@ void RandomTest() {
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2);
......
......@@ -21,7 +21,7 @@
#include "mace/core/operator.h"
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class ArgMaxOp : public Operation {
......@@ -84,5 +84,5 @@ void RegisterArgMax(OpRegistryBase *op_registry) {
DeviceType::CPU, float);
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -17,10 +17,10 @@
#endif
#include <algorithm>
#include "mace/kernels/arm/activation_neon.h"
#include "mace/ops/arm/activation_neon.h"
namespace mace {
namespace kernels {
namespace ops {
void ReluNeon(const float *input, const index_t size, float *output) {
#if defined(MACE_ENABLE_NEON)
......@@ -67,5 +67,5 @@ void ReluxNeon(const float *input, const float limit,
#endif
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,20 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ARM_ACTIVATION_NEON_H_
#define MACE_KERNELS_ARM_ACTIVATION_NEON_H_
#ifndef MACE_OPS_ARM_ACTIVATION_NEON_H_
#define MACE_OPS_ARM_ACTIVATION_NEON_H_
#include "mace/core/types.h"
namespace mace {
namespace kernels {
namespace ops {
void ReluNeon(const float *input, const index_t size, float *output);
void ReluxNeon(const float *input, const float limit,
const index_t size, float *output);
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_ARM_ACTIVATION_NEON_H_
#endif // MACE_OPS_ARM_ACTIVATION_NEON_H_
......@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ARM_CONV_2D_NEON_H_
#define MACE_KERNELS_ARM_CONV_2D_NEON_H_
#ifndef MACE_OPS_ARM_CONV_2D_NEON_H_
#define MACE_OPS_ARM_CONV_2D_NEON_H_
#include "mace/core/types.h"
#include "mace/kernels/sgemm.h"
#include "mace/ops/sgemm.h"
namespace mace {
namespace kernels {
namespace ops {
void Conv2dNeonK1x1S1(const float *input,
const float *filter,
......@@ -115,7 +115,7 @@ inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
}
}
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_ARM_CONV_2D_NEON_H_
#endif // MACE_OPS_ARM_CONV_2D_NEON_H_
......@@ -16,11 +16,11 @@
#include <arm_neon.h>
#endif
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/ops/arm/conv_2d_neon.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
namespace ops {
inline void Conv2dCPUK15x1Calc(const float *in_ptr,
const float *filter_ptr,
......@@ -157,5 +157,5 @@ void Conv2dNeonK15x1S1(const float *input,
} // b
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/ops/arm/conv_2d_neon.h"
namespace mace {
namespace kernels {
namespace ops {
void Conv2dNeonK1x1S1(const float *input,
const float *filter,
......@@ -44,5 +44,5 @@ void Conv2dNeonK1x1S1(const float *input,
}
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -16,12 +16,12 @@
#include <arm_neon.h>
#endif
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/ops/arm/conv_2d_neon.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
namespace ops {
inline void Conv2dCPUK1x15Calc(const float *in_ptr,
const float *filter_ptr,
......@@ -143,5 +143,5 @@ void Conv2dNeonK1x15S1(const float *input,
} // b
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -16,10 +16,10 @@
#include <arm_neon.h>
#endif
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/ops/arm/conv_2d_neon.h"
namespace mace {
namespace kernels {
namespace ops {
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK1x7S1(const float *input,
......@@ -247,5 +247,5 @@ void Conv2dNeonK1x7S1(const float *input,
} // b
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -17,10 +17,10 @@
#endif
#include "mace/core/macros.h"
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/ops/arm/conv_2d_neon.h"
namespace mace {
namespace kernels {
namespace ops {
// Ho = 2, Wo = 4, Co = 2
void Conv2dNeonK3x3S1(const float *input,
......@@ -658,5 +658,5 @@ void Conv2dNeonK3x3S2(const float *input,
} // b
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -16,10 +16,10 @@
#include <arm_neon.h>
#endif
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/ops/arm/conv_2d_neon.h"
namespace mace {
namespace kernels {
namespace ops {
#define MACE_Conv2dNeonK5x5SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
......@@ -215,5 +215,5 @@ void Conv2dNeonK5x5S1(const float *input,
} // b
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -16,10 +16,10 @@
#include <arm_neon.h>
#endif
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/ops/arm/conv_2d_neon.h"
namespace mace {
namespace kernels {
namespace ops {
// Ho = 4, Wo = 1, Co = 4
void Conv2dNeonK7x1S1(const float *input,
......@@ -287,5 +287,5 @@ void Conv2dNeonK7x1S1(const float *input,
} // b
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -16,10 +16,10 @@
#include <arm_neon.h>
#endif
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/ops/arm/conv_2d_neon.h"
namespace mace {
namespace kernels {
namespace ops {
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
......@@ -638,5 +638,5 @@ void Conv2dNeonK7x7S3(const float *input,
} // b
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -14,11 +14,11 @@
#include <algorithm>
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/kernels/gemm.h"
#include "mace/ops/arm/conv_winograd.h"
#include "mace/ops/gemm.h"
namespace mace {
namespace kernels {
namespace ops {
namespace {
// NCHW => NTCB (T: in tile pixels, B: tile indices)
......@@ -747,5 +747,5 @@ void ConvRef3x3s1(const float *input,
}
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ARM_CONV_WINOGRAD_H_
#define MACE_KERNELS_ARM_CONV_WINOGRAD_H_
#ifndef MACE_OPS_ARM_CONV_WINOGRAD_H_
#define MACE_OPS_ARM_CONV_WINOGRAD_H_
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
#include "mace/kernels/sgemm.h"
#include "mace/ops/sgemm.h"
namespace mace {
namespace kernels {
namespace ops {
void TransformFilter4x4(const float *filter,
const index_t in_channels,
......@@ -70,7 +70,7 @@ void ConvRef3x3s1(const float *input,
const index_t out_channels,
float *output);
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_ARM_CONV_WINOGRAD_H_
#endif // MACE_OPS_ARM_CONV_WINOGRAD_H_
......@@ -19,10 +19,10 @@
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/ops/arm/conv_winograd.h"
namespace mace {
namespace kernels {
namespace ops {
TEST(ConvWinogradTest, winograd) {
index_t batch = 1;
......@@ -62,11 +62,11 @@ TEST(ConvWinogradTest, winograd) {
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
kernels::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width,
ops::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width,
in_channels, out_channels, output_data_ref);
SGemm sgemm;
kernels::WinoGradConv3x3s1(input_data, filter_data, batch, in_height,
ops::WinoGradConv3x3s1(input_data, filter_data, batch, in_height,
in_width, in_channels, out_channels, 6,
output_data, &sgemm, nullptr);
......@@ -76,5 +76,5 @@ TEST(ConvWinogradTest, winograd) {
}
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ARM_DECONV_2D_NEON_H_
#define MACE_KERNELS_ARM_DECONV_2D_NEON_H_
#ifndef MACE_OPS_ARM_DECONV_2D_NEON_H_
#define MACE_OPS_ARM_DECONV_2D_NEON_H_
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
......@@ -22,7 +22,7 @@
#include "mace/core/types.h"
namespace mace {
namespace kernels {
namespace ops {
void Deconv2dNeonK3x3S1(const float *input,
const float *filter,
......@@ -90,7 +90,7 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a,
}
#endif
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_ARM_DECONV_2D_NEON_H_
#endif // MACE_OPS_ARM_DECONV_2D_NEON_H_
......@@ -13,10 +13,10 @@
// limitations under the License.
#include "mace/core/macros.h"
#include "mace/kernels/arm/deconv_2d_neon.h"
#include "mace/ops/arm/deconv_2d_neon.h"
namespace mace {
namespace kernels {
namespace ops {
void Deconv2dNeonK3x3S1(const float *input,
const float *filter,
......@@ -387,5 +387,5 @@ void Deconv2dNeonK3x3S2(const float *input,
}
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -13,10 +13,10 @@
// limitations under the License.
#include "mace/core/macros.h"
#include "mace/kernels/arm/deconv_2d_neon.h"
#include "mace/ops/arm/deconv_2d_neon.h"
namespace mace {
namespace kernels {
namespace ops {
void Deconv2dNeonK4x4S1(const float *input,
const float *filter,
......@@ -501,5 +501,5 @@ void Deconv2dNeonK4x4S2(const float *input,
}
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ARM_DEPTHWISE_CONV2D_NEON_H_
#define MACE_KERNELS_ARM_DEPTHWISE_CONV2D_NEON_H_
#ifndef MACE_OPS_ARM_DEPTHWISE_CONV2D_NEON_H_
#define MACE_OPS_ARM_DEPTHWISE_CONV2D_NEON_H_
#include "mace/core/types.h"
namespace mace {
namespace kernels {
namespace ops {
void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter,
......@@ -42,7 +42,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
const index_t valid_w_stop,
float *output);
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_ARM_DEPTHWISE_CONV2D_NEON_H_
#endif // MACE_OPS_ARM_DEPTHWISE_CONV2D_NEON_H_
......@@ -17,10 +17,10 @@
#endif
#include "mace/core/macros.h"
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
#include "mace/ops/arm/depthwise_conv2d_neon.h"
namespace mace {
namespace kernels {
namespace ops {
namespace {
void DepthwiseConv2dPixel(const float *in_base,
......@@ -381,5 +381,5 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
} // b
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -17,13 +17,13 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/activation.h"
#include "mace/ops/activation.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/batch_norm.h"
#include "mace/ops/opencl/image/batch_norm.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class BatchNormOp;
......@@ -35,7 +35,7 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
: Operation(context),
epsilon_(Operation::GetOptionalArg<float>("epsilon",
static_cast<float>(1e-4))),
activation_(kernels::StringToActivationType(
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", "NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
......@@ -144,7 +144,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
: Operation(context) {
float epsilon = Operation::GetOptionalArg<float>(
"epsilon", static_cast<float>(1e-4));
ActivationType activation = kernels::StringToActivationType(
ActivationType activation = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", "NOOP"));
float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
if (context->device()->opencl_runtime()->UseImageMemory()) {
......@@ -205,5 +205,5 @@ void RegisterBatchNorm(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -54,15 +53,15 @@ void BatchNorm(
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
......
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -51,15 +50,15 @@ void Simple() {
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
......@@ -75,7 +74,7 @@ void Simple() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
}
// Check
......@@ -135,15 +134,15 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
......@@ -165,7 +164,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
}
......@@ -214,15 +213,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
......@@ -245,7 +244,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-2);
}
......@@ -294,15 +293,15 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
......@@ -324,7 +323,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
}
......@@ -373,15 +372,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
......@@ -404,7 +403,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-2);
}
......
......@@ -17,11 +17,11 @@
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/batch_to_space.h"
#include "mace/ops/opencl/image/batch_to_space.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
class BatchToSpaceOpBase : public Operation {
public:
......@@ -303,5 +303,5 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -42,7 +41,7 @@ void BMBatchToSpace(
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage")
.Output("OutputImage")
......
......@@ -17,13 +17,13 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/activation.h"
#include "mace/ops/activation.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/bias_add.h"
#include "mace/ops/opencl/image/bias_add.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class BiasAddOp;
......@@ -139,5 +139,5 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -47,9 +46,9 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("InputImage")
.Input("BiasImage")
......
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -46,9 +45,9 @@ void BiasAddSimple() {
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
......@@ -60,7 +59,7 @@ void BiasAddSimple() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -116,9 +115,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
......@@ -131,7 +130,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
}
......@@ -172,9 +171,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
......@@ -187,7 +186,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
}
......
......@@ -15,11 +15,11 @@
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h"
#include "mace/kernels/opencl/image/image_to_buffer.h"
#include "mace/ops/opencl/buffer/buffer_inverse_transform.h"
#include "mace/ops/opencl/image/image_to_buffer.h"
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class BufferInverseTransformOp;
......@@ -41,9 +41,9 @@ class BufferInverseTransformOp<DeviceType::GPU, T> : public Operation {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
kernels::BufferType type =
static_cast<kernels::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
ops::BufferType type =
static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
return kernel_->Compute(context, input, type,
wino_blk_size_, output);
......@@ -63,5 +63,5 @@ void RegisterBufferInverseTransform(OpRegistryBase *op_registry) {
BufferInverseTransformOp, DeviceType::GPU, half);
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......
......@@ -54,103 +54,103 @@ void TestBidirectionTransform(const int type,
} // namespace
TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {1});
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {1});
}
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT, {11});
TestBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {11});
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {256});
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {256});
}
TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{1, 2, 3, 1});
}
TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{1, 2, 3, 3});
}
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 2, 3, 3});
}
TEST(BufferToImageTest, InputMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 13, 17, 128});
}
TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 64, 64, 256});
}
TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
{5, 3, 1, 1});
}
TEST(BufferToImageTest, Filter1x1Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
{13, 17, 1, 1});
}
TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
{512, 128, 1, 1});
}
TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
{3, 5, 3, 3});
}
TEST(BufferToImageTest, Filter3x3Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
{17, 13, 3, 3});
}
TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
{256, 128, 3, 3});
}
TEST(BufferToImageTest, WeightWidthSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_WIDTH,
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{1, 3, 3, 3});
}
TEST(BufferToImageTest, WeightWidthMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_WIDTH,
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{11, 13, 13, 17});
}
TEST(BufferToImageTest, WeightWidthLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_WIDTH,
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{64, 64, 11, 13});
}
TEST(BufferToImageTest, WeightHeightSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_HEIGHT,
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{2, 1, 1, 1});
}
TEST(BufferToImageTest, WeightHeightMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_HEIGHT,
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{11, 13, 13, 17});
}
TEST(BufferToImageTest, WeightHeightLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_HEIGHT,
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{64, 16, 11, 13});
}
......@@ -188,7 +188,7 @@ void TestDiffTypeBidirectionTransform(const int type,
} // namespace
TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT,
{11});
}
......@@ -233,7 +233,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40,
};
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT,
{2}, input_data);
}
......
......@@ -15,11 +15,11 @@
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/opencl/buffer/buffer_transform.h"
#include "mace/kernels/opencl/image/buffer_to_image.h"
#include "mace/ops/opencl/buffer/buffer_transform.h"
#include "mace/ops/opencl/image/buffer_to_image.h"
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class BufferTransformOp;
......@@ -41,9 +41,9 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
kernels::BufferType type =
static_cast<kernels::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
ops::BufferType type =
static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
return kernel_->Compute(context, input, type,
wino_blk_size_, output);
......@@ -63,5 +63,5 @@ void RegisterBufferTransform(OpRegistryBase *op_registry) {
BufferTransformOp, DeviceType::GPU, half);
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -69,12 +69,12 @@ void TestBidirectionTransform(const int type,
} // namespace
TEST_F(BufferTransformTest, FloatToHalf) {
TestBidirectionTransform<float, half>(kernels::BufferType::IN_OUT_CHANNEL,
TestBidirectionTransform<float, half>(ops::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
TEST_F(BufferTransformTest, HalfToHalf) {
TestBidirectionTransform<half, half>(kernels::BufferType::IN_OUT_CHANNEL,
TestBidirectionTransform<half, half>(ops::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
......@@ -85,7 +85,7 @@ void TestArgumentTransform(const index_t input_size) {
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("Output")
.AddIntArg("buffer_type", kernels::BufferType::ARGUMENT)
.AddIntArg("buffer_type", ops::BufferType::ARGUMENT)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
......
......@@ -15,7 +15,7 @@
#include "mace/core/operator.h"
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, typename SrcType>
class CastOp : public Operation {
......@@ -57,5 +57,5 @@ void RegisterCast(OpRegistryBase *op_registry) {
DeviceType::CPU, int32_t);
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "gmock/gmock.h"
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -16,11 +16,11 @@
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/channel_shuffle.h"
#include "mace/ops/opencl/image/channel_shuffle.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class ChannelShuffleOp;
......@@ -115,5 +115,5 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -44,7 +43,7 @@ void ChannelShuffle(
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
......
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -61,7 +60,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
......@@ -74,7 +73,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
// Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
// Check
auto expected = net.CreateTensor<float>(
......
......@@ -18,11 +18,11 @@
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/concat.h"
#include "mace/ops/opencl/image/concat.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
class ConcatOpBase : public Operation {
public:
......@@ -206,6 +206,9 @@ void RegisterConcat(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, int32_t);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, uint8_t);
......@@ -218,5 +221,5 @@ void RegisterConcat(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -90,9 +89,9 @@ void OpenclConcatHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Concat", "ConcatBM")
.Input("InputImage0")
.Input("InputImage1")
......
......@@ -262,7 +262,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i],
inputs[i]);
BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
}
auto builder = OpDefBuilder("Concat", "ConcatTest");
......@@ -279,7 +279,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
// Check
auto output = net.GetOutput("Output");
......
......@@ -26,20 +26,20 @@
#include "mace/core/future.h"
#include "mace/core/operator.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/kernels/conv_pool_2d_base.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/gemmlowp_util.h"
#include "mace/ops/activation.h"
#include "mace/ops/arm/conv_2d_neon.h"
#include "mace/ops/arm/conv_winograd.h"
#include "mace/ops/conv_pool_2d_base.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/gemmlowp_util.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/conv_2d.h"
#include "mace/kernels/opencl/buffer/conv_2d.h"
#include "mace/ops/opencl/image/conv_2d.h"
#include "mace/ops/opencl/buffer/conv_2d.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class Conv2dOp;
......@@ -49,7 +49,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
public:
explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context),
activation_(kernels::StringToActivationType(
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
......@@ -712,7 +712,7 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
public:
explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context),
activation_(kernels::StringToActivationType(
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
......@@ -950,7 +950,7 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
public:
explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context),
activation_(kernels::StringToActivationType(
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {
......@@ -999,5 +999,5 @@ void RegisterConv2D(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -14,9 +14,8 @@
#include <algorithm>
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -66,11 +65,11 @@ void Conv2d(int iters,
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
......
......@@ -15,7 +15,7 @@
#include <fstream>
#include <vector>
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -61,11 +61,11 @@ void TestNHWCSimple3x3VALID() {
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -81,7 +81,7 @@ void TestNHWCSimple3x3VALID() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -127,11 +127,11 @@ void TestNHWCSimple3x3SAME() {
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -213,9 +213,9 @@ void TestNHWCSimple3x3WithoutBias() {
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
......@@ -230,7 +230,7 @@ void TestNHWCSimple3x3WithoutBias() {
net.RunOp(D);
// Transfer output
ImageToBuffer<D, T>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -287,11 +287,11 @@ void TestNHWCCombined3x3() {
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
......@@ -307,7 +307,7 @@ void TestNHWCCombined3x3() {
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -362,11 +362,11 @@ void TestFusedNHWCSimple3x3VALID() {
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -383,7 +383,7 @@ void TestFusedNHWCSimple3x3VALID() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -425,9 +425,9 @@ void TestFusedNHWCSimple3x3WithoutBias() {
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
......@@ -443,7 +443,7 @@ void TestFusedNHWCSimple3x3WithoutBias() {
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -505,11 +505,11 @@ void TestConv1x1() {
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
......@@ -524,7 +524,7 @@ void TestConv1x1() {
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -596,11 +596,11 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
// run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
......@@ -616,7 +616,7 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4);
};
......@@ -705,11 +705,11 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
// run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
......@@ -725,7 +725,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2);
......@@ -857,11 +857,11 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
// run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
......@@ -877,7 +877,7 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4);
};
......@@ -954,11 +954,11 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
......@@ -974,7 +974,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1);
};
......@@ -1041,11 +1041,11 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
// run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
......@@ -1060,7 +1060,7 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4);
};
......
......@@ -12,16 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_CONV_POOL_2D_BASE_H_
#define MACE_KERNELS_CONV_POOL_2D_BASE_H_
#ifndef MACE_OPS_CONV_POOL_2D_BASE_H_
#define MACE_OPS_CONV_POOL_2D_BASE_H_
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/conv_pool_2d_util.h"
namespace mace {
namespace kernels {
namespace ops {
class ConvPool2dOpBase : public Operation {
public:
......@@ -40,7 +40,7 @@ class ConvPool2dOpBase : public Operation {
std::vector<int> dilations_;
};
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_CONV_POOL_2D_BASE_H_
#endif // MACE_OPS_CONV_POOL_2D_BASE_H_
......@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/conv_pool_2d_util.h"
#include <algorithm>
#include <cmath>
#include <vector>
namespace mace {
namespace kernels {
namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format,
......@@ -463,5 +463,5 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
return MaceStatus::MACE_SUCCESS;
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_CONV_POOL_2D_UTIL_H_
#define MACE_KERNELS_CONV_POOL_2D_UTIL_H_
#ifndef MACE_OPS_CONV_POOL_2D_UTIL_H_
#define MACE_OPS_CONV_POOL_2D_UTIL_H_
#include "mace/core/tensor.h"
......@@ -30,7 +30,7 @@ enum RoundType {
CEIL = 1,
};
namespace kernels {
namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format,
......@@ -113,7 +113,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input,
Tensor *output_tensor,
bool padding_same_value = false);
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_CONV_POOL_2D_UTIL_H_
#endif // MACE_OPS_CONV_POOL_2D_UTIL_H_
......@@ -29,7 +29,7 @@ TEST(CoreTest, INIT_MODE) {
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
.AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
.AddIntArg("mode", static_cast<int>(NetMode::INIT))
.Finalize(&op_defs[op_defs.size() - 1]);
......@@ -46,18 +46,16 @@ TEST(CoreTest, INIT_MODE) {
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("Output")
.AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
.AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
.Finalize(&op_defs[op_defs.size() - 1]);
NetDef net_def;
for (auto &op_def : op_defs) {
net_def.add_op()->CopyFrom(op_def);
net_def.add_op_types(op_def.type());
}
std::shared_ptr<OpDefRegistryBase> op_def_registry(new OpDefRegistry());
std::shared_ptr<OpRegistryBase> op_registry(new OpRegistry());
std::shared_ptr<OpRegistry> op_registry(new OpRegistry());
auto net = std::unique_ptr<NetBase>(new SerialNet(
op_def_registry.get(), op_registry.get(), &net_def, &ws, device,
op_registry.get(), &net_def, &ws, device,
NetMode::INIT));
MaceStatus status = net->Init();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
......@@ -67,7 +65,7 @@ TEST(CoreTest, INIT_MODE) {
EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
EXPECT_TRUE(ws.GetTensor("Output") == nullptr);
net = std::unique_ptr<NetBase>(new SerialNet(
op_def_registry.get(), op_registry.get(), &net_def, &ws, device));
op_registry.get(), &net_def, &ws, device));
status = net->Init();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
status = net->Run();
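The core test above captures the API change that comes with unifying the registries: `SerialNet` no longer takes a separate `OpDefRegistry`, and `net_def.add_op_types()` is gone. A condensed sketch of the new construction, using the same variable names as the hunk:

```c++
// After this change a net is built from the OpRegistry alone.
std::shared_ptr<OpRegistry> op_registry(new OpRegistry());

// Init-mode pass (runs ops marked with NetMode::INIT, e.g. BufferTransform).
auto init_net = std::unique_ptr<NetBase>(new SerialNet(
    op_registry.get(), &net_def, &ws, device, NetMode::INIT));
MACE_CHECK(init_net->Init() == MaceStatus::MACE_SUCCESS);
MACE_CHECK(init_net->Run() == MaceStatus::MACE_SUCCESS);

// Normal pass: same constructor without the NetMode argument.
auto net = std::unique_ptr<NetBase>(new SerialNet(
    op_registry.get(), &net_def, &ws, device));
MACE_CHECK(net->Init() == MaceStatus::MACE_SUCCESS);
MACE_CHECK(net->Run() == MaceStatus::MACE_SUCCESS);
```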
......
......@@ -16,11 +16,11 @@
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/crop.h"
#include "mace/ops/opencl/image/crop.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class CropOp : public Operation {
......@@ -143,5 +143,5 @@ void RegisterCrop(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -81,9 +80,9 @@ void OpenclCropHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropBM")
.Input("InputImage0")
.Input("InputImage1")
......
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -36,9 +35,9 @@ void RunCrop(const std::vector<index_t> &input_shape,
if (D == GPU) {
BufferToImage<D, float>(&net, "Input0", "InputImage0",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Input1", "InputImage1",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropTest")
.Input("InputImage0")
.Input("InputImage1")
......@@ -69,7 +68,7 @@ void RunCrop(const std::vector<index_t> &input_shape,
if (D == GPU) {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/deconv_2d.h"
#include "mace/ops/deconv_2d.h"
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
......@@ -27,16 +27,16 @@
#include "mace/core/future.h"
#include "mace/core/operator.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/arm/deconv_2d_neon.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/activation.h"
#include "mace/ops/arm/deconv_2d_neon.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/deconv_2d.h"
#include "mace/ops/opencl/image/deconv_2d.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
class Deconv2dOpBase : public Operation {
public:
......@@ -46,9 +46,9 @@ class Deconv2dOpBase : public Operation {
padding_type_(static_cast<Padding>(Operation::GetOptionalArg<int>(
"padding", static_cast<int>(SAME)))),
paddings_(Operation::GetRepeatedArgs<int>("padding_values")),
model_type_(static_cast<kernels::FrameworkType>(
model_type_(static_cast<ops::FrameworkType>(
Operation::GetOptionalArg<int>("framework_type", 0))),
activation_(kernels::StringToActivationType(
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
......@@ -180,7 +180,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
const Tensor *filter = this->Input(1);
const Tensor *bias = nullptr;
const Tensor *output_shape_tensor = nullptr;
if (model_type_ == kernels::CAFFE) {
if (model_type_ == ops::CAFFE) {
bias = this->InputSize() >= 3 ? this->Input(2) : nullptr;
} else {
output_shape_tensor =
......@@ -491,7 +491,7 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
const Tensor *filter = this->Input(1);
const Tensor *bias = nullptr;
const Tensor *output_shape_tensor = nullptr;
if (model_type_ == kernels::CAFFE) {
if (model_type_ == ops::CAFFE) {
bias = this->InputSize() >= 3 ? this->Input(2) : nullptr;
} else {
output_shape_tensor =
......@@ -557,5 +557,5 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_DECONV_2D_H_
#define MACE_KERNELS_DECONV_2D_H_
#ifndef MACE_OPS_DECONV_2D_H_
#define MACE_OPS_DECONV_2D_H_
namespace mace {
namespace kernels {
namespace ops {
enum FrameworkType {
TENSORFLOW = 0,
CAFFE = 1,
};
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_DECONV_2D_H_
#endif // MACE_OPS_DECONV_2D_H_
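The `FrameworkType` enum above is consumed by the Deconv2D op in the hunks just before it: the op reads a `framework_type` int argument (presumably written by the model converter) and uses it to decide what its optional third input means. A condensed restatement of those hunks; the TensorFlow branch is elided in the diff, so it is left as a comment here too:

```c++
// Set in the Deconv2dOpBase constructor (see the hunk above):
model_type_ = static_cast<ops::FrameworkType>(
    Operation::GetOptionalArg<int>("framework_type", 0));  // 0 == TENSORFLOW

// In Run(): Caffe-style deconv carries an optional bias as input 2, while the
// TensorFlow flavour carries an output-shape tensor there.
if (model_type_ == ops::CAFFE) {
  bias = this->InputSize() >= 3 ? this->Input(2) : nullptr;
} else {
  // TensorFlow-style deconv: the extra input is the output-shape tensor;
  // the assignment is elided in the hunk, so it is omitted here as well.
}
```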
......@@ -14,9 +14,8 @@
#include <algorithm>
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -54,11 +53,11 @@ static void Deconv2d(int iters,
{batch, out_h, out_w, output_channels});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
......
......@@ -15,8 +15,8 @@
#include <fstream>
#include <vector>
#include "mace/kernels/deconv_2d.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/deconv_2d.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -38,7 +38,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
const std::vector<float> &filter_data,
const std::vector<index_t> &expected_shape,
const std::vector<float> &expected_data,
kernels::FrameworkType model_type) {
ops::FrameworkType model_type) {
OpsTestNet net;
// Add input data
const index_t batch = input_shape[0];
......@@ -50,12 +50,12 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
net.TransformDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "FilterOIHW", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
if (model_type == kernels::FrameworkType::CAFFE) {
ops::BufferType::CONV2D_FILTER);
if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -85,12 +85,12 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
if (model_type == kernels::FrameworkType::CAFFE) {
if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
......@@ -138,7 +138,7 @@ void TestNHWCSimple3x3SAME_S1() {
{4.5, 4.6, 4.7, 6.5, 6.6, 6.7, 4.5, 4.6, 4.7,
6.5, 6.6, 6.7, 9.5, 9.6, 9.7, 6.5, 6.6, 6.7,
4.5, 4.6, 4.7, 6.5, 6.6, 6.7, 4.5, 4.6, 4.7},
kernels::FrameworkType::TENSORFLOW);
ops::FrameworkType::TENSORFLOW);
RunTestSimple<D>({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, {0, 0, 0},
1, Padding::VALID, {2, 2},
{0}, {3, 3, 3, 1},
......@@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME_S1() {
{1, 3, 3, 3},
{4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 6, 6, 9, 9,
9, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 4, 4},
kernels::FrameworkType::CAFFE);
ops::FrameworkType::CAFFE);
RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0},
1, Padding::SAME, {},
{1, 3, 3, 3}, {3, 3, 3, 1},
......@@ -157,7 +157,7 @@ void TestNHWCSimple3x3SAME_S1() {
{54, 66, 78, 126, 147, 168, 130, 146, 162,
198, 225, 252, 405, 450, 495, 366, 399, 432,
354, 378, 402, 630, 669, 708, 502, 530, 558},
kernels::FrameworkType::TENSORFLOW);
ops::FrameworkType::TENSORFLOW);
RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0},
1, Padding::SAME, {2, 2},
{0}, {3, 3, 3, 1},
......@@ -167,7 +167,7 @@ void TestNHWCSimple3x3SAME_S1() {
{54, 66, 78, 126, 147, 168, 130, 146, 162,
198, 225, 252, 405, 450, 495, 366, 399, 432,
354, 378, 402, 630, 669, 708, 502, 530, 558},
kernels::FrameworkType::CAFFE);
ops::FrameworkType::CAFFE);
}
template <DeviceType D>
......@@ -185,7 +185,7 @@ void TestNHWCSimple3x3SAME_S2() {
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1,
2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2,
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1},
kernels::FrameworkType::TENSORFLOW);
ops::FrameworkType::TENSORFLOW);
RunTestSimple<D>({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, {0, 0, 0},
2, Padding::SAME, {2, 2},
{0}, {3, 3, 3, 1},
......@@ -198,7 +198,7 @@ void TestNHWCSimple3x3SAME_S2() {
1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1,
2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2,
1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1},
kernels::FrameworkType::CAFFE);
ops::FrameworkType::CAFFE);
RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0},
2, Padding::SAME, {},
{1, 6, 6, 3}, {3, 3, 3, 1},
......@@ -216,7 +216,7 @@ void TestNHWCSimple3x3SAME_S2() {
83, 94, 105, 116, 127, 138, 252, 276, 300, 142, 155, 168,
304, 332, 360, 168, 183, 198, 70, 77, 84, 91, 98, 105, 192,
207, 222, 104, 112, 120, 218, 235, 252, 117, 126, 135},
kernels::FrameworkType::TENSORFLOW);
ops::FrameworkType::TENSORFLOW);
RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0},
2, Padding::SAME, {2, 2},
{0}, {3, 3, 3, 1},
......@@ -229,7 +229,7 @@ void TestNHWCSimple3x3SAME_S2() {
140, 151, 162, 78, 84, 90, 116, 127, 138, 252, 276, 300,
142, 155, 168, 304, 332, 360, 168, 183, 198, 91, 98, 105,
192, 207, 222, 104, 112, 120, 218, 235, 252, 117, 126, 135},
kernels::FrameworkType::CAFFE);
ops::FrameworkType::CAFFE);
}
template <DeviceType D>
......@@ -246,7 +246,7 @@ void TestNHWCSimple3x3SAME_S2_1() {
18, 18, 18, 45, 45, 45, 27, 27, 27, 45, 45, 45, 18, 18, 18,
30, 30, 30, 75, 75, 75, 45, 45, 45, 75, 75, 75, 30, 30, 30,
12, 12, 12, 30, 30, 30, 18, 18, 18, 30, 30, 30, 12, 12, 12},
kernels::FrameworkType::TENSORFLOW);
ops::FrameworkType::TENSORFLOW);
}
template <DeviceType D>
......@@ -271,7 +271,7 @@ void TestNHWCSimple3x3VALID_S2() {
1, 1, 1,
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1,
1, 1, 1},
kernels::FrameworkType::TENSORFLOW);
ops::FrameworkType::TENSORFLOW);
}
template <DeviceType D>
......@@ -288,7 +288,7 @@ void TestNHWCSimple3x3VALID_S1() {
366, 399, 432, 234, 252, 270, 146, 157, 168, 354, 378, 402,
630, 669, 708, 502, 530, 558, 294, 309, 324, 133, 140, 147,
306, 321, 336, 522, 546, 570, 398, 415, 432, 225, 234, 243},
kernels::FrameworkType::TENSORFLOW);
ops::FrameworkType::TENSORFLOW);
}
template <DeviceType D>
......@@ -297,7 +297,7 @@ void TestNHWCSimple2x2SAME() {
{1, 2, 2, 1}, {3, 3, 1, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
{1, 2, 2, 1}, {4.f, 4.f, 4.f, 4.f},
kernels::FrameworkType::TENSORFLOW);
ops::FrameworkType::TENSORFLOW);
}
template <DeviceType D>
......@@ -308,7 +308,7 @@ void TestNHWCSimple2x2VALID() {
{1, 5, 5, 1},
{1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f, 2.f, 2.f, 4.f,
2.f, 2.f, 1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f},
kernels::FrameworkType::TENSORFLOW);
ops::FrameworkType::TENSORFLOW);
}
} // namespace
......@@ -397,11 +397,11 @@ void TestComplexDeconvNxNS12(const int batch,
std::vector<int> paddings;
std::vector<int> output_shape;
kernels::FrameworkType model_type =
ops::FrameworkType model_type =
padding < 0 ?
kernels::FrameworkType::TENSORFLOW : kernels::FrameworkType::CAFFE;
ops::FrameworkType::TENSORFLOW : ops::FrameworkType::CAFFE;
if (model_type == kernels::FrameworkType::TENSORFLOW) {
if (model_type == ops::FrameworkType::TENSORFLOW) {
if (type == Padding::SAME) {
out_h = (height - 1) * stride_h + 1;
out_w = (width - 1) * stride_w + 1;
......@@ -421,7 +421,7 @@ void TestComplexDeconvNxNS12(const int batch,
paddings.push_back(padding);
}
if (model_type == kernels::FrameworkType::CAFFE) {
if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -458,13 +458,13 @@ void TestComplexDeconvNxNS12(const int batch,
// run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
if (model_type == kernels::FrameworkType::CAFFE) {
if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -492,7 +492,7 @@ void TestComplexDeconvNxNS12(const int batch,
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4);
};
......
......@@ -17,11 +17,11 @@
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/depth_to_space.h"
#include "mace/ops/opencl/image/depth_to_space.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class DepthToSpaceOp : public Operation {
......@@ -127,5 +127,5 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -44,7 +43,7 @@ void DepthToSpace(
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("InputImage")
......
......@@ -15,7 +15,6 @@
#include <fstream>
#include <vector>
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -47,7 +46,7 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
} else {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImage")
.Output("OutputImage")
......@@ -59,7 +58,7 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
......@@ -136,7 +135,7 @@ void RandomTest(const int block_size,
NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImg")
......@@ -149,7 +148,7 @@ void RandomTest(const int block_size,
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(*net.GetTensor("Output"),
......
......@@ -26,24 +26,24 @@
#include "mace/core/future.h"
#include "mace/core/operator.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
#include "mace/kernels/conv_pool_2d_base.h"
#include "mace/ops/activation.h"
#include "mace/ops/arm/depthwise_conv2d_neon.h"
#include "mace/ops/conv_pool_2d_base.h"
#include "mace/public/mace.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/depthwise_conv2d.h"
#include "mace/kernels/opencl/buffer/depthwise_conv2d.h"
#include "mace/ops/opencl/image/depthwise_conv2d.h"
#include "mace/ops/opencl/buffer/depthwise_conv2d.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
class DepthwiseConv2dOpBase : public ConvPool2dOpBase {
public:
explicit DepthwiseConv2dOpBase(OpConstructContext *context)
: ConvPool2dOpBase(context),
activation_(kernels::StringToActivationType(
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
......@@ -532,5 +532,5 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -14,9 +14,8 @@
#include <algorithm>
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -81,11 +80,11 @@ void DepthwiseConv2d(int iters,
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
ops::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("InputImage")
.Input("FilterImage")
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -53,11 +53,11 @@ void SimpleValidTest() {
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
ops::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -73,7 +73,7 @@ void SimpleValidTest() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -150,11 +150,11 @@ void ComplexValidTest(index_t batch,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
ops::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -170,7 +170,7 @@ void ComplexValidTest(index_t batch,
// Transfer output
ImageToBuffer<D, T>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -290,11 +290,11 @@ void TestNxNS12(const index_t height, const index_t width) {
expected->Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
ops::BufferType::DW_CONV2D_FILTER);
BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
ops::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -312,7 +312,7 @@ void TestNxNS12(const index_t height, const index_t width) {
// Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "DeviceOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
// Check
if (DataTypeToEnum<T>::value == DT_FLOAT) {
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/eltwise.h"
#include "mace/ops/eltwise.h"
#include <algorithm>
#include <cmath>
......@@ -26,11 +26,11 @@
#include "mace/core/tensor.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/eltwise.h"
#include "mace/ops/opencl/image/eltwise.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
inline index_t GetIndex(const std::vector<index_t> &shape,
......@@ -792,8 +792,8 @@ class EltwiseOp : public Operation {
public:
explicit EltwiseOp(OpConstructContext *context)
: Operation(context),
type_(static_cast<kernels::EltwiseType>(Operation::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE)))),
type_(static_cast<ops::EltwiseType>(Operation::GetOptionalArg<int>(
"type", static_cast<int>(ops::EltwiseType::NONE)))),
coeff_(Operation::GetRepeatedArgs<float>("coeff")),
scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
scalar_input_index_(Operation::GetOptionalArg<int32_t>(
......@@ -934,8 +934,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
public:
explicit EltwiseOp(OpConstructContext *context)
: Operation(context),
type_(static_cast<kernels::EltwiseType>(Operation::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE)))),
type_(static_cast<ops::EltwiseType>(Operation::GetOptionalArg<int>(
"type", static_cast<int>(ops::EltwiseType::NONE)))),
coeff_(Operation::GetRepeatedArgs<float>("coeff")),
scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
scalar_input_index_(Operation::GetOptionalArg<int32_t>(
......@@ -1076,9 +1076,9 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
public:
explicit EltwiseOp(OpConstructContext *context)
: Operation(context) {
EltwiseType type = static_cast<kernels::EltwiseType>(
EltwiseType type = static_cast<ops::EltwiseType>(
Operation::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE)));
"type", static_cast<int>(ops::EltwiseType::NONE)));
std::vector<float> coeff = Operation::GetRepeatedArgs<float>("coeff");
float scalar_input = Operation::GetOptionalArg<float>("scalar_input", 1.0);
int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
......@@ -1121,5 +1121,5 @@ void RegisterEltwise(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,11 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ELTWISE_H_
#define MACE_KERNELS_ELTWISE_H_
#ifndef MACE_OPS_ELTWISE_H_
#define MACE_OPS_ELTWISE_H_
namespace mace {
namespace kernels {
namespace ops {
enum EltwiseType {
SUM = 0,
......@@ -35,7 +35,7 @@ enum EltwiseType {
inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; }
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_ELTWISE_H_
#endif // MACE_OPS_ELTWISE_H_
......@@ -14,9 +14,8 @@
#include <string>
#include "mace/core/op_def_registry.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/eltwise.h"
#include "mace/ops/eltwise.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -26,7 +25,7 @@ namespace test {
namespace {
template <DeviceType D, typename T>
void EltwiseBenchmark(
int iters, kernels::EltwiseType type, int n, int h, int w, int c) {
int iters, ops::EltwiseType type, int n, int h, int w, int c) {
mace::testing::StopTiming();
OpsTestNet net;
......@@ -36,9 +35,9 @@ void EltwiseBenchmark(
if (D == DeviceType::GPU) {
BufferToImage<D, half>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL);
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0")
.Input("InputImg1")
......@@ -84,7 +83,7 @@ void EltwiseBenchmark(
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
iters, static_cast<ops::EltwiseType>(ELT_TYPE), N, H, W, C); \
} \
MACE_BENCHMARK( \
MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
......
This diff is collapsed.
......@@ -16,7 +16,7 @@
#include "mace/core/operator.h"
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class ExpandDimsOp;
......@@ -67,5 +67,5 @@ void RegisterExpandDims(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "gmock/gmock.h"
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -16,7 +16,7 @@
#include "mace/core/operator.h"
namespace mace {
namespace kernels {
namespace ops {
template <DeviceType D, class T>
class FillOp;
......@@ -66,5 +66,5 @@ void RegisterFill(OpRegistryBase *op_registry) {
DeviceType::CPU, float);
}
} // namespace kernels
} // namespace ops
} // namespace mace
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_def_registry.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_FIXPOINT_H_
#define MACE_KERNELS_FIXPOINT_H_
#ifndef MACE_OPS_FIXPOINT_H_
#define MACE_OPS_FIXPOINT_H_
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
......@@ -23,7 +23,7 @@
#include "mace/core/types.h"
namespace mace {
namespace kernels {
namespace ops {
inline uint8_t FindMax(const uint8_t *xs, const index_t size) {
uint8_t max_value = 0;
......@@ -57,8 +57,8 @@ inline uint8_t FindMax(const uint8_t *xs, const index_t size) {
}
} // namespace kernels
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_FIXPOINT_H_
#endif // MACE_OPS_FIXPOINT_H_
......@@ -17,10 +17,10 @@
#include <vector>
#include <algorithm>
#include "mace/kernels/fixpoint.h"
#include "mace/ops/fixpoint.h"
namespace mace {
namespace kernels {
namespace ops {
namespace test {
namespace {
......@@ -49,6 +49,6 @@ TEST(FixpointTest, FindMax) {
}
} // namespace test
} // namespace kernels
} // namespace ops
} // namespace mace
This diff is collapsed.
......@@ -19,22 +19,22 @@
#include "mace/core/future.h"
#include "mace/core/operator.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/gemm.h"
#include "mace/kernels/gemmlowp_util.h"
#include "mace/ops/activation.h"
#include "mace/ops/gemm.h"
#include "mace/ops/gemmlowp_util.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/kernels/opencl/image/fully_connected.h"
#include "mace/ops/opencl/image/fully_connected.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
namespace ops {
class FullyConnectedOpBase : public Operation {
public:
explicit FullyConnectedOpBase(OpConstructContext *context)
: Operation(context),
activation_(kernels::StringToActivationType(
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
......@@ -229,5 +229,5 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) {
#endif // MACE_ENABLE_OPENCL
}
} // namespace kernels
} // namespace ops
} // namespace mace
This diff is collapsed.