From 607a1f48caca9716582609c1db57b3837a043c12 Mon Sep 17 00:00:00 2001
From: liuqi <liuqi10@xiaomi.com>
Date: Fri, 2 Nov 2018 14:33:07 +0800
Subject: [PATCH] Refactor: move all files in kernels directory to ops and
 remove kernels.

1. Move all files in kernels to ops.
2. Unify the op_def_registry and op_registry.
3. Support op clipping: do not link an op's code when that op is not registered.
---
 .gitlab-ci.yml                                |   1 -
 .travis.yml                                   |   2 -
 docs/development/adding_a_new_op.md           |  75 ++--
 docs/user_guide/advanced_usage.rst            |  63 ++-
 mace/core/allocator.h                         |   3 +-
 mace/core/net.cc                              |  20 +-
 mace/core/net.h                               |   5 +-
 mace/core/op_def_registry.cc                  |  77 ----
 mace/core/op_def_registry.h                   |  81 ----
 mace/core/operator.cc                         |  84 +++-
 mace/core/operator.h                          |  71 ++--
 mace/core/registry.h                          |  97 -----
 mace/core/tensor.h                            |   5 +-
 mace/kernels/BUILD                            | 150 -------
 mace/kernels/matmul_benchmark.cc              | 289 --------------
 mace/libmace/BUILD                            |   5 +-
 mace/libmace/mace.cc                          |   9 +-
 mace/ops/BUILD                                | 132 +++++--
 mace/{kernels => ops}/activation.cc           |  12 +-
 mace/{kernels => ops}/activation.h            |  12 +-
 mace/ops/activation_benchmark.cc              |  13 +-
 mace/ops/activation_test.cc                   |  31 +-
 mace/{kernels => ops}/addn.cc                 |   6 +-
 mace/ops/addn_benchmark.cc                    |   3 +-
 mace/ops/addn_test.cc                         |   9 +-
 mace/{kernels => ops}/argmax.cc               |   4 +-
 mace/ops/argmax_test.cc                       |   1 -
 mace/{kernels => ops}/arm/activation_neon.cc  |   6 +-
 mace/{kernels => ops}/arm/activation_neon.h   |  10 +-
 mace/{kernels => ops}/arm/conv_2d_neon.h      |  12 +-
 .../{kernels => ops}/arm/conv_2d_neon_15x1.cc |   6 +-
 mace/{kernels => ops}/arm/conv_2d_neon_1x1.cc |   6 +-
 .../{kernels => ops}/arm/conv_2d_neon_1x15.cc |   6 +-
 mace/{kernels => ops}/arm/conv_2d_neon_1x7.cc |   6 +-
 mace/{kernels => ops}/arm/conv_2d_neon_3x3.cc |   6 +-
 mace/{kernels => ops}/arm/conv_2d_neon_5x5.cc |   6 +-
 mace/{kernels => ops}/arm/conv_2d_neon_7x1.cc |   6 +-
 mace/{kernels => ops}/arm/conv_2d_neon_7x7.cc |   6 +-
 mace/{kernels => ops}/arm/conv_winograd.cc    |   8 +-
 mace/{kernels => ops}/arm/conv_winograd.h     |  12 +-
 .../arm/conv_winograd_test.cc                 |  10 +-
 mace/{kernels => ops}/arm/deconv_2d_neon.h    |  10 +-
 .../arm/deconv_2d_neon_3x3.cc                 |   6 +-
 .../arm/deconv_2d_neon_4x4.cc                 |   6 +-
 .../arm/depthwise_conv2d_neon.h               |  10 +-
 .../arm/depthwise_conv2d_neon_3x3.cc          |   6 +-
 mace/{kernels => ops}/batch_norm.cc           |  12 +-
 mace/ops/batch_norm_benchmark.cc              |  11 +-
 mace/ops/batch_norm_test.cc                   |  61 ++-
 mace/{kernels => ops}/batch_to_space.cc       |   6 +-
 mace/ops/batch_to_space_benchmark.cc          |   3 +-
 mace/{kernels => ops}/bias_add.cc             |   8 +-
 mace/ops/bias_add_benchmark.cc                |   5 +-
 mace/ops/bias_add_test.cc                     |  19 +-
 .../buffer_inverse_transform.cc               |  14 +-
 mace/ops/buffer_to_image_benchmark.cc         |   1 -
 mace/ops/buffer_to_image_test.cc              |  46 +--
 mace/{kernels => ops}/buffer_transform.cc     |  14 +-
 mace/ops/buffer_transform_test.cc             |   6 +-
 mace/{kernels => ops}/cast.cc                 |   4 +-
 mace/ops/cast_test.cc                         |   1 -
 mace/{kernels => ops}/channel_shuffle.cc      |   6 +-
 mace/ops/channel_shuffle_benchmark.cc         |   3 +-
 mace/ops/channel_shuffle_test.cc              |   5 +-
 mace/{kernels => ops}/concat.cc               |   9 +-
 mace/ops/concat_benchmark.cc                  |   5 +-
 mace/ops/concat_test.cc                       |   4 +-
 mace/{kernels => ops}/conv_2d.cc              |  26 +-
 mace/ops/conv_2d_benchmark.cc                 |   9 +-
 mace/ops/conv_2d_test.cc                      |  94 ++---
 mace/{kernels => ops}/conv_pool_2d_base.h     |  12 +-
 mace/{kernels => ops}/conv_pool_2d_util.cc    |   6 +-
 mace/{kernels => ops}/conv_pool_2d_util.h     |  10 +-
 mace/ops/core_test.cc                         |  12 +-
 mace/{kernels => ops}/crop.cc                 |   6 +-
 mace/ops/crop_benchmark.cc                    |   5 +-
 mace/ops/crop_test.cc                         |   7 +-
 mace/{kernels => ops}/deconv_2d.cc            |  22 +-
 mace/{kernels => ops}/deconv_2d.h             |  10 +-
 mace/ops/deconv_2d_benchmark.cc               |   9 +-
 mace/ops/deconv_2d_test.cc                    |  62 +--
 mace/{kernels => ops}/depth_to_space.cc       |   6 +-
 mace/ops/depth_to_space_benchmark.cc          |   3 +-
 mace/ops/depth_to_space_test.cc               |   9 +-
 mace/{kernels => ops}/depthwise_conv2d.cc     |  16 +-
 mace/ops/depthwise_conv2d_benchmark.cc        |   9 +-
 mace/ops/depthwise_conv2d_test.cc             |  26 +-
 mace/{kernels => ops}/eltwise.cc              |  20 +-
 mace/{kernels => ops}/eltwise.h               |  10 +-
 mace/ops/eltwise_benchmark.cc                 |  11 +-
 mace/ops/eltwise_test.cc                      | 343 ++++++++--------
 mace/{kernels => ops}/expand_dims.cc          |   4 +-
 mace/ops/expand_dims_test.cc                  |   1 -
 mace/{kernels => ops}/fill.cc                 |   4 +-
 mace/ops/fill_test.cc                         |   1 -
 mace/{kernels => ops}/fixpoint.h              |  10 +-
 mace/{kernels => ops}/fixpoint_test.cc        |   6 +-
 mace/ops/folded_batch_norm_test.cc            |  41 +-
 mace/{kernels => ops}/fully_connected.cc      |  14 +-
 mace/ops/fully_connected_benchmark.cc         |   7 +-
 mace/ops/fully_connected_test.cc              |  17 +-
 mace/{kernels => ops}/gather.cc               |   4 +-
 mace/ops/gather_benchmark.cc                  |   1 -
 mace/ops/gather_test.cc                       |   1 -
 mace/{kernels => ops}/gemm.cc                 |   6 +-
 mace/{kernels => ops}/gemm.h                  |  10 +-
 mace/{kernels => ops}/gemm_test.cc            |  38 +-
 mace/{kernels => ops}/gemmlowp_util.h         |   6 +-
 mace/{kernels => ops}/identity.cc             |   4 +-
 mace/ops/identity_test.cc                     |   1 -
 mace/{kernels => ops}/infer_conv2d_shape.cc   |   6 +-
 mace/ops/infer_conv2d_shape_test.cc           |   3 +-
 mace/{kernels => ops}/local_response_norm.cc  |   4 +-
 mace/ops/local_response_norm_benchmark.cc     |   1 -
 mace/ops/local_response_norm_test.cc          |   1 -
 mace/{kernels => ops}/lstm_cell.cc            |   6 +-
 mace/ops/lstmcell_benchmark.cc                |  12 +-
 mace/ops/lstmcell_test.cc                     |  17 +-
 mace/ops/lstmcell_test_util.h                 |  13 +-
 mace/{kernels => ops}/matmul.cc               |  12 +-
 mace/ops/matmul_benchmark.cc                  | 292 +++++++++++++-
 mace/ops/matmul_test.cc                       |  13 +-
 mace/{kernels => ops}/memory_benchmark.cc     |   4 +-
 mace/{kernels => ops}/opencl/activation.h     |  10 +-
 mace/{kernels => ops}/opencl/addn.h           |  10 +-
 mace/{kernels => ops}/opencl/batch_norm.h     |  10 +-
 mace/{kernels => ops}/opencl/batch_to_space.h |  10 +-
 mace/{kernels => ops}/opencl/bias_add.h       |  10 +-
 .../opencl/buffer/buffer_inverse_transform.h  |  14 +-
 .../opencl/buffer/buffer_transform.cc         |   6 +-
 .../opencl/buffer/buffer_transform.h          |  14 +-
 .../opencl/buffer/buffer_type_transform.cc    |   6 +-
 mace/{kernels => ops}/opencl/buffer/conv_2d.h |  18 +-
 .../opencl/buffer/conv_2d_1x1.cc              |   8 +-
 .../opencl/buffer/conv_2d_general.cc          |   8 +-
 .../opencl/buffer/depthwise_conv2d.cc         |   6 +-
 .../opencl/buffer/depthwise_conv2d.h          |  18 +-
 mace/{kernels => ops}/opencl/buffer/pooling.h |  18 +-
 mace/{kernels => ops}/opencl/buffer/softmax.h |  14 +-
 mace/{kernels => ops}/opencl/buffer/utils.cc  |   8 +-
 mace/{kernels => ops}/opencl/buffer/utils.h   |  10 +-
 .../opencl/buffer_inverse_transform.h         |  12 +-
 .../opencl/buffer_transform.h                 |  12 +-
 .../{kernels => ops}/opencl/channel_shuffle.h |  10 +-
 mace/{kernels => ops}/opencl/cl/activation.cl |   0
 mace/{kernels => ops}/opencl/cl/addn.cl       |   0
 mace/{kernels => ops}/opencl/cl/batch_norm.cl |   0
 .../opencl/cl/batch_to_space.cl               |   0
 mace/{kernels => ops}/opencl/cl/bias_add.cl   |   0
 .../opencl/cl/buffer_to_image.cl              |   0
 .../opencl/cl/buffer_transform.cl             |   0
 .../opencl/cl/channel_shuffle.cl              |   0
 mace/{kernels => ops}/opencl/cl/common.h      |   6 +-
 mace/{kernels => ops}/opencl/cl/concat.cl     |   0
 mace/{kernels => ops}/opencl/cl/conv_2d.cl    |   0
 .../{kernels => ops}/opencl/cl/conv_2d_1x1.cl |   0
 .../opencl/cl/conv_2d_1x1_buffer.cl           |   0
 .../{kernels => ops}/opencl/cl/conv_2d_3x3.cl |   0
 .../opencl/cl/conv_2d_buffer.cl               |   0
 mace/{kernels => ops}/opencl/cl/crop.cl       |   0
 mace/{kernels => ops}/opencl/cl/deconv_2d.cl  |   0
 .../opencl/cl/depth_to_space.cl               |   0
 .../opencl/cl/depthwise_conv2d.cl             |   0
 .../opencl/cl/depthwise_conv2d_buffer.cl      |   0
 mace/{kernels => ops}/opencl/cl/eltwise.cl    |   0
 .../opencl/cl/fully_connected.cl              |   0
 mace/{kernels => ops}/opencl/cl/lstmcell.cl   |   0
 mace/{kernels => ops}/opencl/cl/matmul.cl     |   0
 mace/{kernels => ops}/opencl/cl/pad.cl        |   0
 mace/{kernels => ops}/opencl/cl/pooling.cl    |   0
 .../opencl/cl/pooling_buffer.cl               |   0
 .../{kernels => ops}/opencl/cl/reduce_mean.cl |   0
 .../opencl/cl/resize_bicubic.cl               |   0
 .../opencl/cl/resize_bilinear.cl              |   0
 mace/{kernels => ops}/opencl/cl/softmax.cl    |   0
 .../opencl/cl/softmax_buffer.cl               |   0
 .../opencl/cl/space_to_batch.cl               |   0
 .../opencl/cl/space_to_depth.cl               |   0
 mace/{kernels => ops}/opencl/cl/split.cl      |   0
 .../opencl/cl/sqrdiff_mean.cl                 |   0
 .../opencl/cl/winograd_transform.cl           |   0
 mace/{kernels => ops}/opencl/common.h         |  10 +-
 mace/{kernels => ops}/opencl/concat.h         |  10 +-
 mace/{kernels => ops}/opencl/conv_2d.h        |  14 +-
 mace/{kernels => ops}/opencl/crop.h           |  10 +-
 mace/{kernels => ops}/opencl/deconv_2d.h      |  12 +-
 mace/{kernels => ops}/opencl/depth_to_space.h |  10 +-
 .../opencl/depthwise_conv2d.h                 |  14 +-
 mace/{kernels => ops}/opencl/eltwise.h        |  10 +-
 .../{kernels => ops}/opencl/fully_connected.h |  12 +-
 mace/{kernels => ops}/opencl/helper.cc        |   6 +-
 mace/{kernels => ops}/opencl/helper.h         |  40 +-
 .../opencl/image/activation.h                 |  16 +-
 mace/{kernels => ops}/opencl/image/addn.h     |  14 +-
 .../opencl/image/batch_norm.h                 |  16 +-
 .../opencl/image/batch_to_space.h             |  14 +-
 mace/{kernels => ops}/opencl/image/bias_add.h |  14 +-
 .../opencl/image/buffer_to_image.h            |  14 +-
 .../opencl/image/channel_shuffle.h            |  14 +-
 mace/{kernels => ops}/opencl/image/concat.cc  |   6 +-
 mace/{kernels => ops}/opencl/image/concat.h   |  14 +-
 mace/{kernels => ops}/opencl/image/conv_2d.h  |  16 +-
 .../opencl/image/conv_2d_1x1.cc               |   8 +-
 .../opencl/image/conv_2d_3x3.cc               |   8 +-
 .../opencl/image/conv_2d_general.cc           |   8 +-
 mace/{kernels => ops}/opencl/image/crop.h     |  14 +-
 .../{kernels => ops}/opencl/image/deconv_2d.h |  14 +-
 .../opencl/image/depth_to_space.h             |  14 +-
 .../opencl/image/depthwise_conv2d.cc          |   6 +-
 .../opencl/image/depthwise_conv2d.h           |  16 +-
 mace/{kernels => ops}/opencl/image/eltwise.h  |  16 +-
 .../opencl/image/fully_connected.h            |  14 +-
 .../opencl/image/image_to_buffer.h            |  14 +-
 .../{kernels => ops}/opencl/image/lstm_cell.h |  14 +-
 mace/{kernels => ops}/opencl/image/matmul.h   |  14 +-
 mace/{kernels => ops}/opencl/image/pad.h      |  14 +-
 mace/{kernels => ops}/opencl/image/pooling.h  |  16 +-
 .../opencl/image/reduce_mean.h                |  14 +-
 .../opencl/image/resize_bicubic.h             |  22 +-
 .../opencl/image/resize_bilinear.h            |  20 +-
 mace/{kernels => ops}/opencl/image/softmax.h  |  14 +-
 .../opencl/image/space_to_batch.h             |  14 +-
 .../opencl/image/space_to_depth.h             |  14 +-
 mace/{kernels => ops}/opencl/image/split.h    |  14 +-
 .../opencl/image/sqrdiff_mean.h               |  14 +-
 .../opencl/image/winograd_transform.h         |  20 +-
 mace/{kernels => ops}/opencl/lstm_cell.h      |  10 +-
 mace/{kernels => ops}/opencl/matmul.h         |  10 +-
 .../opencl/out_of_range_check_test.cc         |   6 +-
 mace/{kernels => ops}/opencl/pad.h            |  10 +-
 mace/{kernels => ops}/opencl/pooling.h        |  14 +-
 mace/{kernels => ops}/opencl/reduce_mean.h    |  10 +-
 mace/{kernels => ops}/opencl/resize_bicubic.h |  10 +-
 .../{kernels => ops}/opencl/resize_bilinear.h |  10 +-
 mace/{kernels => ops}/opencl/softmax.h        |  10 +-
 mace/{kernels => ops}/opencl/space_to_batch.h |  10 +-
 mace/{kernels => ops}/opencl/space_to_depth.h |  10 +-
 mace/{kernels => ops}/opencl/split.h          |  10 +-
 mace/{kernels => ops}/opencl/sqrdiff_mean.h   |  10 +-
 .../opencl/winograd_transform.h               |  10 +-
 mace/ops/ops_def_register.cc                  | 373 ------------------
 mace/ops/ops_def_register.h                   |  30 --
 .../ops_register.cc => ops/ops_registry.cc}   | 108 ++---
 .../ops_register.h => ops/ops_registry.h}     |   6 +-
 mace/ops/ops_test_util.h                      |  15 +-
 mace/{kernels => ops}/pad.cc                  |   6 +-
 mace/ops/pad_benchmark.cc                     |   4 +-
 mace/ops/pad_test.cc                          |   9 +-
 mace/{kernels => ops}/pooling.cc              |  16 +-
 mace/{kernels => ops}/pooling.h               |   6 +-
 mace/ops/pooling_benchmark.cc                 |   7 +-
 mace/ops/pooling_test.cc                      |  23 +-
 mace/{kernels => ops}/quantize.cc             |   4 +-
 mace/ops/quantize_test.cc                     |   1 -
 mace/{kernels => ops}/reduce_mean.cc          |   6 +-
 mace/ops/reduce_mean_benchmark.cc             |   4 +-
 mace/ops/reduce_mean_test.cc                  |   9 +-
 mace/{kernels => ops}/reshape.cc              |   4 +-
 mace/ops/reshape_test.cc                      |   1 -
 mace/{kernels => ops}/resize_bicubic.cc       |   8 +-
 mace/{kernels => ops}/resize_bicubic.h        |  10 +-
 mace/ops/resize_bicubic_benchmark.cc          |   3 +-
 mace/ops/resize_bicubic_test.cc               |   5 +-
 mace/{kernels => ops}/resize_bilinear.cc      |   8 +-
 mace/{kernels => ops}/resize_bilinear.h       |  10 +-
 mace/ops/resize_bilinear_benchmark.cc         |   3 +-
 mace/ops/resize_bilinear_test.cc              |   5 +-
 mace/{kernels => ops}/reverse.cc              |   4 +-
 mace/ops/reverse_benchmark.cc                 |   2 -
 mace/ops/reverse_test.cc                      |   1 -
 mace/{kernels => ops}/scalar_math.cc          |  10 +-
 mace/ops/scalar_math_test.cc                  |  51 ++-
 mace/{kernels => ops}/sgemm.cc                |  14 +-
 mace/{kernels => ops}/sgemm.h                 |  10 +-
 mace/{kernels => ops}/sgemm_pack_test.cc      |   6 +-
 mace/{kernels => ops}/shape.cc                |   4 +-
 mace/ops/shape_test.cc                        |   1 -
 mace/{kernels => ops}/softmax.cc              |  12 +-
 mace/ops/softmax_benchmark.cc                 |   3 +-
 mace/ops/softmax_test.cc                      |   9 +-
 mace/{kernels => ops}/space_to_batch.cc       |   6 +-
 mace/ops/space_to_batch_benchmark.cc          |   3 +-
 mace/ops/space_to_batch_test.cc               |  16 +-
 mace/{kernels => ops}/space_to_depth.cc       |   6 +-
 mace/ops/space_to_depth_benchmark.cc          |   3 +-
 mace/ops/space_to_depth_test.cc               |   9 +-
 mace/{kernels => ops}/split.cc                |   6 +-
 mace/ops/split_benchmark.cc                   |   3 +-
 mace/ops/split_test.cc                        |   4 +-
 mace/{kernels => ops}/sqrdiff_mean.cc         |   6 +-
 mace/ops/sqrdiff_mean_benchmark.cc            |   4 +-
 mace/ops/sqrdiff_mean_test.cc                 |  12 +-
 mace/{kernels => ops}/squeeze.cc              |   4 +-
 mace/ops/squeeze_test.cc                      |   1 -
 mace/{kernels => ops}/stack.cc                |   4 +-
 mace/ops/stack_test.cc                        |   1 -
 mace/{kernels => ops}/strided_slice.cc        |   4 +-
 mace/ops/strided_slice_test.cc                |   1 -
 mace/{kernels => ops}/transpose.cc            |   4 +-
 mace/ops/transpose_benchmark.cc               |   1 -
 mace/ops/transpose_test.cc                    |   1 -
 mace/{kernels => ops}/unstack.cc              |   4 +-
 mace/ops/unstack_test.cc                      |   1 -
 mace/ops/winograd_convolution_benchmark.cc    |  11 +-
 mace/ops/winograd_convolution_test.cc         |  27 +-
 mace/{kernels => ops}/winograd_transform.cc   |  12 +-
 mace/ops/winograd_transform_benchmark.cc      |  11 +-
 mace/proto/mace.proto                         |   1 -
 mace/python/tools/converter.py                |   3 -
 .../tools/converter_tool/transformer.py       |  10 -
 mace/python/tools/encrypt_opencl_codegen.py   |   4 +-
 mace/python/tools/model.jinja2                |   9 -
 mace/test/mace_api_mt_test.cc                 |  13 +-
 mace/test/mace_api_test.cc                    |  13 +-
 .../opencl-kernel/opencl_kernel_configure.bzl |  78 ++--
 tools/bazel.rc                                |   1 +
 tools/converter.py                            |   2 +-
 tools/sh_commands.py                          |   4 +-
 318 files changed, 2118 insertions(+), 2946 deletions(-)
 delete mode 100644 mace/core/op_def_registry.cc
 delete mode 100644 mace/core/op_def_registry.h
 delete mode 100644 mace/core/registry.h
 delete mode 100644 mace/kernels/BUILD
 delete mode 100644 mace/kernels/matmul_benchmark.cc
 rename mace/{kernels => ops}/activation.cc (93%)
 rename mace/{kernels => ops}/activation.h (95%)
 rename mace/{kernels => ops}/addn.cc (98%)
 rename mace/{kernels => ops}/argmax.cc (98%)
 rename mace/{kernels => ops}/arm/activation_neon.cc (95%)
 rename mace/{kernels => ops}/arm/activation_neon.h (82%)
 rename mace/{kernels => ops}/arm/conv_2d_neon.h (95%)
 rename mace/{kernels => ops}/arm/conv_2d_neon_15x1.cc (98%)
 rename mace/{kernels => ops}/arm/conv_2d_neon_1x1.cc (94%)
 rename mace/{kernels => ops}/arm/conv_2d_neon_1x15.cc (98%)
 rename mace/{kernels => ops}/arm/conv_2d_neon_1x7.cc (99%)
 rename mace/{kernels => ops}/arm/conv_2d_neon_3x3.cc (99%)
 rename mace/{kernels => ops}/arm/conv_2d_neon_5x5.cc (99%)
 rename mace/{kernels => ops}/arm/conv_2d_neon_7x1.cc (99%)
 rename mace/{kernels => ops}/arm/conv_2d_neon_7x7.cc (99%)
 rename mace/{kernels => ops}/arm/conv_winograd.cc (99%)
 rename mace/{kernels => ops}/arm/conv_winograd.h (92%)
 rename mace/{kernels => ops}/arm/conv_winograd_test.cc (91%)
 rename mace/{kernels => ops}/arm/deconv_2d_neon.h (93%)
 rename mace/{kernels => ops}/arm/deconv_2d_neon_3x3.cc (99%)
 rename mace/{kernels => ops}/arm/deconv_2d_neon_4x4.cc (99%)
 rename mace/{kernels => ops}/arm/depthwise_conv2d_neon.h (90%)
 rename mace/{kernels => ops}/arm/depthwise_conv2d_neon_3x3.cc (99%)
 rename mace/{kernels => ops}/batch_norm.cc (96%)
 rename mace/{kernels => ops}/batch_to_space.cc (99%)
 rename mace/{kernels => ops}/bias_add.cc (97%)
 rename mace/{kernels => ops}/buffer_inverse_transform.cc (85%)
 rename mace/{kernels => ops}/buffer_transform.cc (84%)
 rename mace/{kernels => ops}/cast.cc (97%)
 rename mace/{kernels => ops}/channel_shuffle.cc (97%)
 rename mace/{kernels => ops}/concat.cc (97%)
 rename mace/{kernels => ops}/conv_2d.cc (98%)
 rename mace/{kernels => ops}/conv_pool_2d_base.h (85%)
 rename mace/{kernels => ops}/conv_pool_2d_util.cc (99%)
 rename mace/{kernels => ops}/conv_pool_2d_util.h (96%)
 rename mace/{kernels => ops}/crop.cc (98%)
 rename mace/{kernels => ops}/deconv_2d.cc (97%)
 rename mace/{kernels => ops}/deconv_2d.h (82%)
 rename mace/{kernels => ops}/depth_to_space.cc (97%)
 rename mace/{kernels => ops}/depthwise_conv2d.cc (98%)
 rename mace/{kernels => ops}/eltwise.cc (98%)
 rename mace/{kernels => ops}/eltwise.h (86%)
 rename mace/{kernels => ops}/expand_dims.cc (98%)
 rename mace/{kernels => ops}/fill.cc (97%)
 rename mace/{kernels => ops}/fixpoint.h (92%)
 rename mace/{kernels => ops}/fixpoint_test.cc (94%)
 rename mace/{kernels => ops}/fully_connected.cc (96%)
 rename mace/{kernels => ops}/gather.cc (98%)
 rename mace/{kernels => ops}/gemm.cc (99%)
 rename mace/{kernels => ops}/gemm.h (94%)
 rename mace/{kernels => ops}/gemm_test.cc (84%)
 rename mace/{kernels => ops}/gemmlowp_util.h (96%)
 rename mace/{kernels => ops}/identity.cc (97%)
 rename mace/{kernels => ops}/infer_conv2d_shape.cc (97%)
 rename mace/{kernels => ops}/local_response_norm.cc (98%)
 rename mace/{kernels => ops}/lstm_cell.cc (95%)
 rename mace/{kernels => ops}/matmul.cc (98%)
 rename mace/{kernels => ops}/memory_benchmark.cc (98%)
 rename mace/{kernels => ops}/opencl/activation.h (85%)
 rename mace/{kernels => ops}/opencl/addn.h (86%)
 rename mace/{kernels => ops}/opencl/batch_norm.h (85%)
 rename mace/{kernels => ops}/opencl/batch_to_space.h (86%)
 rename mace/{kernels => ops}/opencl/bias_add.h (85%)
 rename mace/{kernels => ops}/opencl/buffer/buffer_inverse_transform.h (85%)
 rename mace/{kernels => ops}/opencl/buffer/buffer_transform.cc (98%)
 rename mace/{kernels => ops}/opencl/buffer/buffer_transform.h (90%)
 rename mace/{kernels => ops}/opencl/buffer/buffer_type_transform.cc (97%)
 rename mace/{kernels => ops}/opencl/buffer/conv_2d.h (95%)
 rename mace/{kernels => ops}/opencl/buffer/conv_2d_1x1.cc (97%)
 rename mace/{kernels => ops}/opencl/buffer/conv_2d_general.cc (97%)
 rename mace/{kernels => ops}/opencl/buffer/depthwise_conv2d.cc (98%)
 rename mace/{kernels => ops}/opencl/buffer/depthwise_conv2d.h (94%)
 rename mace/{kernels => ops}/opencl/buffer/pooling.h (95%)
 rename mace/{kernels => ops}/opencl/buffer/softmax.h (93%)
 rename mace/{kernels => ops}/opencl/buffer/utils.cc (96%)
 rename mace/{kernels => ops}/opencl/buffer/utils.h (86%)
 rename mace/{kernels => ops}/opencl/buffer_inverse_transform.h (81%)
 rename mace/{kernels => ops}/opencl/buffer_transform.h (82%)
 rename mace/{kernels => ops}/opencl/channel_shuffle.h (83%)
 rename mace/{kernels => ops}/opencl/cl/activation.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/addn.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/batch_norm.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/batch_to_space.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/bias_add.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/buffer_to_image.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/buffer_transform.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/channel_shuffle.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/common.h (96%)
 rename mace/{kernels => ops}/opencl/cl/concat.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/conv_2d.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/conv_2d_1x1.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/conv_2d_1x1_buffer.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/conv_2d_3x3.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/conv_2d_buffer.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/crop.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/deconv_2d.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/depth_to_space.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/depthwise_conv2d.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/depthwise_conv2d_buffer.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/eltwise.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/fully_connected.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/lstmcell.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/matmul.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/pad.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/pooling.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/pooling_buffer.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/reduce_mean.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/resize_bicubic.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/resize_bilinear.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/softmax.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/softmax_buffer.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/space_to_batch.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/space_to_depth.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/split.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/sqrdiff_mean.cl (100%)
 rename mace/{kernels => ops}/opencl/cl/winograd_transform.cl (100%)
 rename mace/{kernels => ops}/opencl/common.h (84%)
 rename mace/{kernels => ops}/opencl/concat.h (86%)
 rename mace/{kernels => ops}/opencl/conv_2d.h (82%)
 rename mace/{kernels => ops}/opencl/crop.h (86%)
 rename mace/{kernels => ops}/opencl/deconv_2d.h (84%)
 rename mace/{kernels => ops}/opencl/depth_to_space.h (83%)
 rename mace/{kernels => ops}/opencl/depthwise_conv2d.h (81%)
 rename mace/{kernels => ops}/opencl/eltwise.h (85%)
 rename mace/{kernels => ops}/opencl/fully_connected.h (82%)
 rename mace/{kernels => ops}/opencl/helper.cc (99%)
 rename mace/{kernels => ops}/opencl/helper.h (85%)
 rename mace/{kernels => ops}/opencl/image/activation.h (93%)
 rename mace/{kernels => ops}/opencl/image/addn.h (94%)
 rename mace/{kernels => ops}/opencl/image/batch_norm.h (94%)
 rename mace/{kernels => ops}/opencl/image/batch_to_space.h (93%)
 rename mace/{kernels => ops}/opencl/image/bias_add.h (93%)
 rename mace/{kernels => ops}/opencl/image/buffer_to_image.h (95%)
 rename mace/{kernels => ops}/opencl/image/channel_shuffle.h (93%)
 rename mace/{kernels => ops}/opencl/image/concat.cc (98%)
 rename mace/{kernels => ops}/opencl/image/concat.h (93%)
 rename mace/{kernels => ops}/opencl/image/conv_2d.h (95%)
 rename mace/{kernels => ops}/opencl/image/conv_2d_1x1.cc (98%)
 rename mace/{kernels => ops}/opencl/image/conv_2d_3x3.cc (98%)
 rename mace/{kernels => ops}/opencl/image/conv_2d_general.cc (98%)
 rename mace/{kernels => ops}/opencl/image/crop.h (96%)
 rename mace/{kernels => ops}/opencl/image/deconv_2d.h (96%)
 rename mace/{kernels => ops}/opencl/image/depth_to_space.h (94%)
 rename mace/{kernels => ops}/opencl/image/depthwise_conv2d.cc (98%)
 rename mace/{kernels => ops}/opencl/image/depthwise_conv2d.h (92%)
 rename mace/{kernels => ops}/opencl/image/eltwise.h (95%)
 rename mace/{kernels => ops}/opencl/image/fully_connected.h (95%)
 rename mace/{kernels => ops}/opencl/image/image_to_buffer.h (95%)
 rename mace/{kernels => ops}/opencl/image/lstm_cell.h (94%)
 rename mace/{kernels => ops}/opencl/image/matmul.h (93%)
 rename mace/{kernels => ops}/opencl/image/pad.h (94%)
 rename mace/{kernels => ops}/opencl/image/pooling.h (95%)
 rename mace/{kernels => ops}/opencl/image/reduce_mean.h (95%)
 rename mace/{kernels => ops}/opencl/image/resize_bicubic.h (91%)
 rename mace/{kernels => ops}/opencl/image/resize_bilinear.h (92%)
 rename mace/{kernels => ops}/opencl/image/softmax.h (94%)
 rename mace/{kernels => ops}/opencl/image/space_to_batch.h (93%)
 rename mace/{kernels => ops}/opencl/image/space_to_depth.h (94%)
 rename mace/{kernels => ops}/opencl/image/split.h (95%)
 rename mace/{kernels => ops}/opencl/image/sqrdiff_mean.h (95%)
 rename mace/{kernels => ops}/opencl/image/winograd_transform.h (96%)
 rename mace/{kernels => ops}/opencl/lstm_cell.h (86%)
 rename mace/{kernels => ops}/opencl/matmul.h (86%)
 rename mace/{kernels => ops}/opencl/out_of_range_check_test.cc (98%)
 rename mace/{kernels => ops}/opencl/pad.h (86%)
 rename mace/{kernels => ops}/opencl/pooling.h (82%)
 rename mace/{kernels => ops}/opencl/reduce_mean.h (84%)
 rename mace/{kernels => ops}/opencl/resize_bicubic.h (84%)
 rename mace/{kernels => ops}/opencl/resize_bilinear.h (84%)
 rename mace/{kernels => ops}/opencl/softmax.h (85%)
 rename mace/{kernels => ops}/opencl/space_to_batch.h (86%)
 rename mace/{kernels => ops}/opencl/space_to_depth.h (83%)
 rename mace/{kernels => ops}/opencl/split.h (86%)
 rename mace/{kernels => ops}/opencl/sqrdiff_mean.h (84%)
 rename mace/{kernels => ops}/opencl/winograd_transform.h (86%)
 delete mode 100644 mace/ops/ops_def_register.cc
 delete mode 100644 mace/ops/ops_def_register.h
 rename mace/{kernels/ops_register.cc => ops/ops_registry.cc} (67%)
 rename mace/{kernels/ops_register.h => ops/ops_registry.h} (87%)
 rename mace/{kernels => ops}/pad.cc (98%)
 rename mace/{kernels => ops}/pooling.cc (98%)
 rename mace/{kernels => ops}/pooling.h (87%)
 rename mace/{kernels => ops}/quantize.cc (98%)
 rename mace/{kernels => ops}/reduce_mean.cc (98%)
 rename mace/{kernels => ops}/reshape.cc (98%)
 rename mace/{kernels => ops}/resize_bicubic.cc (98%)
 rename mace/{kernels => ops}/resize_bicubic.h (87%)
 rename mace/{kernels => ops}/resize_bilinear.cc (98%)
 rename mace/{kernels => ops}/resize_bilinear.h (86%)
 rename mace/{kernels => ops}/reverse.cc (98%)
 rename mace/{kernels => ops}/scalar_math.cc (95%)
 rename mace/{kernels => ops}/sgemm.cc (99%)
 rename mace/{kernels => ops}/sgemm.h (96%)
 rename mace/{kernels => ops}/sgemm_pack_test.cc (98%)
 rename mace/{kernels => ops}/shape.cc (98%)
 rename mace/{kernels => ops}/softmax.cc (98%)
 rename mace/{kernels => ops}/space_to_batch.cc (99%)
 rename mace/{kernels => ops}/space_to_depth.cc (97%)
 rename mace/{kernels => ops}/split.cc (98%)
 rename mace/{kernels => ops}/sqrdiff_mean.cc (97%)
 rename mace/{kernels => ops}/squeeze.cc (97%)
 rename mace/{kernels => ops}/stack.cc (98%)
 rename mace/{kernels => ops}/strided_slice.cc (99%)
 rename mace/{kernels => ops}/transpose.cc (99%)
 rename mace/{kernels => ops}/unstack.cc (98%)
 rename mace/{kernels => ops}/winograd_transform.cc (93%)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 65623c78..d7fc2ec5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -67,7 +67,6 @@ extra_tests:
   stage: extra_tests
   script:
     - if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
-    - python tools/bazel_adb_run.py --target="//mace/kernels:kernels_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
     - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
 
 platform_compatible_tests:
diff --git a/.travis.yml b/.travis.yml
index 19a336d7..b6354913 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -94,7 +94,6 @@ jobs:
         - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=False --target_abis=armeabi-v7a || exit 1
         - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=False --target_abis=armeabi-v7a || exit 1
         - echo 'Extra Test'
-        - python tools/bazel_adb_run.py --target="//mace/kernels:kernels_test" --run_target=False --target_abis=armeabi-v7a || exit 1
         - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=False --target_abis=armeabi-v7a || exit 1
       env: TYPE=Extra-Test-ARMEABI-v7a
       os: linux
@@ -106,7 +105,6 @@ jobs:
         - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=False --target_abis=arm64-v8a || exit 1
         - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=False --target_abis=arm64-v8a || exit 1
         - echo 'Extra Test on ARM64'
-        - python tools/bazel_adb_run.py --target="//mace/kernels:kernels_test" --run_target=False --target_abis=arm64-v8a || exit 1
         - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=False --target_abis=arm64-v8a || exit 1
       env: TYPE=Extra-Test-ARM64-v8a
       os: linux
diff --git a/docs/development/adding_a_new_op.md b/docs/development/adding_a_new_op.md
index 33a1a60d..3e461671 100644
--- a/docs/development/adding_a_new_op.md
+++ b/docs/development/adding_a_new_op.md
@@ -5,46 +5,24 @@ You can create a custom op if it is not supported yet.
 
 To add a custom op, you need to follow these steps:
 
-Register the new OpDef information
-----------------------------------
-Register the OpDef information about which devices the operation could run on.
-Registry file is in `mace/ops/ops_def_register.cc`
-```c++
-#include "mace/ops/ops_def_register.h"
-
-namespace mace {
-namespace ops {
-
-void RegisterOpDefs(OpDefRegistryBase *op_def_registry) {
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("MyCustomOp")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-  ......
-}
-}  // namespace ops
-}  // namespace mace
-
-```
-
 Implement the Operation
 -----------------------
-The Best way is to refer to the implementation of other operator(e.g. `/mace/kernels/activation.cc`)
-
-Define the new Op class in `mace/kernels/my_custom_op.cc`.
-1. CPU code: just write the code in `mace/kernels/my_custom_op.cc`.
-2. GPU code: Kernel API is defined in `mace/kernels/my_custom_op.h`, 
-Kernel based on Image is realized in `mace/kernels/opencl/image/my_custom_op.cc`,
-Kernel based on Buffer is realized in `mace/kernels/opencl/buffer/my_custom_op.cc`.
+The best way is to refer to the implementation of another operator (e.g. `/mace/ops/activation.cc`).
+
+Define the new Op class in `mace/ops/my_custom_op.cc`.
+1. ARM kernels: the NEON kernel is located at `mace/ops/arm/my_custom_op.cc`.
+2. GPU kernels: the OpenCL kernel API is defined in `mace/ops/opencl/my_custom_op.h`.
+    * The kernel based on Image is implemented in `mace/ops/opencl/image/my_custom_op.cc`.
+    * The kernel based on Buffer is implemented in `mace/ops/opencl/buffer/my_custom_op.cc`.
+    * The OpenCL kernel source is implemented in `mace/ops/opencl/cl/my_custom_op.cl`.
+    * Add the path of the OpenCL kernel file to `mace/repository/opencl-kernel/opencl_kernel_configure.bzl`.
  
-The structure like the following code.
+The structure of Op is like the following code.
 ```c++
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class MyCustomOp;
@@ -56,43 +34,34 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation {
 
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-class ActivationOp<DeviceType::GPU, T> : public Operation {
+class MyCustomOp<DeviceType::GPU, T> : public Operation {
 ...
 };
 #endif  // MACE_ENABLE_OPENCL
 
-}  // namespace ops
-}  // namespace mace
-
-```
-
-Register the Operation
------------------------
-1, Add register function in `mace/kernels/my_custom_op.cc`
-```c++
-#include "mace/core/operator.h"
-
-namespace mace {
-namespace kernels {
-
 void RegisterMyCustomOp(OpRegistryBase *op_registry) {
-  MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp,
+  MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
                    DeviceType::CPU, float);
 
 #ifdef MACE_ENABLE_OPENCL
-  MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp,
+  MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
                    DeviceType::GPU, float);
 
-  MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp,
+  MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
                    DeviceType::GPU, half);
 #endif  // MACE_ENABLE_OPENCL
 }
+
 }  // namespace ops
 }  // namespace mace
+
 ```
-2, And then register the new Op in `mace/kernels/ops_register.cc`.
+
+Register the Operation
+-----------------------
+Register the new Op in `mace/ops/ops_register.cc`.
 ```
-#include "mace/kernels/ops_register.h"
+#include "mace/ops/ops_register.h"
 
 namespace mace {
 namespace ops {
diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst
index e6284be9..13c3c211 100644
--- a/docs/user_guide/advanced_usage.rst
+++ b/docs/user_guide/advanced_usage.rst
@@ -112,7 +112,8 @@ There are two common advanced use cases:
   - converting model to C++ code.
   - tuning GPU kernels for a specific SoC.
 
-* **Convert model(s) to C++ code**
+Convert model(s) to C++ code
+--------------------------------
 
     * **1. Change the model deployment file(.yml)**
 
@@ -204,7 +205,8 @@ There are two common advanced use cases:
             // ... Same with the code in basic usage
 
 
-* **Tuning for specific SoC's GPU**
+Tuning for specific SoC's GPU
+---------------------------------
 
     If you want to use the GPU of a specific device, you can just specify the ``target_socs`` in your YAML file and
     then tune the MACE lib for it (OpenCL kernels), which may get 1~10% performance improvement.
@@ -375,25 +377,52 @@ Use ``-h`` to get detailed help.
 
 Reduce Library Size
 -------------------
-* **dynamic library**
+* Build for your own usage purpose.
+    * **dynamic library**
 
-    The generated dynamic library by script ``tools/build-standalone-lib.sh`` is about ``1.6M`` for
-    ``armeabi-v7a`` and ``2.1M`` for ``arm64-v8a``. It can be reduced by modifying some build options.
+        - If the models don't need to run on the ``dsp`` device, change the build option ``--define hexagon=true``
+          to ``false``. The library size will then decrease by about ``100KB``.
 
-    - If the models don't need to run on device ``dsp``, change the build option ``--define hexagon=true``
-      to ``false``. And the library will be decreased about ``100KB``.
+        - Furthermore, if only the ``cpu`` device is needed, change ``--define opencl=true`` to ``false``. This
+          will cut the library size roughly in half, to about ``700KB`` for ``armeabi-v7a`` and ``1000KB`` for ``arm64-v8a``.
 
-    - Futher more, if only ``cpu`` device needed, change ``--define opencl=true`` to ``false``. This way
-      will reduce half of library size to about ``700KB`` for ``armeabi-v7a`` and ``1000KB`` for ``arm64-v8a``
+        - About ``300KB`` can be saved by adding the ``--config symbol_hidden`` build option. It changes
+          the visibility of internal APIs in libmace.so, which leads to linking errors when loading model(s) in ``code``
+          mode but has no effect in ``file`` mode.
 
-    - About ``300KB`` can be reduced when add ``--config symbol_hidden`` building option. It will change
-      the visibility of inner apis in libmace.so and lead to linking error when load model(s) in ``code``
-      but no effection for ``file`` mode.
+    * **static library**
 
-* **static library**
+        - The methods for the dynamic library also apply to the static library. In addition, the static
+          library may also contain the model graph and model data if the configs ``model_graph_format`` and
+          ``model_data_format`` in the deployment file are set to ``code``.
 
-    - The methods in dynamic library can be useful for static library too. In additional, the static
-      library may also contain model graph and model datas if the configs ``model_graph_format`` and
-      ``model_data_format`` in deployment file are set to ``code``.
+        - It is recommended to use the ``version script`` and ``strip`` features when linking the MACE static library. The effect is remarkable.
 
-    - It is recommended to use ``version script`` and ``strip`` feature when linking mace static library. The effect is remarkable.
+* Remove the unused ops.
+Remove the registration of the ops that your models do not use from ``mace/ops/ops_register.cc``,
+which will reduce the library size significantly. The final binary only links the registered ops' code.
+::
+
+    #include "mace/ops/ops_register.h"
+
+    namespace mace {
+    namespace ops {
+    // Just leave the ops used in your models
+
+    ...
+
+    }  // namespace ops
+
+
+    OpRegistry::OpRegistry() : OpRegistryBase() {
+    // Just leave the ops used in your models
+
+      ...
+
+      ops::RegisterMyCustomOp(this);
+
+      ...
+
+    }
+
+    }  // namespace mace
diff --git a/mace/core/allocator.h b/mace/core/allocator.h
index 51f04741..d1101413 100644
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -23,9 +23,10 @@
 #include <cstring>
 
 #include "mace/core/macros.h"
-#include "mace/core/registry.h"
 #include "mace/core/types.h"
 #include "mace/core/runtime_failure_mock.h"
+#include "mace/public/mace.h"
+#include "mace/utils/logging.h"
 
 namespace mace {
 
diff --git a/mace/core/net.cc b/mace/core/net.cc
index 757b4831..63ca5792 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -27,8 +27,7 @@
 
 namespace mace {
 
-SerialNet::SerialNet(OpDefRegistryBase *op_def_registry,
-                     const OpRegistryBase *op_registry,
+SerialNet::SerialNet(const OpRegistryBase *op_registry,
                      const NetDef *net_def,
                      Workspace *ws,
                      Device *target_device,
@@ -41,15 +40,7 @@ SerialNet::SerialNet(OpDefRegistryBase *op_def_registry,
                         target_device->cpu_runtime()->policy(),
                         target_device->cpu_runtime()->use_gemmlowp())) {
   MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
-  // Register Operations
-  MaceStatus status;
-  for (int idx = 0; idx < net_def->op_types_size(); ++idx) {
-    status = op_def_registry->Register(net_def->op_types(idx));
-    MACE_CHECK(status == MaceStatus::MACE_SUCCESS, status.information());
-  }
   // Create Operations
-  operators_.clear();
-  const OpRegistrationInfo *info;
   DeviceType target_device_type = target_device_->device_type();
   OpConstructContext construct_context(ws_);
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
@@ -59,16 +50,13 @@ SerialNet::SerialNet(OpDefRegistryBase *op_def_registry,
         ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
             operator_def, "device", static_cast<int>(target_device_type));
     if (op_device == target_device_type) {
-      // Find op registration information
-      status = op_def_registry->Find(operator_def.type(), &info);
-      MACE_CHECK(status == MaceStatus::MACE_SUCCESS, status.information());
       // Get available devices (sorted based on priority)
       OperatorDef temp_def(operator_def);
-      auto available_devices = info->device_place_func_();
+      auto available_devices = op_registry->AvailableDevices(temp_def.type());
       // Find the device type to run the op.
       // If the target_device_type in available devices, use target_device_type,
-      // otherwise, fallback to the first device (top priority).
-      DeviceType device_type = available_devices[0];
+      // otherwise, fallback to CPU device.
+      DeviceType device_type = DeviceType::CPU;
       construct_context.set_device(cpu_device_);
       for (auto device : available_devices) {
         if (device == target_device_type) {
diff --git a/mace/core/net.h b/mace/core/net.h
index 799e07d4..d5a6725f 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -21,8 +21,6 @@
 #include <unordered_map>
 #include <sstream>
 
-#include "mace/core/op_def_registry.h"
-
 #include "mace/core/operator.h"
 
 namespace mace {
@@ -45,8 +43,7 @@ class NetBase {
 
 class SerialNet : public NetBase {
  public:
-  SerialNet(OpDefRegistryBase *op_def_registry,
-            const OpRegistryBase *op_registry,
+  SerialNet(const OpRegistryBase *op_registry,
             const NetDef *net_def,
             Workspace *ws,
             Device *target_device,
diff --git a/mace/core/op_def_registry.cc b/mace/core/op_def_registry.cc
deleted file mode 100644
index 7bb8de9e..00000000
--- a/mace/core/op_def_registry.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/core/op_def_registry.h"
-#include "mace/utils/logging.h"
-
-namespace mace {
-
-void AddOpRegistrar(OpDefRegistryBase *registry,
-                    const OpRegistrationBuilder &builder) {
-  registry->AddRegistrar(
-      builder.name(),
-      [builder](OpRegistrationInfo *info){
-        builder.Finalize(info);
-      });
-}
-
-OpRegistrationBuilder::OpRegistrationBuilder(const std::string name)
-    : name_(name) {}
-
-const std::string OpRegistrationBuilder::name() const { return name_; }
-
-OpRegistrationBuilder &OpRegistrationBuilder::SetDevicePlaceFunc(
-    std::vector<DeviceType> (*func)()) {
-  info_.device_place_func_ = func;
-  return *this;
-}
-
-void OpRegistrationBuilder::Finalize(OpRegistrationInfo *info) const {
-  *info = info_;
-}
-
-void OpDefRegistryBase::AddRegistrar(const std::string name,
-                                    const OpRegistrar &registrar) {
-  registrar_.emplace(name, registrar);
-}
-
-MaceStatus OpDefRegistryBase::Register(const std::string &name) {
-  VLOG(3) << "Registering operation definition: " << name;
-  if (registry_.find(name) != registry_.end()) {
-    return MaceStatus::MACE_SUCCESS;
-  }
-  auto iter = registrar_.find(name);
-  if (iter == registrar_.end()) {
-    return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
-                      "MACE do not support the operation: " + name);
-  }
-  registry_.emplace(
-      name, std::unique_ptr<OpRegistrationInfo>(new OpRegistrationInfo()));
-  iter->second(registry_[name].get());
-  return MaceStatus::MACE_SUCCESS;
-}
-
-MaceStatus OpDefRegistryBase::Find(const std::string &name,
-                                  const OpRegistrationInfo **info) {
-  auto iter = registry_.find(name);
-  if (iter == registry_.end()) {
-    *info = nullptr;
-    return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
-                      "Mace do not support the operation: " + name);
-  }
-  *info = iter->second.get();
-  return MaceStatus::MACE_SUCCESS;
-}
-
-}  // namespace mace
diff --git a/mace/core/op_def_registry.h b/mace/core/op_def_registry.h
deleted file mode 100644
index 8e015658..00000000
--- a/mace/core/op_def_registry.h
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_CORE_OP_DEF_REGISTRY_H_
-#define MACE_CORE_OP_DEF_REGISTRY_H_
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "mace/proto/mace.pb.h"
-#include "mace/public/mace.h"
-#include "mace/utils/utils.h"
-
-namespace mace {
-
-// Device placement function
-typedef std::function<std::vector<DeviceType>()> DevicePlaceFunc;
-
-struct OpRegistrationInfo {
-  OpRegistrationInfo() = default;
-  explicit OpRegistrationInfo(const DevicePlaceFunc &func)
-      : device_place_func_(func) {}
-
-  DevicePlaceFunc device_place_func_;
-};
-
-class OpRegistrationBuilder {
- public:
-  explicit OpRegistrationBuilder(const std::string name);
-
-  const std::string name() const;
-
-  OpRegistrationBuilder &SetDevicePlaceFunc(
-      std::vector<DeviceType> (*func)());
-
-  void Finalize(OpRegistrationInfo *info) const;
- private:
-  std::string name_;
-  OpRegistrationInfo info_;
-};
-
-class OpDefRegistryBase {
- public:
-  typedef std::function<void(OpRegistrationInfo *)> OpRegistrar;
-  OpDefRegistryBase() = default;
-  virtual ~OpDefRegistryBase() = default;
-  void AddRegistrar(const std::string name, const OpRegistrar &registrar);
-  MaceStatus Register(const std::string &name);
-  MaceStatus Find(const std::string &name, const OpRegistrationInfo **info);
-
- private:
-  std::unordered_map<std::string, OpRegistrar> registrar_;
-  std::unordered_map<
-      std::string,
-      std::unique_ptr<OpRegistrationInfo>> registry_;
-  MACE_DISABLE_COPY_AND_ASSIGN(OpDefRegistryBase);
-};
-
-void AddOpRegistrar(OpDefRegistryBase *registry,
-                    const OpRegistrationBuilder &builder);
-
-#define MACE_REGISTER_OP_DEF(op_def_registry, builder) \
-  AddOpRegistrar(op_def_registry, builder)
-
-}  // namespace mace
-
-#endif  // MACE_CORE_OP_DEF_REGISTRY_H_
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index d29c84e3..9a1da4c8 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <sstream>
+#include <map>
 #include <memory>
 #include <vector>
 
@@ -79,7 +80,26 @@ MaceStatus Operation::Init(OpInitContext *context) {
   return MaceStatus::MACE_SUCCESS;
 }
 
-OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
+// op registry
+namespace {
+class OpKeyBuilder {
+ public:
+  explicit OpKeyBuilder(const std::string &op_name);
+
+  OpKeyBuilder &Device(DeviceType device);
+
+  OpKeyBuilder &TypeConstraint(const char *attr_name,
+                               DataType allowed);
+
+  const std::string Build();
+
+ private:
+  std::string op_name_;
+  DeviceType device_type_;
+  std::map<std::string, DataType> type_constraint_;
+};
+
+OpKeyBuilder::OpKeyBuilder(const std::string &op_name) : op_name_(op_name) {}
 
 OpKeyBuilder &OpKeyBuilder::Device(DeviceType device) {
   device_type_ = device;
@@ -103,16 +123,53 @@ const std::string OpKeyBuilder::Build() {
 
   return ss.str();
 }
+}  // namespace
+
+void OpRegistrationInfo::AddDevice(mace::DeviceType device) {
+  devices.insert(device);
+}
+
+void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) {
+  VLOG(3) << "Registering: " << key;
+  MACE_CHECK(creators.count(key) == 0, "Key already registered: ", key);
+  creators[key] = creator;
+}
+
+MaceStatus OpRegistryBase::Register(const std::string &op_type,
+                                const mace::DeviceType device_type,
+                                const mace::DataType dt,
+                                mace::OpRegistrationInfo::OpCreator creator) {
+  if (registry_.count(op_type) == 0) {
+    registry_[op_type] = std::unique_ptr<OpRegistrationInfo>(
+        new OpRegistrationInfo);
+  }
+  registry_[op_type]->AddDevice(device_type);
+
+  std::string op_key = OpKeyBuilder(op_type)
+      .Device(device_type)
+      .TypeConstraint("T", dt)
+      .Build();
+  registry_.at(op_type)->Register(op_key, creator);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+const std::set<DeviceType> OpRegistryBase::AvailableDevices(
+    const std::string &op_type) const {
+  MACE_CHECK(registry_.count(op_type) != 0,
+             op_type, " operation is not registered.");
+
+  return registry_.at(op_type)->devices;
+}
 
-OpRegistryBase::~OpRegistryBase() = default;
 
 std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
     OpConstructContext *context,
     DeviceType device_type,
     const NetMode mode) const {
   OperatorDef *operator_def = context->operator_def();
-  const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-      *operator_def, "T", static_cast<int>(DT_FLOAT));
+  const DataType dtype = static_cast<DataType>(
+      ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+          *operator_def, "T", static_cast<int>(DT_FLOAT)));
   const int op_mode_i = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
       *operator_def, "mode", static_cast<int>(NetMode::NORMAL));
   const NetMode op_mode = static_cast<NetMode>(op_mode_i);
@@ -120,15 +177,20 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
           << operator_def->type() << "<" << dtype << ">" << ") on "
           << device_type;
   if (op_mode == mode) {
-    return registry_.Create(
-        OpKeyBuilder(operator_def->type().data())
-            .Device(device_type)
-            .TypeConstraint("T", static_cast<DataType>(dtype))
-            .Build(),
-        context);
+    const std::string op_type = context->operator_def()->type();
+    MACE_CHECK(registry_.count(op_type) != 0,
+               op_type, " operation is not registered.");
+
+    std::string key = OpKeyBuilder(op_type)
+        .Device(device_type)
+        .TypeConstraint("T", dtype)
+        .Build();
+    if (registry_.at(op_type)->creators.count(key) == 0) {
+      LOG(FATAL) << "Key not registered: " << key;
+    }
+    return registry_.at(op_type)->creators.at(key)(context);
   } else {
     return nullptr;
   }
 }
-
 }  // namespace mace
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 34de7e72..c354afbd 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -16,13 +16,13 @@
 #define MACE_CORE_OPERATOR_H_
 
 #include <memory>
+#include <set>
 #include <string>
+#include <unordered_map>
 #include <vector>
-#include <map>
 
 #include "mace/core/arg_helper.h"
 #include "mace/core/op_context.h"
-#include "mace/core/registry.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
 #include "mace/proto/mace.pb.h"
@@ -160,62 +160,57 @@ class Operation {
 #define MACE_OP_OUTPUT_TAGS(first_input, ...) \
   enum _OutputTags { first_input = 0, __VA_ARGS__ }
 
-class OpKeyBuilder {
- public:
-  explicit OpKeyBuilder(const char *op_name);
 
-  OpKeyBuilder &Device(DeviceType device);
+struct OpRegistrationInfo {
+ public:
+  typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
+      OpCreator;
 
-  OpKeyBuilder &TypeConstraint(const char *attr_name,
-                               DataType allowed);
+  OpRegistrationInfo() = default;
 
-  template <typename T>
-  OpKeyBuilder &TypeConstraint(const char *attr_name);
+  void AddDevice(DeviceType);
 
-  const std::string Build();
+  void Register(const std::string &key, OpCreator creator);
 
- private:
-  std::string op_name_;
-  DeviceType device_type_;
-  std::map<std::string, DataType> type_constraint_;
+  std::set<DeviceType> devices;
+  std::unordered_map<std::string, OpCreator> creators;
 };
 
-template <typename T>
-OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
-  return this->TypeConstraint(attr_name, DataTypeToEnum<T>::value);
-}
-
 class OpRegistryBase {
  public:
-  typedef Registry<std::string,
-                   Operation,
-                   OpConstructContext *>
-      RegistryType;
   OpRegistryBase() = default;
-  virtual ~OpRegistryBase();
-  RegistryType *registry() { return &registry_; }
+  virtual ~OpRegistryBase() = default;
+  MaceStatus Register(const std::string &op_type,
+                      const DeviceType device_type,
+                      const DataType dt,
+                      OpRegistrationInfo::OpCreator creator);
+
+  const std::set<DeviceType> AvailableDevices(
+      const std::string &op_type) const;
+
   std::unique_ptr<Operation> CreateOperation(
       OpConstructContext *context,
       DeviceType device_type,
       const NetMode mode) const;
 
+  template <class DerivedType>
+  static std::unique_ptr<Operation> DefaultCreator(
+      OpConstructContext *context) {
+    return std::unique_ptr<Operation>(new DerivedType(context));
+  }
+
  private:
-  RegistryType registry_;
+  std::unordered_map<
+      std::string,
+      std::unique_ptr<OpRegistrationInfo>> registry_;
   MACE_DISABLE_COPY_AND_ASSIGN(OpRegistryBase);
 };
 
-MACE_DECLARE_REGISTRY(OpRegistry,
-                      Operation,
-                      OpConstructContext *);
-
 #define MACE_REGISTER_OP(op_registry, op_type, class_name, device, dt) \
-  MACE_REGISTER_CLASS(OpRegistry,                                      \
-                      op_registry->registry(),                         \
-                      OpKeyBuilder(op_type)                            \
-                        .Device(device)                                \
-                        .TypeConstraint<dt>("T")                       \
-                        .Build(),                                      \
-                      class_name<device, dt>)
+  op_registry->Register(op_type,                                       \
+                        device,                                        \
+                        DataTypeToEnum<dt>::value,                     \
+                        OpRegistryBase::DefaultCreator<class_name<device, dt>>)
 
 }  // namespace mace
 
diff --git a/mace/core/registry.h b/mace/core/registry.h
deleted file mode 100644
index 1ad92f0a..00000000
--- a/mace/core/registry.h
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_CORE_REGISTRY_H_
-#define MACE_CORE_REGISTRY_H_
-
-#include <functional>
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT(build/c++11)
-#include <string>
-#include <vector>
-
-#include "mace/utils/logging.h"
-
-namespace mace {
-
-template <class SrcType, class ObjectType, class... Args>
-class Registry {
- public:
-  typedef std::function<std::unique_ptr<ObjectType>(Args...)> Creator;
-
-  Registry() : registry_() {}
-
-  void Register(const SrcType &key, Creator creator) {
-    VLOG(3) << "Registering: " << key;
-    std::lock_guard<std::mutex> lock(register_mutex_);
-    MACE_CHECK(registry_.count(key) == 0, "Key already registered: ", key);
-    registry_[key] = creator;
-  }
-
-  std::unique_ptr<ObjectType> Create(const SrcType &key, Args... args) const {
-    if (registry_.count(key) == 0) {
-      LOG(FATAL) << "Key not registered: " << key;
-    }
-    return registry_.at(key)(args...);
-  }
-
- private:
-  std::map<SrcType, Creator> registry_;
-  std::mutex register_mutex_;
-
-  MACE_DISABLE_COPY_AND_ASSIGN(Registry);
-};
-
-template <class SrcType, class ObjectType, class... Args>
-class Registerer {
- public:
-  Registerer(const SrcType &key,
-             Registry<SrcType, ObjectType, Args...> *registry,
-             typename Registry<SrcType, ObjectType, Args...>::Creator creator) {
-    registry->Register(key, creator);
-  }
-
-  template <class DerivedType>
-  static std::unique_ptr<ObjectType> DefaultCreator(Args... args) {
-    return std::unique_ptr<ObjectType>(new DerivedType(args...));
-  }
-};
-
-#define MACE_CONCATENATE_IMPL(s1, s2) s1##s2
-#define MACE_CONCATENATE(s1, s2) MACE_CONCATENATE_IMPL(s1, s2)
-#ifdef __COUNTER__
-#define MACE_ANONYMOUS_VARIABLE(str) MACE_CONCATENATE(str, __COUNTER__)
-#else
-#define MACE_ANONYMOUS_VARIABLE(str) MACE_CONCATENATE(str, __LINE__)
-#endif
-
-#define MACE_DECLARE_TYPED_REGISTRY(RegistryName, SrcType, ObjectType, ...) \
-  typedef Registerer<SrcType, ObjectType, ##__VA_ARGS__>                    \
-      Registerer##RegistryName;
-
-#define MACE_DECLARE_REGISTRY(RegistryName, ObjectType, ...)         \
-  MACE_DECLARE_TYPED_REGISTRY(RegistryName, std::string, ObjectType, \
-                              ##__VA_ARGS__)
-
-#define MACE_REGISTER_TYPED_CLASS(RegistryName, registry, key, ...) \
-  Registerer##RegistryName MACE_ANONYMOUS_VARIABLE(RegistryName)( \
-      key, registry, Registerer##RegistryName::DefaultCreator<__VA_ARGS__>);
-
-#define MACE_REGISTER_CLASS(RegistryName, registry, key, ...) \
-  MACE_REGISTER_TYPED_CLASS(RegistryName, registry, key, __VA_ARGS__)
-
-}  // namespace mace
-
-#endif  // MACE_CORE_REGISTRY_H_
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 4c03f331..7cf01043 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -15,10 +15,11 @@
 #ifndef MACE_CORE_TENSOR_H_
 #define MACE_CORE_TENSOR_H_
 
+#include <algorithm>
+#include <functional>
+#include <numeric>
 #include <string>
 #include <vector>
-#include <functional>
-#include <algorithm>
 
 #include "mace/core/buffer.h"
 #include "mace/core/preallocated_pooled_allocator.h"
diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
deleted file mode 100644
index 88528578..00000000
--- a/mace/kernels/BUILD
+++ /dev/null
@@ -1,150 +0,0 @@
-# Description:
-# Mace neon kernels.
-#
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-load(
-    "//mace:mace.bzl",
-    "if_android",
-    "if_neon_enabled",
-    "if_openmp_enabled",
-    "if_android_armv7",
-    "if_hexagon_enabled",
-    "if_opencl_enabled",
-)
-
-cc_library(
-    name = "kernels",
-    srcs = glob(
-        [
-            "*.cc",
-            "arm/*.cc",
-        ],
-        exclude = [
-            "*_test.cc",
-            "*_benchmark.cc",
-            "arm/*_test.cc",
-            "buffer_inverse_transform.cc",
-            "buffer_transform.cc",
-            "lstm_cell.cc",
-            "winograd_transform.cc",
-        ],
-    ) + if_opencl_enabled(glob(
-        [
-            "opencl/*.cc",
-            "opencl/image/*.cc",
-            "opencl/buffer/*.cc",
-            "buffer_inverse_transform.cc",
-            "buffer_transform.cc",
-            "lstm_cell.cc",
-            "winograd_transform.cc",
-        ],
-        exclude = [
-            "opencl/*_test.cc",
-        ],
-    )),
-    hdrs = glob(
-        [
-            "*.h",
-            "arm/*.h",
-        ],
-    ) + if_opencl_enabled(glob([
-        "opencl/*.h",
-        "opencl/image/*.h",
-        "opencl/buffer/*.h",
-    ])),
-    copts = [
-        "-Werror",
-        "-Wextra",
-        "-Wno-missing-field-initializers",
-    ] + if_openmp_enabled([
-        "-fopenmp",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
-    ]) + if_android_armv7([
-        "-mfpu=neon",
-    ]) + if_android_armv7([
-        "-mfloat-abi=softfp",
-    ]) + if_opencl_enabled([
-        "-DMACE_ENABLE_OPENCL",
-    ]) + if_hexagon_enabled([
-        "-DMACE_ENABLE_HEXAGON",
-    ]),
-    linkopts = if_android(["-lm"]),
-    deps = [
-        "//mace/core",
-        "@gemmlowp",
-        "@tflite",
-    ],
-)
-
-cc_test(
-    name = "kernels_test",
-    testonly = 1,
-    srcs = glob(
-        [
-            "*_test.cc",
-            "arm/*_test.cc",
-            "opencl/*_test.cc",
-        ],
-    ),
-    copts = [
-        "-Werror",
-        "-Wextra",
-        "-Wno-missing-field-initializers",
-    ] + if_openmp_enabled([
-        "-fopenmp",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
-    ]) + if_android_armv7([
-        "-mfpu=neon",
-        "-mfloat-abi=softfp",
-    ]) + if_opencl_enabled([
-        "-DMACE_ENABLE_OPENCL",
-    ]) + if_hexagon_enabled([
-        "-DMACE_ENABLE_HEXAGON",
-    ]),
-    linkopts = ["-fopenmp"],
-    linkstatic = 1,
-    deps = [
-        ":kernels",
-        "//mace/ops",
-        "@gtest",
-        "@gtest//:gtest_main",
-    ],
-)
-
-cc_test(
-    name = "kernels_benchmark",
-    testonly = 1,
-    srcs = glob(["*_benchmark.cc"]),
-    copts = [
-        "-Werror",
-        "-Wextra",
-        "-Wno-missing-field-initializers",
-    ] + if_openmp_enabled([
-        "-fopenmp",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
-    ]) + if_android_armv7([
-        "-mfpu=neon",
-        "-mfloat-abi=softfp",
-    ]) + if_opencl_enabled([
-        "-DMACE_ENABLE_OPENCL",
-    ]) + if_hexagon_enabled([
-        "-DMACE_ENABLE_HEXAGON",
-    ]),
-    linkopts = ["-fopenmp"],
-    linkstatic = 1,
-    deps = [
-        ":kernels",
-        "//mace/core:test_benchmark_main",
-        "//mace/ops",
-        "//third_party/eigen3",
-        "@gemmlowp",
-    ],
-)
diff --git a/mace/kernels/matmul_benchmark.cc b/mace/kernels/matmul_benchmark.cc
deleted file mode 100644
index ef19bd6c..00000000
--- a/mace/kernels/matmul_benchmark.cc
+++ /dev/null
@@ -1,289 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <Eigen/Dense>
-#include <algorithm>
-#include <string>
-#include <tuple>
-#include <vector>
-
-#include "public/gemmlowp.h"
-#include "mace/core/testing/test_benchmark.h"
-#include "mace/kernels/gemm.h"
-#include "mace/kernels/sgemm.h"
-#include "mace/ops/ops_test_util.h"
-
-namespace gemmlowp {
-
-template<typename tScalar, MapOrder tOrder>
-class Matrix : public MatrixMap<tScalar, tOrder> {
- public:
-  typedef MatrixMap<tScalar, tOrder> Map;
-  typedef MatrixMap<const tScalar, tOrder> ConstMap;
-  typedef typename Map::Scalar Scalar;
-  static const MapOrder Order = tOrder;
-  using Map::cols_;
-  using Map::data_;
-  using Map::kOrder;
-  using Map::rows_;
-  using Map::stride_;
-
- public:
-  Matrix() : Map(nullptr, 0, 0, 0) {}
-
-  Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }
-
-  Matrix(const Matrix &other) : Map(nullptr, 0, 0, 0) { *this = other; }
-
-  Matrix &operator=(const Matrix &other) {
-    Resize(other.rows_, other.cols_);
-    std::memcpy(data_, other.data_, size() * sizeof(Scalar));
-    return *this;
-  }
-
-  friend bool operator==(const Matrix &a, const Matrix &b) {
-    return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
-        !std::memcmp(a.data_, b.data_, a.size());
-  }
-
-  void Resize(int rows, int cols) {
-    rows_ = rows;
-    cols_ = cols;
-    stride_ = kOrder == gemmlowp::MapOrder::ColMajor ? rows : cols;
-    storage.resize(size());
-    data_ = storage.data();
-  }
-
-  int size() const { return rows_ * cols_; }
-
-  Map &map() { return *static_cast<Map *>(this); }
-
-  ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }
-
- protected:
-  std::vector<Scalar> storage;
-};
-
-template<typename MatrixType>
-void MakeZero(MatrixType *m) {
-  for (int c = 0; c < m->cols(); c++) {
-    for (int r = 0; r < m->rows(); r++) {
-      (*m)(r, c) = 128;
-    }
-  }
-}
-
-}  // namespace gemmlowp
-
-namespace mace {
-namespace kernels {
-namespace test {
-
-// Test the speed of different access order of a NHWC buffer
-
-namespace {
-
-// Matmul with (m, k) x (k, n)
-void MatmulBenchmark_Mace(int iters, int m, int k, int n) {
-  mace::testing::StopTiming();
-  std::vector<float> lhs(m * k);
-  std::vector<float> rhs(k * n);
-  std::vector<float> result(m * n);
-  // warm up
-  Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data());
-  mace::testing::StartTiming();
-  while (iters--) {
-    Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data());
-  }
-}
-
-void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) {
-  mace::testing::StopTiming();
-  std::vector<float> lhs(m * k);
-  std::vector<float> rhs(k * n);
-  std::vector<float> result(m * n);
-
-  kernels::MatrixMap<const float> matrix_lhs(1, m, k, RowMajor, lhs.data(),
-                                             true);
-  kernels::MatrixMap<const float> matrix_rhs(1, k, n, RowMajor, rhs.data(),
-                                             true);
-  kernels::MatrixMap<float> matrix_result(1, m, n, RowMajor, result.data());
-
-  kernels::SGemm sgemm;
-
-  sgemm(matrix_lhs, matrix_rhs, &matrix_result);
-
-  mace::testing::StartTiming();
-  while (iters--) {
-    sgemm(matrix_lhs, matrix_rhs, &matrix_result);
-  }
-}
-
-void MatmulBenchmark_Eigen(int iters, int m, int k, int n) {
-  mace::testing::StopTiming();
-  Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(m, k);
-  Eigen::MatrixXf rhs = Eigen::MatrixXf::Random(k, n);
-  Eigen::MatrixXf result = Eigen::MatrixXf::Zero(m, n);
-  // warm up
-  result = lhs * rhs;
-  mace::testing::StartTiming();
-  while (iters--) {
-    result = lhs * rhs;
-  }
-}
-
-void MatmulBenchmark_gemmlowp_uint8(int iters, int rows, int depth, int cols) {
-  mace::testing::StopTiming();
-
-  gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::RowMajor> lhs;
-  gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::ColMajor> rhs;
-  gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::ColMajor> result;
-  lhs.Resize(rows, depth);
-  rhs.Resize(depth, cols);
-  result.Resize(rows, cols);
-  gemmlowp::MakeZero(&lhs);
-  gemmlowp::MakeZero(&rhs);
-  gemmlowp::MakeZero(&result);
-
-  gemmlowp::OutputStageQuantizeDownInt32ByFixedPoint quantize_down_stage;
-  quantize_down_stage.result_offset_after_shift = 128;
-  quantize_down_stage.result_fixedpoint_multiplier = 1234567890;
-  quantize_down_stage.result_shift = 16;
-  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
-  const auto output_pipeline =
-      std::make_tuple(quantize_down_stage, saturating_cast_stage);
-
-  auto gemm_context =
-      mace::ops::test::OpTestContext::Get()
-          ->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
-  MACE_CHECK_NOTNULL(gemm_context);
-
-  using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
-
-  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
-      gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-      -128, output_pipeline);
-
-  mace::testing::StartTiming();
-  while (iters--) {
-    gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
-                                     BitDepthParams>(
-        gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-        -128, output_pipeline);
-  }
-}
-
-void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
-  mace::testing::StopTiming();
-
-  gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::RowMajor> lhs;
-  gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::ColMajor> rhs;
-  gemmlowp::Matrix<std::int32_t, gemmlowp::MapOrder::ColMajor> result;
-  lhs.Resize(rows, depth);
-  rhs.Resize(depth, cols);
-  result.Resize(rows, cols);
-  gemmlowp::MakeZero(&lhs);
-  gemmlowp::MakeZero(&rhs);
-  gemmlowp::MakeZero(&result);
-
-  const auto output_pipeline = std::make_tuple();
-
-  auto gemm_context =
-      mace::ops::test::OpTestContext::Get()
-          ->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
-  MACE_CHECK_NOTNULL(gemm_context);
-
-  using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
-
-  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
-      gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-      -128, output_pipeline);
-
-  mace::testing::StartTiming();
-  while (iters--) {
-    gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
-                                     BitDepthParams>(
-        gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-        -128, output_pipeline);
-  }
-}
-
-}  // namespace
-
-#define MACE_BM_MATMUL_FUNC(M, K, N, FUNC, TYPE)                   \
-  static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \
-    const int64_t macc = static_cast<int64_t>(iters) * M * K * N;  \
-    const int64_t tot = static_cast<int64_t>(iters) * (M + N) * K; \
-    mace::testing::MaccProcessed(macc);                            \
-    mace::testing::BytesProcessed(tot * sizeof(TYPE));             \
-    MatmulBenchmark_##FUNC(iters, M, K, N);                        \
-  }                                                                \
-  MACE_BENCHMARK(MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC)
-
-#define MACE_BM_MATMUL(M, K, N)                          \
-  MACE_BM_MATMUL_FUNC(M, K, N, Mace, float);             \
-  MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float);       \
-  MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float);            \
-  MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \
-  MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_int32, uint8_t);
-
-// Embedding size 384
-MACE_BM_MATMUL(7, 384, 384);
-MACE_BM_MATMUL(7, 384, 1536);
-MACE_BM_MATMUL(7, 1536, 384);
-
-MACE_BM_MATMUL(15, 384, 384);
-MACE_BM_MATMUL(15, 384, 1536);
-MACE_BM_MATMUL(15, 1536, 384);
-
-MACE_BM_MATMUL(1, 256, 256);
-MACE_BM_MATMUL(1, 256, 1536);
-MACE_BM_MATMUL(1, 1536, 256);
-MACE_BM_MATMUL(256, 256, 1);
-MACE_BM_MATMUL(1536, 256, 1);
-MACE_BM_MATMUL(256, 1536, 1);
-MACE_BM_MATMUL(29792, 256, 1);
-MACE_BM_MATMUL(1, 256, 29792);
-MACE_BM_MATMUL(2, 256, 256);
-MACE_BM_MATMUL(2, 256, 1536);
-MACE_BM_MATMUL(2, 1536, 256);
-MACE_BM_MATMUL(3, 256, 256);
-MACE_BM_MATMUL(3, 256, 1536);
-MACE_BM_MATMUL(3, 1536, 256);
-MACE_BM_MATMUL(4, 256, 256);
-MACE_BM_MATMUL(4, 256, 1536);
-MACE_BM_MATMUL(4, 1536, 256);
-MACE_BM_MATMUL(8, 256, 256);
-MACE_BM_MATMUL(8, 256, 1536);
-MACE_BM_MATMUL(8, 1536, 256);
-MACE_BM_MATMUL(10, 256, 256);
-MACE_BM_MATMUL(10, 256, 1536);
-MACE_BM_MATMUL(10, 1536, 256);
-MACE_BM_MATMUL(15, 256, 256);
-MACE_BM_MATMUL(15, 256, 1536);
-MACE_BM_MATMUL(15, 1536, 256);
-
-// Embedding size 128
-MACE_BM_MATMUL(1, 128, 1536);
-MACE_BM_MATMUL(1, 128, 44678);
-
-// MobileNet
-MACE_BM_MATMUL(128, 128, 3136);
-MACE_BM_MATMUL(256, 256, 784);
-MACE_BM_MATMUL(512, 512, 196);
-MACE_BM_MATMUL(1024, 1024, 49);
-
-}  // namespace test
-}  // namespace kernels
-}  // namespace mace
diff --git a/mace/libmace/BUILD b/mace/libmace/BUILD
index 4e887713..d227f259 100644
--- a/mace/libmace/BUILD
+++ b/mace/libmace/BUILD
@@ -40,7 +40,6 @@ cc_library(
     deps = [
         "//mace/public",
         "//mace/ops",
-        "//mace/kernels",
     ],
     alwayslink = 1,
 )
@@ -79,7 +78,7 @@ genrule(
     srcs = [
         "//mace/codegen:generated_version",
         "//mace/core",
-        "//mace/kernels",
+        "//mace/ops:internal_ops",
         "//mace/ops",
         "//mace/libmace",
         "//mace/utils",
@@ -93,7 +92,7 @@ genrule(
           "mri_stream=$$(python $(location //mace/python/tools:archive_static_lib) " +
           "$(locations //mace/codegen:generated_version) " +
           "$(locations //mace/core:core) " +
-          "$(locations //mace/kernels:kernels) " +
+          "$(locations //mace/ops:internal_ops) " +
           "$(locations //mace/ops:ops) " +
           "$(locations //mace/libmace:libmace) " +
           "$(locations //mace/utils:utils) " +
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index a7494086..c4d65f7b 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -22,8 +22,7 @@
 
 #include "mace/core/net.h"
 #include "mace/core/device_context.h"
-#include "mace/kernels/ops_register.h"
-#include "mace/ops/ops_def_register.h"
+#include "mace/ops/ops_registry.h"
 #include "mace/public/mace.h"
 
 #ifdef MACE_ENABLE_OPENCL
@@ -359,7 +358,6 @@ class MaceEngine::Impl {
  private:
   const unsigned char *model_data_;
   size_t model_data_size_;
-  std::unique_ptr<OpDefRegistryBase> op_def_registry_;
   std::unique_ptr<OpRegistryBase> op_registry_;
   DeviceType device_type_;
   std::unique_ptr<Device> device_;
@@ -377,7 +375,6 @@ class MaceEngine::Impl {
 MaceEngine::Impl::Impl(const MaceEngineConfig &config)
     : model_data_(nullptr),
       model_data_size_(0),
-      op_def_registry_(new OpDefRegistry()),
       op_registry_(new OpRegistry),
       device_type_(config.impl_->device_type()),
       device_(nullptr),
@@ -466,7 +463,6 @@ MaceStatus MaceEngine::Impl::Init(
 
     // Init model
     auto net = std::unique_ptr<NetBase>(new SerialNet(
-        op_def_registry_.get(),
         op_registry_.get(),
         net_def,
         ws_.get(),
@@ -474,8 +470,7 @@ MaceStatus MaceEngine::Impl::Init(
         NetMode::INIT));
     MACE_RETURN_IF_ERROR(net->Init());
     MACE_RETURN_IF_ERROR(net->Run());
-    net_ = std::unique_ptr<NetBase>(new SerialNet(op_def_registry_.get(),
-                                                  op_registry_.get(),
+    net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
                                                   net_def,
                                                   ws_.get(),
                                                   device_.get()));
diff --git a/mace/ops/BUILD b/mace/ops/BUILD
index d039f8c8..bfdf85a5 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -1,5 +1,4 @@
 # Description:
-# Mace operators.
 #
 package(
     default_visibility = ["//visibility:public"],
@@ -18,18 +17,58 @@ load(
 )
 
 cc_library(
-    name = "test",
-    testonly = 1,
-    hdrs = glob([
-        "*_test_util.h",
-    ]),
-    srcs = [
-        "ops_test_util.cc",
-    ],
+    name = "internal_ops",
+    srcs = glob(
+        [
+            "*.cc",
+            "arm/*.cc",
+        ],
+        exclude = [
+            "*_test.cc",
+            "*_benchmark.cc",
+            "arm/*_test.cc",
+            "ops_registry.cc",
+            "ops_test_util.cc",
+            "buffer_inverse_transform.cc",
+            "buffer_transform.cc",
+            "lstm_cell.cc",
+            "winograd_transform.cc",
+        ],
+    ) + if_opencl_enabled(glob(
+        [
+            "opencl/*.cc",
+            "opencl/image/*.cc",
+            "opencl/buffer/*.cc",
+            "buffer_inverse_transform.cc",
+            "buffer_transform.cc",
+            "lstm_cell.cc",
+            "winograd_transform.cc",
+        ],
+        exclude = [
+            "opencl/*_test.cc",
+        ],
+    )),
+    hdrs = glob(
+        [
+            "*.h",
+            "arm/*.h",
+        ],
+        exclude = [
+            "ops_registry.h",
+            "ops_test_util.h",
+        ]
+    ) + if_opencl_enabled(glob([
+        "opencl/*.h",
+        "opencl/image/*.h",
+        "opencl/buffer/*.h",
+    ])),
     copts = [
         "-Werror",
         "-Wextra",
-    ] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
+        "-Wno-missing-field-initializers",
+    ] + if_openmp_enabled([
+        "-fopenmp",
+    ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
         "-mfpu=neon",
@@ -40,20 +79,54 @@ cc_library(
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
     ]),
+    linkopts = if_android(["-lm"]),
     deps = [
-        "ops",
-        "//mace/kernels",
-        "@gtest",
+        "//mace/core",
+        "@gemmlowp",
+        "@tflite",
     ],
 )
 
+
 cc_library(
     name = "ops",
     srcs = [
-        "ops_def_register.cc",
+        "ops_registry.cc"
     ],
     hdrs = [
-        "ops_def_register.h",
+        "ops_registry.h",
+    ],
+    copts = [
+        "-Werror",
+        "-Wextra",
+        "-Wno-missing-field-initializers",
+    ] + if_openmp_enabled([
+        "-fopenmp",
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+    ]) + if_android_armv7([
+        "-mfloat-abi=softfp",
+    ]) + if_opencl_enabled([
+        "-DMACE_ENABLE_OPENCL",
+    ]) + if_hexagon_enabled([
+        "-DMACE_ENABLE_HEXAGON",
+    ]),
+    linkopts = if_android(["-lm"]),
+    deps = [
+        "internal_ops",
+    ],
+)
+
+cc_library(
+    name = "test",
+    testonly = 1,
+    hdrs = glob([
+        "*_test_util.h",
+    ]),
+    srcs = [
+        "ops_test_util.cc",
     ],
     copts = [
         "-Werror",
@@ -70,7 +143,8 @@ cc_library(
         "-DMACE_ENABLE_HEXAGON",
     ]),
     deps = [
-        "//mace/core",
+        "ops",
+        "@gtest",
     ],
 )
 
@@ -78,16 +152,22 @@ cc_test(
     name = "ops_test",
     testonly = 1,
     srcs = glob(
-        ["*_test.cc"],
+        [
+            "*_test.cc",
+            "arm/*_test.cc",
+            "opencl/*_test.cc",
+        ],
     ),
     copts = [
         "-Werror",
         "-Wextra",
-    ] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
+        "-Wno-missing-field-initializers",
+    ] + if_openmp_enabled([
+        "-fopenmp",
+    ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
         "-mfpu=neon",
-    ]) + if_android_armv7([
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -97,8 +177,7 @@ cc_test(
     linkopts = ["-fopenmp"],
     linkstatic = 1,
     deps = [
-        ":ops",
-        ":test",
+        "test",
         "@gtest//:gtest_main",
     ],
 )
@@ -110,11 +189,13 @@ cc_test(
     copts = [
         "-Werror",
         "-Wextra",
-    ] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
+        "-Wno-missing-field-initializers",
+    ] + if_openmp_enabled([
+        "-fopenmp",
+    ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
         "-mfpu=neon",
-    ]) + if_android_armv7([
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -124,8 +205,9 @@ cc_test(
     linkopts = ["-fopenmp"],
     linkstatic = 1,
     deps = [
-        ":ops",
-        ":test",
+        "test",
         "//mace/core:test_benchmark_main",
+        "//third_party/eigen3",
+        "@gemmlowp",
     ],
 )
diff --git a/mace/kernels/activation.cc b/mace/ops/activation.cc
similarity index 93%
rename from mace/kernels/activation.cc
rename to mace/ops/activation.cc
index 038c4549..7c733177 100644
--- a/mace/kernels/activation.cc
+++ b/mace/ops/activation.cc
@@ -12,18 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/activation.h"
+#include "mace/ops/activation.h"
 
 #include <memory>
 
 #include "mace/core/operator.h"
 
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/activation.h"
+#include "mace/ops/opencl/image/activation.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class ActivationOp;
@@ -33,7 +33,7 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
  public:
   explicit ActivationOp(OpConstructContext *context)
       : Operation(context),
-        activation_(kernels::StringToActivationType(
+        activation_(ops::StringToActivationType(
             Operation::GetOptionalArg<std::string>("activation",
                                                   "NOOP"))),
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit",
@@ -74,7 +74,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit ActivationOp(OpConstructContext *context)
       : Operation(context) {
-    ActivationType type = kernels::StringToActivationType(
+    ActivationType type = ops::StringToActivationType(
         Operation::GetOptionalArg<std::string>("activation",
                                               "NOOP"));
     auto relux_max_limit = static_cast<T>(
@@ -114,5 +114,5 @@ void RegisterActivation(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/activation.h b/mace/ops/activation.h
similarity index 95%
rename from mace/kernels/activation.h
rename to mace/ops/activation.h
index 12728465..2c9a1861 100644
--- a/mace/kernels/activation.h
+++ b/mace/ops/activation.h
@@ -12,19 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_ACTIVATION_H_
-#define MACE_KERNELS_ACTIVATION_H_
+#ifndef MACE_OPS_ACTIVATION_H_
+#define MACE_OPS_ACTIVATION_H_
 
 #include <algorithm>
 #include <cmath>
 #include <string>
 
 #include "mace/core/types.h"
-#include "mace/kernels/arm/activation_neon.h"
+#include "mace/ops/arm/activation_neon.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 enum ActivationType {
   NOOP = 0,
@@ -149,7 +149,7 @@ void PReLUActivation(const T *input_ptr,
   }
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_ACTIVATION_H_
+#endif  // MACE_OPS_ACTIVATION_H_
diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc
index 1f16879f..bd766047 100644
--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -14,7 +14,6 @@
 
 #include <string>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -46,7 +45,7 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
         .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluBM")
         .Input("InputImage")
@@ -108,7 +107,7 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluxBM")
         .Input("InputImage")
@@ -186,9 +185,9 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
         .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
 
     OpDefBuilder("Activation", "PreluBM")
         .Input("InputImage")
@@ -251,7 +250,7 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "TanhBM")
         .Input("InputImage")
@@ -318,7 +317,7 @@ void SigmoidBenchmark(
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "SigmoidBM")
         .Input("InputImage")
diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc
index 01735e97..f56a3a17 100644
--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -33,7 +32,7 @@ void TestSimpleRelu() {
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluTest")
         .Input("InputImage")
@@ -46,7 +45,7 @@ void TestSimpleRelu() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "ReluTest")
         .Input("Input")
@@ -81,7 +80,7 @@ void TestUnalignedSimpleRelu() {
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluTest")
         .Input("InputImage")
@@ -94,7 +93,7 @@ void TestUnalignedSimpleRelu() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "ReluTest")
         .Input("Input")
@@ -132,7 +131,7 @@ void TestSimpleRelux() {
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluxTest")
         .Input("InputImage")
@@ -146,7 +145,7 @@ void TestSimpleRelux() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "ReluxTest")
         .Input("Input")
@@ -182,7 +181,7 @@ void TestSimpleReluRelux() {
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluxTest")
         .Input("InputImage")
@@ -196,7 +195,7 @@ void TestSimpleReluRelux() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "ReluxTest")
         .Input("Input")
@@ -237,9 +236,9 @@ void TestSimplePrelu() {
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
 
     OpDefBuilder("Activation", "PreluTest")
         .Input("InputImage")
@@ -253,7 +252,7 @@ void TestSimplePrelu() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "PreluTest")
         .Input("Input")
@@ -293,7 +292,7 @@ void TestSimpleTanh() {
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "TanhTest")
         .Input("InputImage")
@@ -306,7 +305,7 @@ void TestSimpleTanh() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "TanhTest")
         .Input("Input")
@@ -346,7 +345,7 @@ void TestSimpleSigmoid() {
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "SigmoidTest")
         .Input("InputImage")
@@ -359,7 +358,7 @@ void TestSimpleSigmoid() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "SigmoidTest")
         .Input("Input")
diff --git a/mace/kernels/addn.cc b/mace/ops/addn.cc
similarity index 98%
rename from mace/kernels/addn.cc
rename to mace/ops/addn.cc
index 6634e8e8..4040de1f 100644
--- a/mace/kernels/addn.cc
+++ b/mace/ops/addn.cc
@@ -22,11 +22,11 @@
 #include "mace/core/operator.h"
 
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/addn.h"
+#include "mace/ops/opencl/image/addn.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 static constexpr int kCostPerGroup = 1024;
 
@@ -142,5 +142,5 @@ void RegisterAddN(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc
index a155d854..5db2bda4 100644
--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -14,7 +14,6 @@
 
 #include <string>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -37,7 +36,7 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
     for (int i = 0; i < inputs; ++i) {
       BufferToImage<D, T>(&net, MakeString("Input", i).c_str(),
                           MakeString("InputImage", i).c_str(),
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
     }
     OpDefBuilder op_def_builder("AddN", "AddNBM");
     for (int i = 0; i < inputs; ++i) {
diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc
index 5116e36b..865fdd7f 100644
--- a/mace/ops/addn_test.cc
+++ b/mace/ops/addn_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -69,7 +68,7 @@ void SimpleAdd3() {
     for (int i = 0; i < input_num; ++i) {
       BufferToImage<D, half>(&net, MakeString("Input", i),
                              MakeString("InputImage", i),
-                             kernels::BufferType::IN_OUT_CHANNEL);
+                             ops::BufferType::IN_OUT_CHANNEL);
     }
 
     auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
@@ -84,7 +83,7 @@ void SimpleAdd3() {
     net.RunOp(D);
 
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("AddN", "AddNTest")
         .Input("Input0")
@@ -143,7 +142,7 @@ void RandomTest() {
     for (int i = 0; i < input_num; ++i) {
       BufferToImage<D, half>(&net, MakeString("Input", i),
                              MakeString("InputImage", i),
-                             kernels::BufferType::IN_OUT_CHANNEL);
+                             ops::BufferType::IN_OUT_CHANNEL);
     }
 
     auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
@@ -158,7 +157,7 @@ void RandomTest() {
     net.RunOp(D);
 
     ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
                             1e-2);
diff --git a/mace/kernels/argmax.cc b/mace/ops/argmax.cc
similarity index 98%
rename from mace/kernels/argmax.cc
rename to mace/ops/argmax.cc
index 19d52f7f..8f8419b7 100644
--- a/mace/kernels/argmax.cc
+++ b/mace/ops/argmax.cc
@@ -21,7 +21,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class ArgMaxOp : public Operation {
@@ -84,5 +84,5 @@ void RegisterArgMax(OpRegistryBase *op_registry) {
                    DeviceType::CPU, float);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/argmax_test.cc b/mace/ops/argmax_test.cc
index 06de7046..a0001ec3 100644
--- a/mace/ops/argmax_test.cc
+++ b/mace/ops/argmax_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/arm/activation_neon.cc b/mace/ops/arm/activation_neon.cc
similarity index 95%
rename from mace/kernels/arm/activation_neon.cc
rename to mace/ops/arm/activation_neon.cc
index 6067077c..44b492a4 100644
--- a/mace/kernels/arm/activation_neon.cc
+++ b/mace/ops/arm/activation_neon.cc
@@ -17,10 +17,10 @@
 #endif
 
 #include <algorithm>
-#include "mace/kernels/arm/activation_neon.h"
+#include "mace/ops/arm/activation_neon.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void ReluNeon(const float *input, const index_t size, float *output) {
 #if defined(MACE_ENABLE_NEON)
@@ -67,5 +67,5 @@ void ReluxNeon(const float *input, const float limit,
 #endif
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/activation_neon.h b/mace/ops/arm/activation_neon.h
similarity index 82%
rename from mace/kernels/arm/activation_neon.h
rename to mace/ops/arm/activation_neon.h
index 886c95fe..cbd1974f 100644
--- a/mace/kernels/arm/activation_neon.h
+++ b/mace/ops/arm/activation_neon.h
@@ -12,20 +12,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_ARM_ACTIVATION_NEON_H_
-#define MACE_KERNELS_ARM_ACTIVATION_NEON_H_
+#ifndef MACE_OPS_ARM_ACTIVATION_NEON_H_
+#define MACE_OPS_ARM_ACTIVATION_NEON_H_
 
 #include "mace/core/types.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void ReluNeon(const float *input, const index_t size, float *output);
 
 void ReluxNeon(const float *input, const float limit,
                const index_t size, float *output);
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_ARM_ACTIVATION_NEON_H_
+#endif  // MACE_OPS_ARM_ACTIVATION_NEON_H_
diff --git a/mace/kernels/arm/conv_2d_neon.h b/mace/ops/arm/conv_2d_neon.h
similarity index 95%
rename from mace/kernels/arm/conv_2d_neon.h
rename to mace/ops/arm/conv_2d_neon.h
index bf0e1023..bd2307e2 100644
--- a/mace/kernels/arm/conv_2d_neon.h
+++ b/mace/ops/arm/conv_2d_neon.h
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_ARM_CONV_2D_NEON_H_
-#define MACE_KERNELS_ARM_CONV_2D_NEON_H_
+#ifndef MACE_OPS_ARM_CONV_2D_NEON_H_
+#define MACE_OPS_ARM_CONV_2D_NEON_H_
 
 #include "mace/core/types.h"
-#include "mace/kernels/sgemm.h"
+#include "mace/ops/sgemm.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void Conv2dNeonK1x1S1(const float *input,
                       const float *filter,
@@ -115,7 +115,7 @@ inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
   }
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_ARM_CONV_2D_NEON_H_
+#endif  // MACE_OPS_ARM_CONV_2D_NEON_H_
diff --git a/mace/kernels/arm/conv_2d_neon_15x1.cc b/mace/ops/arm/conv_2d_neon_15x1.cc
similarity index 98%
rename from mace/kernels/arm/conv_2d_neon_15x1.cc
rename to mace/ops/arm/conv_2d_neon_15x1.cc
index 0facfc6e..a4bae4e9 100644
--- a/mace/kernels/arm/conv_2d_neon_15x1.cc
+++ b/mace/ops/arm/conv_2d_neon_15x1.cc
@@ -16,11 +16,11 @@
 #include <arm_neon.h>
 #endif
 
-#include "mace/kernels/arm/conv_2d_neon.h"
+#include "mace/ops/arm/conv_2d_neon.h"
 #include "mace/utils/utils.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 inline void Conv2dCPUK15x1Calc(const float *in_ptr,
                                const float *filter_ptr,
@@ -157,5 +157,5 @@ void Conv2dNeonK15x1S1(const float *input,
   }        // b
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/conv_2d_neon_1x1.cc b/mace/ops/arm/conv_2d_neon_1x1.cc
similarity index 94%
rename from mace/kernels/arm/conv_2d_neon_1x1.cc
rename to mace/ops/arm/conv_2d_neon_1x1.cc
index 21554d90..be5c6b53 100644
--- a/mace/kernels/arm/conv_2d_neon_1x1.cc
+++ b/mace/ops/arm/conv_2d_neon_1x1.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/arm/conv_2d_neon.h"
+#include "mace/ops/arm/conv_2d_neon.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void Conv2dNeonK1x1S1(const float *input,
                       const float *filter,
@@ -44,5 +44,5 @@ void Conv2dNeonK1x1S1(const float *input,
   }
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/conv_2d_neon_1x15.cc b/mace/ops/arm/conv_2d_neon_1x15.cc
similarity index 98%
rename from mace/kernels/arm/conv_2d_neon_1x15.cc
rename to mace/ops/arm/conv_2d_neon_1x15.cc
index 6fdc6ed8..06c40e29 100644
--- a/mace/kernels/arm/conv_2d_neon_1x15.cc
+++ b/mace/ops/arm/conv_2d_neon_1x15.cc
@@ -16,12 +16,12 @@
 #include <arm_neon.h>
 #endif
 
-#include "mace/kernels/arm/conv_2d_neon.h"
+#include "mace/ops/arm/conv_2d_neon.h"
 #include "mace/utils/logging.h"
 #include "mace/utils/utils.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 inline void Conv2dCPUK1x15Calc(const float *in_ptr,
                                const float *filter_ptr,
@@ -143,5 +143,5 @@ void Conv2dNeonK1x15S1(const float *input,
   }        // b
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/conv_2d_neon_1x7.cc b/mace/ops/arm/conv_2d_neon_1x7.cc
similarity index 99%
rename from mace/kernels/arm/conv_2d_neon_1x7.cc
rename to mace/ops/arm/conv_2d_neon_1x7.cc
index 8a7b1a41..39321e0f 100644
--- a/mace/kernels/arm/conv_2d_neon_1x7.cc
+++ b/mace/ops/arm/conv_2d_neon_1x7.cc
@@ -16,10 +16,10 @@
 #include <arm_neon.h>
 #endif
 
-#include "mace/kernels/arm/conv_2d_neon.h"
+#include "mace/ops/arm/conv_2d_neon.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK1x7S1(const float *input,
@@ -247,5 +247,5 @@ void Conv2dNeonK1x7S1(const float *input,
   }      // b
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/conv_2d_neon_3x3.cc b/mace/ops/arm/conv_2d_neon_3x3.cc
similarity index 99%
rename from mace/kernels/arm/conv_2d_neon_3x3.cc
rename to mace/ops/arm/conv_2d_neon_3x3.cc
index 94551667..33653a42 100644
--- a/mace/kernels/arm/conv_2d_neon_3x3.cc
+++ b/mace/ops/arm/conv_2d_neon_3x3.cc
@@ -17,10 +17,10 @@
 #endif
 
 #include "mace/core/macros.h"
-#include "mace/kernels/arm/conv_2d_neon.h"
+#include "mace/ops/arm/conv_2d_neon.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 // Ho = 2, Wo = 4, Co = 2
 void Conv2dNeonK3x3S1(const float *input,
@@ -658,5 +658,5 @@ void Conv2dNeonK3x3S2(const float *input,
   }      // b
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/conv_2d_neon_5x5.cc b/mace/ops/arm/conv_2d_neon_5x5.cc
similarity index 99%
rename from mace/kernels/arm/conv_2d_neon_5x5.cc
rename to mace/ops/arm/conv_2d_neon_5x5.cc
index a60bec41..7803a89e 100644
--- a/mace/kernels/arm/conv_2d_neon_5x5.cc
+++ b/mace/ops/arm/conv_2d_neon_5x5.cc
@@ -16,10 +16,10 @@
 #include <arm_neon.h>
 #endif
 
-#include "mace/kernels/arm/conv_2d_neon.h"
+#include "mace/ops/arm/conv_2d_neon.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 #define MACE_Conv2dNeonK5x5SnLoadCalc4                    \
   /* load filter (4 outch x 1 height x 4 width) */        \
@@ -215,5 +215,5 @@ void Conv2dNeonK5x5S1(const float *input,
   }          // b
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/conv_2d_neon_7x1.cc b/mace/ops/arm/conv_2d_neon_7x1.cc
similarity index 99%
rename from mace/kernels/arm/conv_2d_neon_7x1.cc
rename to mace/ops/arm/conv_2d_neon_7x1.cc
index 97d1bec2..37d9ec9d 100644
--- a/mace/kernels/arm/conv_2d_neon_7x1.cc
+++ b/mace/ops/arm/conv_2d_neon_7x1.cc
@@ -16,10 +16,10 @@
 #include <arm_neon.h>
 #endif
 
-#include "mace/kernels/arm/conv_2d_neon.h"
+#include "mace/ops/arm/conv_2d_neon.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 // Ho = 4, Wo = 1, Co = 4
 void Conv2dNeonK7x1S1(const float *input,
@@ -287,5 +287,5 @@ void Conv2dNeonK7x1S1(const float *input,
   }      // b
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/conv_2d_neon_7x7.cc b/mace/ops/arm/conv_2d_neon_7x7.cc
similarity index 99%
rename from mace/kernels/arm/conv_2d_neon_7x7.cc
rename to mace/ops/arm/conv_2d_neon_7x7.cc
index d824f881..4e1c0041 100644
--- a/mace/kernels/arm/conv_2d_neon_7x7.cc
+++ b/mace/ops/arm/conv_2d_neon_7x7.cc
@@ -16,10 +16,10 @@
 #include <arm_neon.h>
 #endif
 
-#include "mace/kernels/arm/conv_2d_neon.h"
+#include "mace/ops/arm/conv_2d_neon.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4        \
   /* load filter (4 outch x 1 height x 4 width) */ \
@@ -638,5 +638,5 @@ void Conv2dNeonK7x7S3(const float *input,
   }          // b
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/conv_winograd.cc b/mace/ops/arm/conv_winograd.cc
similarity index 99%
rename from mace/kernels/arm/conv_winograd.cc
rename to mace/ops/arm/conv_winograd.cc
index d115e4e5..2f6207fd 100644
--- a/mace/kernels/arm/conv_winograd.cc
+++ b/mace/ops/arm/conv_winograd.cc
@@ -14,11 +14,11 @@
 
 #include <algorithm>
 
-#include "mace/kernels/arm/conv_winograd.h"
-#include "mace/kernels/gemm.h"
+#include "mace/ops/arm/conv_winograd.h"
+#include "mace/ops/gemm.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 namespace {
 // NCHW => NTCB (T: in tile pixels, B: tile indices)
@@ -747,5 +747,5 @@ void ConvRef3x3s1(const float *input,
   }
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/conv_winograd.h b/mace/ops/arm/conv_winograd.h
similarity index 92%
rename from mace/kernels/arm/conv_winograd.h
rename to mace/ops/arm/conv_winograd.h
index 7e274b77..5e07db15 100644
--- a/mace/kernels/arm/conv_winograd.h
+++ b/mace/ops/arm/conv_winograd.h
@@ -12,18 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_ARM_CONV_WINOGRAD_H_
-#define MACE_KERNELS_ARM_CONV_WINOGRAD_H_
+#ifndef MACE_OPS_ARM_CONV_WINOGRAD_H_
+#define MACE_OPS_ARM_CONV_WINOGRAD_H_
 
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
 #endif
 
 #include "mace/core/types.h"
-#include "mace/kernels/sgemm.h"
+#include "mace/ops/sgemm.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void TransformFilter4x4(const float *filter,
                         const index_t in_channels,
@@ -70,7 +70,7 @@ void ConvRef3x3s1(const float *input,
                   const index_t out_channels,
                   float *output);
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_ARM_CONV_WINOGRAD_H_
+#endif  // MACE_OPS_ARM_CONV_WINOGRAD_H_
diff --git a/mace/kernels/arm/conv_winograd_test.cc b/mace/ops/arm/conv_winograd_test.cc
similarity index 91%
rename from mace/kernels/arm/conv_winograd_test.cc
rename to mace/ops/arm/conv_winograd_test.cc
index ccb4f118..906dd3d9 100644
--- a/mace/kernels/arm/conv_winograd_test.cc
+++ b/mace/ops/arm/conv_winograd_test.cc
@@ -19,10 +19,10 @@
 
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
-#include "mace/kernels/arm/conv_winograd.h"
+#include "mace/ops/arm/conv_winograd.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 TEST(ConvWinogradTest, winograd) {
   index_t batch = 1;
@@ -62,11 +62,11 @@ TEST(ConvWinogradTest, winograd) {
     return std::max(-1.0f, std::min(1.0f, nd(gen)));
   });
 
-  kernels::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width,
-                        in_channels, out_channels, output_data_ref);
+  ops::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width,
+                    in_channels, out_channels, output_data_ref);
 
   SGemm sgemm;
-  kernels::WinoGradConv3x3s1(input_data, filter_data, batch, in_height,
-                             in_width, in_channels, out_channels, 6,
-                             output_data, &sgemm, nullptr);
+  ops::WinoGradConv3x3s1(input_data, filter_data, batch, in_height,
+                         in_width, in_channels, out_channels, 6,
+                         output_data, &sgemm, nullptr);
 
@@ -76,5 +76,5 @@ TEST(ConvWinogradTest, winograd) {
   }
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/deconv_2d_neon.h b/mace/ops/arm/deconv_2d_neon.h
similarity index 93%
rename from mace/kernels/arm/deconv_2d_neon.h
rename to mace/ops/arm/deconv_2d_neon.h
index 1cddbf1a..d8abe427 100644
--- a/mace/kernels/arm/deconv_2d_neon.h
+++ b/mace/ops/arm/deconv_2d_neon.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_ARM_DECONV_2D_NEON_H_
-#define MACE_KERNELS_ARM_DECONV_2D_NEON_H_
+#ifndef MACE_OPS_ARM_DECONV_2D_NEON_H_
+#define MACE_OPS_ARM_DECONV_2D_NEON_H_
 
 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
@@ -22,7 +22,7 @@
 #include "mace/core/types.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void Deconv2dNeonK3x3S1(const float *input,
                         const float *filter,
@@ -90,7 +90,7 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a,
 }
 #endif
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_ARM_DECONV_2D_NEON_H_
+#endif  // MACE_OPS_ARM_DECONV_2D_NEON_H_
diff --git a/mace/kernels/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc
similarity index 99%
rename from mace/kernels/arm/deconv_2d_neon_3x3.cc
rename to mace/ops/arm/deconv_2d_neon_3x3.cc
index cdba42c0..0495cf93 100644
--- a/mace/kernels/arm/deconv_2d_neon_3x3.cc
+++ b/mace/ops/arm/deconv_2d_neon_3x3.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "mace/core/macros.h"
-#include "mace/kernels/arm/deconv_2d_neon.h"
+#include "mace/ops/arm/deconv_2d_neon.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void Deconv2dNeonK3x3S1(const float *input,
                         const float *filter,
@@ -387,5 +387,5 @@ void Deconv2dNeonK3x3S2(const float *input,
   }
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc
similarity index 99%
rename from mace/kernels/arm/deconv_2d_neon_4x4.cc
rename to mace/ops/arm/deconv_2d_neon_4x4.cc
index 575a8494..bddb56f5 100644
--- a/mace/kernels/arm/deconv_2d_neon_4x4.cc
+++ b/mace/ops/arm/deconv_2d_neon_4x4.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "mace/core/macros.h"
-#include "mace/kernels/arm/deconv_2d_neon.h"
+#include "mace/ops/arm/deconv_2d_neon.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void Deconv2dNeonK4x4S1(const float *input,
                         const float *filter,
@@ -501,5 +501,5 @@ void Deconv2dNeonK4x4S2(const float *input,
   }
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/arm/depthwise_conv2d_neon.h b/mace/ops/arm/depthwise_conv2d_neon.h
similarity index 90%
rename from mace/kernels/arm/depthwise_conv2d_neon.h
rename to mace/ops/arm/depthwise_conv2d_neon.h
index ec3fb360..cd475573 100644
--- a/mace/kernels/arm/depthwise_conv2d_neon.h
+++ b/mace/ops/arm/depthwise_conv2d_neon.h
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_ARM_DEPTHWISE_CONV2D_NEON_H_
-#define MACE_KERNELS_ARM_DEPTHWISE_CONV2D_NEON_H_
+#ifndef MACE_OPS_ARM_DEPTHWISE_CONV2D_NEON_H_
+#define MACE_OPS_ARM_DEPTHWISE_CONV2D_NEON_H_
 
 #include "mace/core/types.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void DepthwiseConv2dNeonK3x3S1(const float *input,
                                const float *filter,
@@ -42,7 +42,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
                                const index_t valid_w_stop,
                                float *output);
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_ARM_DEPTHWISE_CONV2D_NEON_H_
+#endif  // MACE_OPS_ARM_DEPTHWISE_CONV2D_NEON_H_
diff --git a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc
similarity index 99%
rename from mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
rename to mace/ops/arm/depthwise_conv2d_neon_3x3.cc
index 3a4491fb..2e997912 100644
--- a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
+++ b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc
@@ -17,10 +17,10 @@
 #endif
 
 #include "mace/core/macros.h"
-#include "mace/kernels/arm/depthwise_conv2d_neon.h"
+#include "mace/ops/arm/depthwise_conv2d_neon.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 namespace {
 void DepthwiseConv2dPixel(const float *in_base,
@@ -381,5 +381,5 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
   }    // b
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/batch_norm.cc b/mace/ops/batch_norm.cc
similarity index 96%
rename from mace/kernels/batch_norm.cc
rename to mace/ops/batch_norm.cc
index b07f2f43..07c00189 100644
--- a/mace/kernels/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -17,13 +17,13 @@
 #include <vector>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/activation.h"
+#include "mace/ops/activation.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/batch_norm.h"
+#include "mace/ops/opencl/image/batch_norm.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class BatchNormOp;
@@ -35,7 +35,7 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
       : Operation(context),
         epsilon_(Operation::GetOptionalArg<float>("epsilon",
                                                  static_cast<float>(1e-4))),
-        activation_(kernels::StringToActivationType(
+        activation_(ops::StringToActivationType(
             Operation::GetOptionalArg<std::string>("activation", "NOOP"))),
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
 
@@ -144,7 +144,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
       : Operation(context) {
     float epsilon = Operation::GetOptionalArg<float>(
         "epsilon", static_cast<float>(1e-4));
-    ActivationType activation = kernels::StringToActivationType(
+    ActivationType activation = ops::StringToActivationType(
         Operation::GetOptionalArg<std::string>("activation", "NOOP"));
     float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
     if (context->device()->opencl_runtime()->UseImageMemory()) {
@@ -205,5 +205,5 @@ void RegisterBatchNorm(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index c390860e..814b631e 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
@@ -54,15 +53,15 @@ void BatchNorm(
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Scale", "ScaleImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
     BufferToImage<D, float>(&net, "Offset", "OffsetImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
     BufferToImage<D, float>(&net, "Mean", "MeanImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
     BufferToImage<D, float>(&net, "Var", "VarImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
     OpDefBuilder("BatchNorm", "BatchNormBM")
         .Input("InputImage")
         .Input("ScaleImage")
diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc
index 3c22d5ff..214fd507 100644
--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -51,15 +50,15 @@ void Simple() {
     net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Scale", "ScaleImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
     BufferToImage<D, float>(&net, "Offset", "OffsetImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
     BufferToImage<D, float>(&net, "Mean", "MeanImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
     BufferToImage<D, float>(&net, "Var", "VarImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
 
     OpDefBuilder("BatchNorm", "BatchNormTest")
         .Input("InputImage")
@@ -75,7 +74,7 @@ void Simple() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   }
 
   // Check
@@ -135,15 +134,15 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
 
   OpDefBuilder("BatchNorm", "BatchNormTest")
       .Input("InputImage")
@@ -165,7 +164,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
   net.Sync();
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                           1e-5, 1e-4);
 }
@@ -214,15 +213,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
-                                       kernels::BufferType::IN_OUT_CHANNEL);
+                                       ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
 
   OpDefBuilder("BatchNorm", "BatchNormTest")
       .Input("InputImage")
@@ -245,7 +244,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
   net.Sync();
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                           1e-1, 1e-2);
 }
@@ -294,15 +293,15 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
 
   OpDefBuilder("BatchNorm", "BatchNormTest")
       .Input("InputImage")
@@ -324,7 +323,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
   net.Sync();
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                           1e-5, 1e-4);
 }
@@ -373,15 +372,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
-                                       kernels::BufferType::IN_OUT_CHANNEL);
+                                       ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
 
   OpDefBuilder("BatchNorm", "BatchNormTest")
       .Input("InputImage")
@@ -404,7 +403,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
   net.Sync();
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                           1e-1, 1e-2);
 }
diff --git a/mace/kernels/batch_to_space.cc b/mace/ops/batch_to_space.cc
similarity index 99%
rename from mace/kernels/batch_to_space.cc
rename to mace/ops/batch_to_space.cc
index 5df98aef..529a900b 100644
--- a/mace/kernels/batch_to_space.cc
+++ b/mace/ops/batch_to_space.cc
@@ -17,11 +17,11 @@
 
 #include "mace/core/operator.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/batch_to_space.h"
+#include "mace/ops/opencl/image/batch_to_space.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 class BatchToSpaceOpBase : public Operation {
  public:
@@ -303,5 +303,5 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc
index 4cf55b33..7ea19f6b 100644
--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -42,7 +41,7 @@ void BMBatchToSpace(
         .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
         .Input("InputImage")
         .Output("OutputImage")
diff --git a/mace/kernels/bias_add.cc b/mace/ops/bias_add.cc
similarity index 97%
rename from mace/kernels/bias_add.cc
rename to mace/ops/bias_add.cc
index fc8b7374..9b528fa9 100644
--- a/mace/kernels/bias_add.cc
+++ b/mace/ops/bias_add.cc
@@ -17,13 +17,13 @@
 #include <vector>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/activation.h"
+#include "mace/ops/activation.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/bias_add.h"
+#include "mace/ops/opencl/image/bias_add.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class BiasAddOp;
@@ -139,5 +139,5 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc
index 5908caa2..dce361e9 100644
--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
@@ -47,9 +46,9 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
     OpDefBuilder("BiasAdd", "BiasAddBM")
         .Input("InputImage")
         .Input("BiasImage")
diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc
index 771065c2..ba31ccec 100644
--- a/mace/ops/bias_add_test.cc
+++ b/mace/ops/bias_add_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -46,9 +45,9 @@ void BiasAddSimple() {
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Bias", "BiasImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
 
     OpDefBuilder("BiasAdd", "BiasAddTest")
         .Input("InputImage")
@@ -60,7 +59,7 @@ void BiasAddSimple() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     MACE_NOT_IMPLEMENTED;
   }
@@ -116,9 +115,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
 
   OpDefBuilder("BiasAdd", "BiasAddTest")
       .Input("InputImage")
@@ -131,7 +130,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
   net.Sync();
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
 }
 
@@ -172,9 +171,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
 
   OpDefBuilder("BiasAdd", "BiasAddTest")
       .Input("InputImage")
@@ -187,7 +186,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
   net.Sync();
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
 }
 
diff --git a/mace/kernels/buffer_inverse_transform.cc b/mace/ops/buffer_inverse_transform.cc
similarity index 85%
rename from mace/kernels/buffer_inverse_transform.cc
rename to mace/ops/buffer_inverse_transform.cc
index b447334c..8cfd72b5 100644
--- a/mace/kernels/buffer_inverse_transform.cc
+++ b/mace/ops/buffer_inverse_transform.cc
@@ -15,11 +15,11 @@
 #include <memory>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h"
-#include "mace/kernels/opencl/image/image_to_buffer.h"
+#include "mace/ops/opencl/buffer/buffer_inverse_transform.h"
+#include "mace/ops/opencl/image/image_to_buffer.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class BufferInverseTransformOp;
@@ -41,9 +41,9 @@ class BufferInverseTransformOp<DeviceType::GPU, T> : public Operation {
     const Tensor *input = this->Input(0);
     Tensor *output = this->Output(0);
 
-    kernels::BufferType type =
-        static_cast<kernels::BufferType>(Operation::GetOptionalArg<int>(
-            "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
+    ops::BufferType type =
+        static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
+            "buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
 
     return kernel_->Compute(context, input, type,
                             wino_blk_size_, output);
@@ -63,5 +63,5 @@ void RegisterBufferInverseTransform(OpRegistryBase *op_registry) {
                    BufferInverseTransformOp, DeviceType::GPU, half);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc
index fb1cf51c..825ba105 100644
--- a/mace/ops/buffer_to_image_benchmark.cc
+++ b/mace/ops/buffer_to_image_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc
index 040e666e..fcf7e370 100644
--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -54,103 +54,103 @@ void TestBidirectionTransform(const int type,
 }  // namespace
 
 TEST(BufferToImageTest, ArgSmall) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {1});
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {1});
 }
 
 TEST(BufferToImageTest, ArgHalfSmall) {
-  TestBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT, {11});
+  TestBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, {11});
 }
 
 TEST(BufferToImageTest, ArgMedium) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {11});
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {11});
 }
 
 TEST(BufferToImageTest, ArgLarge) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {256});
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {256});
 }
 
 TEST(BufferToImageTest, InputSmallSingleChannel) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
                                                    {1, 2, 3, 1});
 }
 
 TEST(BufferToImageTest, InputSmallMultipleChannel) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
                                                    {1, 2, 3, 3});
 }
 
 TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
                                                    {3, 2, 3, 3});
 }
 
 TEST(BufferToImageTest, InputMedium) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
                                                    {3, 13, 17, 128});
 }
 
 TEST(BufferToImageTest, InputLarge) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
                                                    {3, 64, 64, 256});
 }
 
 TEST(BufferToImageTest, Filter1x1Small) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
                                                    {5, 3, 1, 1});
 }
 
 TEST(BufferToImageTest, Filter1x1Medium) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
                                                    {13, 17, 1, 1});
 }
 
 TEST(BufferToImageTest, Filter1x1Large) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
                                                    {512, 128, 1, 1});
 }
 
 TEST(BufferToImageTest, Filter3x3Small) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
                                                    {3, 5, 3, 3});
 }
 
 TEST(BufferToImageTest, Filter3x3Medium) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
                                                    {17, 13, 3, 3});
 }
 
 TEST(BufferToImageTest, Filter3x3Large) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
                                                    {256, 128, 3, 3});
 }
 
 TEST(BufferToImageTest, WeightWidthSmall) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_WIDTH,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
                                                    {1, 3, 3, 3});
 }
 
 TEST(BufferToImageTest, WeightWidthMedium) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_WIDTH,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
                                                    {11, 13, 13, 17});
 }
 
 TEST(BufferToImageTest, WeightWidthLarge) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_WIDTH,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
                                                    {64, 64, 11, 13});
 }
 
 TEST(BufferToImageTest, WeightHeightSmall) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_HEIGHT,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
                                                    {2, 1, 1, 1});
 }
 
 TEST(BufferToImageTest, WeightHeightMedium) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_HEIGHT,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
                                                    {11, 13, 13, 17});
 }
 
 TEST(BufferToImageTest, WeightHeightLarge) {
-  TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_HEIGHT,
+  TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
                                                    {64, 16, 11, 13});
 }
 
@@ -188,7 +188,7 @@ void TestDiffTypeBidirectionTransform(const int type,
 }  // namespace
 
 TEST(BufferToImageTest, ArgFloatToHalfSmall) {
-  TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
+  TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT,
                                                           {11});
 }
 
@@ -233,7 +233,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
   const unsigned char input_data[] = {
       0xCD, 0x3C, 0x33, 0x40,
   };
-  TestStringHalfBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
+  TestStringHalfBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT,
                                                             {2}, input_data);
 }
 
diff --git a/mace/kernels/buffer_transform.cc b/mace/ops/buffer_transform.cc
similarity index 84%
rename from mace/kernels/buffer_transform.cc
rename to mace/ops/buffer_transform.cc
index 2b14698c..cb127880 100644
--- a/mace/kernels/buffer_transform.cc
+++ b/mace/ops/buffer_transform.cc
@@ -15,11 +15,11 @@
 #include <memory>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/opencl/buffer/buffer_transform.h"
-#include "mace/kernels/opencl/image/buffer_to_image.h"
+#include "mace/ops/opencl/buffer/buffer_transform.h"
+#include "mace/ops/opencl/image/buffer_to_image.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class BufferTransformOp;
@@ -41,9 +41,9 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
     const Tensor *input = this->Input(0);
     Tensor *output = this->Output(0);
 
-    kernels::BufferType type =
-        static_cast<kernels::BufferType>(Operation::GetOptionalArg<int>(
-            "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
+    ops::BufferType type =
+        static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
+            "buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
 
     return kernel_->Compute(context, input, type,
                             wino_blk_size_, output);
@@ -63,5 +63,5 @@ void RegisterBufferTransform(OpRegistryBase *op_registry) {
                    BufferTransformOp, DeviceType::GPU, half);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/buffer_transform_test.cc b/mace/ops/buffer_transform_test.cc
index aff6855f..c768d671 100644
--- a/mace/ops/buffer_transform_test.cc
+++ b/mace/ops/buffer_transform_test.cc
@@ -69,12 +69,12 @@ void TestBidirectionTransform(const int type,
 }  // namespace
 
 TEST_F(BufferTransformTest, FloatToHalf) {
-  TestBidirectionTransform<float, half>(kernels::BufferType::IN_OUT_CHANNEL,
+  TestBidirectionTransform<float, half>(ops::BufferType::IN_OUT_CHANNEL,
                                         {1, 2, 3, 4});
 }
 
 TEST_F(BufferTransformTest, HalfToHalf) {
-  TestBidirectionTransform<half, half>(kernels::BufferType::IN_OUT_CHANNEL,
+  TestBidirectionTransform<half, half>(ops::BufferType::IN_OUT_CHANNEL,
                                        {1, 2, 3, 4});
 }
 
@@ -85,7 +85,7 @@ void TestArgumentTransform(const index_t input_size) {
   OpDefBuilder("BufferTransform", "BufferTransformTest")
       .Input("Input")
       .Output("Output")
-      .AddIntArg("buffer_type", kernels::BufferType::ARGUMENT)
+      .AddIntArg("buffer_type", ops::BufferType::ARGUMENT)
       .AddIntArg("T", DataTypeToEnum<T>::value)
       .Finalize(net.NewOperatorDef());
 
diff --git a/mace/kernels/cast.cc b/mace/ops/cast.cc
similarity index 97%
rename from mace/kernels/cast.cc
rename to mace/ops/cast.cc
index 0bd971e1..f215d80f 100644
--- a/mace/kernels/cast.cc
+++ b/mace/ops/cast.cc
@@ -15,7 +15,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename SrcType>
 class CastOp : public Operation {
@@ -57,5 +57,5 @@ void RegisterCast(OpRegistryBase *op_registry) {
                    DeviceType::CPU, int32_t);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/cast_test.cc b/mace/ops/cast_test.cc
index a0064993..666bc04e 100644
--- a/mace/ops/cast_test.cc
+++ b/mace/ops/cast_test.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "gmock/gmock.h"
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/channel_shuffle.cc b/mace/ops/channel_shuffle.cc
similarity index 97%
rename from mace/kernels/channel_shuffle.cc
rename to mace/ops/channel_shuffle.cc
index 8258ea1c..78e6f7ad 100644
--- a/mace/kernels/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -16,11 +16,11 @@
 
 #include "mace/core/operator.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/channel_shuffle.h"
+#include "mace/ops/opencl/image/channel_shuffle.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class ChannelShuffleOp;
@@ -115,5 +115,5 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc
index d45216eb..6707a5c9 100644
--- a/mace/ops/channel_shuffle_benchmark.cc
+++ b/mace/ops/channel_shuffle_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -44,7 +43,7 @@ void ChannelShuffle(
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
         .Input("InputImage")
diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc
index 1ce0cea1..ca301a1f 100644
--- a/mace/ops/channel_shuffle_test.cc
+++ b/mace/ops/channel_shuffle_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -61,7 +60,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
       {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
   BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
 
   OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
       .Input("InputImage")
@@ -74,7 +73,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
 
   // Transfer output
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
 
   // Check
   auto expected = net.CreateTensor<float>(
diff --git a/mace/kernels/concat.cc b/mace/ops/concat.cc
similarity index 97%
rename from mace/kernels/concat.cc
rename to mace/ops/concat.cc
index de501192..0cebac68 100644
--- a/mace/kernels/concat.cc
+++ b/mace/ops/concat.cc
@@ -18,11 +18,11 @@
 #include "mace/utils/quantize.h"
 
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/concat.h"
+#include "mace/ops/opencl/image/concat.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 class ConcatOpBase : public Operation {
  public:
@@ -206,6 +206,9 @@ void RegisterConcat(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
                    DeviceType::CPU, float);
 
+  MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
+                   DeviceType::CPU, int32_t);
+
   MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
                    DeviceType::CPU, uint8_t);
 
@@ -218,5 +221,5 @@ void RegisterConcat(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc
index 486d9b6e..02411591 100644
--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -90,9 +89,9 @@ void OpenclConcatHelper(int iters,
   net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
 
   BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
-                                       kernels::BufferType::IN_OUT_CHANNEL);
+                                       ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
-                                       kernels::BufferType::IN_OUT_CHANNEL);
+                                       ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Concat", "ConcatBM")
       .Input("InputImage0")
       .Input("InputImage1")
diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc
index 431e7a2d..83307e78 100644
--- a/mace/ops/concat_test.cc
+++ b/mace/ops/concat_test.cc
@@ -262,7 +262,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
     net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i],
                                                   inputs[i]);
     BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
-                                      kernels::BufferType::IN_OUT_CHANNEL);
+                                      ops::BufferType::IN_OUT_CHANNEL);
   }
 
   auto builder = OpDefBuilder("Concat", "ConcatTest");
@@ -279,7 +279,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
   net.RunOp(DeviceType::GPU);
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
 
   // Check
   auto output = net.GetOutput("Output");
diff --git a/mace/kernels/conv_2d.cc b/mace/ops/conv_2d.cc
similarity index 98%
rename from mace/kernels/conv_2d.cc
rename to mace/ops/conv_2d.cc
index c6edbff6..74234b5e 100644
--- a/mace/kernels/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -26,20 +26,20 @@
 #include "mace/core/future.h"
 #include "mace/core/operator.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/arm/conv_2d_neon.h"
-#include "mace/kernels/arm/conv_winograd.h"
-#include "mace/kernels/conv_pool_2d_base.h"
-#include "mace/kernels/conv_pool_2d_util.h"
-#include "mace/kernels/gemmlowp_util.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/arm/conv_2d_neon.h"
+#include "mace/ops/arm/conv_winograd.h"
+#include "mace/ops/conv_pool_2d_base.h"
+#include "mace/ops/conv_pool_2d_util.h"
+#include "mace/ops/gemmlowp_util.h"
 #include "mace/utils/utils.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/conv_2d.h"
-#include "mace/kernels/opencl/buffer/conv_2d.h"
+#include "mace/ops/opencl/image/conv_2d.h"
+#include "mace/ops/opencl/buffer/conv_2d.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class Conv2dOp;
@@ -49,7 +49,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
  public:
   explicit Conv2dOp(OpConstructContext *context)
       : ConvPool2dOpBase(context),
-        activation_(kernels::StringToActivationType(
+        activation_(ops::StringToActivationType(
             Operation::GetOptionalArg<std::string>("activation",
                                                   "NOOP"))),
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
@@ -712,7 +712,7 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
  public:
   explicit Conv2dOp(OpConstructContext *context)
       : ConvPool2dOpBase(context),
-        activation_(kernels::StringToActivationType(
+        activation_(ops::StringToActivationType(
             Operation::GetOptionalArg<std::string>("activation",
                                                   "NOOP"))),
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
@@ -950,7 +950,7 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
  public:
   explicit Conv2dOp(OpConstructContext *context)
       : ConvPool2dOpBase(context),
-        activation_(kernels::StringToActivationType(
+        activation_(ops::StringToActivationType(
             Operation::GetOptionalArg<std::string>("activation",
                                                   "NOOP"))),
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {
@@ -999,5 +999,5 @@ void RegisterConv2D(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 76e3696d..96be2902 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -14,9 +14,8 @@
 
 #include <algorithm>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -66,11 +65,11 @@ void Conv2d(int iters,
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 28037011..db7f0458 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -15,7 +15,7 @@
 #include <fstream>
 #include <vector>
 
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -61,11 +61,11 @@ void TestNHWCSimple3x3VALID() {
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -81,7 +81,7 @@ void TestNHWCSimple3x3VALID() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
   } else {
     MACE_NOT_IMPLEMENTED;
@@ -127,11 +127,11 @@ void TestNHWCSimple3x3SAME() {
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
   } else {
     MACE_NOT_IMPLEMENTED;
@@ -213,9 +213,9 @@ void TestNHWCSimple3x3WithoutBias() {
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -230,7 +230,7 @@ void TestNHWCSimple3x3WithoutBias() {
     net.RunOp(D);
     // Transfer output
     ImageToBuffer<D, T>(&net, "OutputImage", "Output",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
   } else {
     MACE_NOT_IMPLEMENTED;
   }
@@ -287,11 +287,11 @@ void TestNHWCCombined3x3() {
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("InputImage")
@@ -307,7 +307,7 @@ void TestNHWCCombined3x3() {
     net.RunOp(D);
 
     ImageToBuffer<D, T>(&net, "OutputImage", "Output",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
   } else {
     MACE_NOT_IMPLEMENTED;
   }
@@ -362,11 +362,11 @@ void TestFusedNHWCSimple3x3VALID() {
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -383,7 +383,7 @@ void TestFusedNHWCSimple3x3VALID() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
   } else {
     MACE_NOT_IMPLEMENTED;
@@ -425,9 +425,9 @@ void TestFusedNHWCSimple3x3WithoutBias() {
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
 
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("InputImage")
@@ -443,7 +443,7 @@ void TestFusedNHWCSimple3x3WithoutBias() {
     net.RunOp(D);
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     MACE_NOT_IMPLEMENTED;
   }
@@ -505,11 +505,11 @@ void TestConv1x1() {
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Filter", "FilterImage",
-                            kernels::BufferType::CONV2D_FILTER);
+                            ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, float>(&net, "Bias", "BiasImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("InputImage")
@@ -524,7 +524,7 @@ void TestConv1x1() {
     net.RunOp(D);
 
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     MACE_NOT_IMPLEMENTED;
   }
@@ -596,11 +596,11 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
 
     // run on gpu
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -616,7 +616,7 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
     net.RunOp(D);
 
     ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
                             1e-4);
   };
@@ -705,11 +705,11 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
 
     // run on gpu
     BufferToImage<D, half>(&net, "Input", "InputImage",
-                           kernels::BufferType::IN_OUT_CHANNEL);
+                           ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, half>(&net, "Filter", "FilterImage",
-                           kernels::BufferType::CONV2D_FILTER);
+                           ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, half>(&net, "Bias", "BiasImage",
-                           kernels::BufferType::ARGUMENT);
+                           ops::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -725,7 +725,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
     net.RunOp(D);
 
     ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
                             1e-2);
@@ -857,11 +857,11 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
 
     // run on gpu
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -877,7 +877,7 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
     net.RunOp(D);
 
     ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
                             1e-4);
   };
@@ -954,11 +954,11 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
 
     // run on gpu
     BufferToImage<D, half>(&net, "Input", "InputImage",
-                           kernels::BufferType::IN_OUT_CHANNEL);
+                           ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, half>(&net, "Filter", "FilterImage",
-                           kernels::BufferType::CONV2D_FILTER);
+                           ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, half>(&net, "Bias", "BiasImage",
-                           kernels::BufferType::ARGUMENT);
+                           ops::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -974,7 +974,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
     net.RunOp(D);
 
     ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
                             1e-1);
   };
@@ -1041,11 +1041,11 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
 
     // run on gpu
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -1060,7 +1060,7 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
     net.RunOp(D);
 
     ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
                             1e-4);
   };
diff --git a/mace/kernels/conv_pool_2d_base.h b/mace/ops/conv_pool_2d_base.h
similarity index 85%
rename from mace/kernels/conv_pool_2d_base.h
rename to mace/ops/conv_pool_2d_base.h
index d1e59c61..2d886faa 100644
--- a/mace/kernels/conv_pool_2d_base.h
+++ b/mace/ops/conv_pool_2d_base.h
@@ -12,16 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_CONV_POOL_2D_BASE_H_
-#define MACE_KERNELS_CONV_POOL_2D_BASE_H_
+#ifndef MACE_OPS_CONV_POOL_2D_BASE_H_
+#define MACE_OPS_CONV_POOL_2D_BASE_H_
 
 #include <vector>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 class ConvPool2dOpBase : public Operation {
  public:
@@ -40,7 +40,7 @@ class ConvPool2dOpBase : public Operation {
   std::vector<int> dilations_;
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_CONV_POOL_2D_BASE_H_
+#endif  // MACE_OPS_CONV_POOL_2D_BASE_H_
diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/ops/conv_pool_2d_util.cc
similarity index 99%
rename from mace/kernels/conv_pool_2d_util.cc
rename to mace/ops/conv_pool_2d_util.cc
index c4669f4c..6ec025b9 100644
--- a/mace/kernels/conv_pool_2d_util.cc
+++ b/mace/ops/conv_pool_2d_util.cc
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 
 #include <algorithm>
 #include <cmath>
 #include <vector>
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void CalcPaddingAndOutputSize(const index_t *input_shape,
                               const DataFormat input_format,
@@ -463,5 +463,5 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
   return MaceStatus::MACE_SUCCESS;
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/conv_pool_2d_util.h b/mace/ops/conv_pool_2d_util.h
similarity index 96%
rename from mace/kernels/conv_pool_2d_util.h
rename to mace/ops/conv_pool_2d_util.h
index e735a97d..0e45c31e 100644
--- a/mace/kernels/conv_pool_2d_util.h
+++ b/mace/ops/conv_pool_2d_util.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_CONV_POOL_2D_UTIL_H_
-#define MACE_KERNELS_CONV_POOL_2D_UTIL_H_
+#ifndef MACE_OPS_CONV_POOL_2D_UTIL_H_
+#define MACE_OPS_CONV_POOL_2D_UTIL_H_
 
 #include "mace/core/tensor.h"
 
@@ -30,7 +30,7 @@ enum RoundType {
   CEIL = 1,
 };
 
-namespace kernels {
+namespace ops {
 
 void CalcPaddingAndOutputSize(const index_t *input_shape,
                               const DataFormat input_format,
@@ -113,7 +113,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input,
                                    Tensor *output_tensor,
                                    bool padding_same_value = false);
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_CONV_POOL_2D_UTIL_H_
+#endif  // MACE_OPS_CONV_POOL_2D_UTIL_H_
diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc
index 5afd621f..3e3185b3 100644
--- a/mace/ops/core_test.cc
+++ b/mace/ops/core_test.cc
@@ -29,7 +29,7 @@ TEST(CoreTest, INIT_MODE) {
   OpDefBuilder("BufferTransform", "BufferTransformTest")
       .Input("Input")
       .Output("B2IOutput")
-      .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
+      .AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
       .AddIntArg("mode", static_cast<int>(NetMode::INIT))
       .Finalize(&op_defs[op_defs.size() - 1]);
 
@@ -46,18 +46,16 @@ TEST(CoreTest, INIT_MODE) {
   OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
       .Input("B2IOutput")
       .Output("Output")
-      .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
+      .AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
       .Finalize(&op_defs[op_defs.size() - 1]);
 
   NetDef net_def;
   for (auto &op_def : op_defs) {
     net_def.add_op()->CopyFrom(op_def);
-    net_def.add_op_types(op_def.type());
   }
-  std::shared_ptr<OpDefRegistryBase> op_def_registry(new OpDefRegistry());
-  std::shared_ptr<OpRegistryBase> op_registry(new OpRegistry());
+  std::shared_ptr<OpRegistry> op_registry(new OpRegistry());
   auto net = std::unique_ptr<NetBase>(new SerialNet(
-      op_def_registry.get(), op_registry.get(), &net_def, &ws, device,
+      op_registry.get(), &net_def, &ws, device,
       NetMode::INIT));
   MaceStatus status = net->Init();
   MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
@@ -67,7 +65,7 @@ TEST(CoreTest, INIT_MODE) {
   EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
   EXPECT_TRUE(ws.GetTensor("Output") == nullptr);
   net = std::unique_ptr<NetBase>(new SerialNet(
-      op_def_registry.get(), op_registry.get(), &net_def, &ws, device));
+      op_registry.get(), &net_def, &ws, device));
   status = net->Init();
   MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
   status = net->Run();
diff --git a/mace/kernels/crop.cc b/mace/ops/crop.cc
similarity index 98%
rename from mace/kernels/crop.cc
rename to mace/ops/crop.cc
index 6b1ffa6a..b056f21c 100644
--- a/mace/kernels/crop.cc
+++ b/mace/ops/crop.cc
@@ -16,11 +16,11 @@
 
 #include "mace/core/operator.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/crop.h"
+#include "mace/ops/opencl/image/crop.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class CropOp : public Operation {
@@ -143,5 +143,5 @@ void RegisterCrop(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc
index 75cd494f..b186cecc 100644
--- a/mace/ops/crop_benchmark.cc
+++ b/mace/ops/crop_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -81,9 +80,9 @@ void OpenclCropHelper(int iters,
   net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
 
   BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
-                                       kernels::BufferType::IN_OUT_CHANNEL);
+                                       ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
-                                       kernels::BufferType::IN_OUT_CHANNEL);
+                                       ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Crop", "CropBM")
       .Input("InputImage0")
       .Input("InputImage1")
diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc
index 67a2fdeb..efada981 100644
--- a/mace/ops/crop_test.cc
+++ b/mace/ops/crop_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -36,9 +35,9 @@ void RunCrop(const std::vector<index_t> &input_shape,
 
   if (D == GPU) {
     BufferToImage<D, float>(&net, "Input0", "InputImage0",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Input1", "InputImage1",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("Crop", "CropTest")
         .Input("InputImage0")
         .Input("InputImage1")
@@ -69,7 +68,7 @@ void RunCrop(const std::vector<index_t> &input_shape,
 
   if (D == GPU) {
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else if (D == CPU) {
     net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
                                                     "Output", NHWC);
diff --git a/mace/kernels/deconv_2d.cc b/mace/ops/deconv_2d.cc
similarity index 97%
rename from mace/kernels/deconv_2d.cc
rename to mace/ops/deconv_2d.cc
index 44c0c119..0bfa8200 100644
--- a/mace/kernels/deconv_2d.cc
+++ b/mace/ops/deconv_2d.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/deconv_2d.h"
+#include "mace/ops/deconv_2d.h"
 
 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
@@ -27,16 +27,16 @@
 #include "mace/core/future.h"
 #include "mace/core/operator.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/arm/deconv_2d_neon.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/arm/deconv_2d_neon.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/utils/utils.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/deconv_2d.h"
+#include "mace/ops/opencl/image/deconv_2d.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 class Deconv2dOpBase : public Operation {
  public:
@@ -46,9 +46,9 @@ class Deconv2dOpBase : public Operation {
         padding_type_(static_cast<Padding>(Operation::GetOptionalArg<int>(
             "padding", static_cast<int>(SAME)))),
         paddings_(Operation::GetRepeatedArgs<int>("padding_values")),
-        model_type_(static_cast<kernels::FrameworkType>(
+        model_type_(static_cast<ops::FrameworkType>(
                         Operation::GetOptionalArg<int>("framework_type", 0))),
-        activation_(kernels::StringToActivationType(
+        activation_(ops::StringToActivationType(
             Operation::GetOptionalArg<std::string>("activation",
                                                   "NOOP"))),
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
@@ -180,7 +180,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
     const Tensor *filter = this->Input(1);
     const Tensor *bias = nullptr;
     const Tensor *output_shape_tensor = nullptr;
-    if (model_type_ == kernels::CAFFE) {
+    if (model_type_ == ops::CAFFE) {
       bias = this->InputSize() >= 3 ? this->Input(2) : nullptr;
     } else {
       output_shape_tensor =
@@ -491,7 +491,7 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
     const Tensor *filter = this->Input(1);
     const Tensor *bias = nullptr;
     const Tensor *output_shape_tensor = nullptr;
-    if (model_type_ == kernels::CAFFE) {
+    if (model_type_ == ops::CAFFE) {
       bias = this->InputSize() >= 3 ? this->Input(2) : nullptr;
     } else {
       output_shape_tensor =
@@ -557,5 +557,5 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/deconv_2d.h b/mace/ops/deconv_2d.h
similarity index 82%
rename from mace/kernels/deconv_2d.h
rename to mace/ops/deconv_2d.h
index 25413d98..35dcee8b 100644
--- a/mace/kernels/deconv_2d.h
+++ b/mace/ops/deconv_2d.h
@@ -12,18 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_DECONV_2D_H_
-#define MACE_KERNELS_DECONV_2D_H_
+#ifndef MACE_OPS_DECONV_2D_H_
+#define MACE_OPS_DECONV_2D_H_
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 enum FrameworkType {
   TENSORFLOW = 0,
   CAFFE = 1,
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_DECONV_2D_H_
+#endif  // MACE_OPS_DECONV_2D_H_
diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc
index 197e8f73..175feaca 100644
--- a/mace/ops/deconv_2d_benchmark.cc
+++ b/mace/ops/deconv_2d_benchmark.cc
@@ -14,9 +14,8 @@
 
 #include <algorithm>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -54,11 +53,11 @@ static void Deconv2d(int iters,
                                     {batch, out_h, out_w, output_channels});
   if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
     OpDefBuilder("Deconv2D", "Deconv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc
index 88476414..a33b2f7b 100644
--- a/mace/ops/deconv_2d_test.cc
+++ b/mace/ops/deconv_2d_test.cc
@@ -15,8 +15,8 @@
 #include <fstream>
 #include <vector>
 
-#include "mace/kernels/deconv_2d.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/deconv_2d.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -38,7 +38,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
                    const std::vector<float> &filter_data,
                    const std::vector<index_t> &expected_shape,
                    const std::vector<float> &expected_data,
-                   kernels::FrameworkType model_type) {
+                   ops::FrameworkType model_type) {
   OpsTestNet net;
   // Add input data
   const index_t batch = input_shape[0];
@@ -50,12 +50,12 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
   net.TransformDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Bias", "BiasImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
     BufferToImage<D, float>(&net, "FilterOIHW", "FilterImage",
-                            kernels::BufferType::CONV2D_FILTER);
-    if (model_type == kernels::FrameworkType::CAFFE) {
+                            ops::BufferType::CONV2D_FILTER);
+    if (model_type == ops::FrameworkType::CAFFE) {
       OpDefBuilder("Deconv2D", "Deconv2dTest")
           .Input("InputImage")
           .Input("FilterImage")
@@ -85,12 +85,12 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
                                                     NCHW);
 
-    if (model_type == kernels::FrameworkType::CAFFE) {
+    if (model_type == ops::FrameworkType::CAFFE) {
       OpDefBuilder("Deconv2D", "Deconv2dTest")
           .Input("InputNCHW")
           .Input("FilterOIHW")
@@ -138,7 +138,7 @@ void TestNHWCSimple3x3SAME_S1() {
                    {4.5, 4.6, 4.7, 6.5, 6.6, 6.7, 4.5, 4.6, 4.7,
                     6.5, 6.6, 6.7, 9.5, 9.6, 9.7, 6.5, 6.6, 6.7,
                     4.5, 4.6, 4.7, 6.5, 6.6, 6.7, 4.5, 4.6, 4.7},
-                   kernels::FrameworkType::TENSORFLOW);
+                   ops::FrameworkType::TENSORFLOW);
   RunTestSimple<D>({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, {0, 0, 0},
                    1, Padding::VALID, {2, 2},
                    {0}, {3, 3, 3, 1},
@@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME_S1() {
                    {1, 3, 3, 3},
                    {4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 6, 6, 9, 9,
                     9, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 4, 4},
-                   kernels::FrameworkType::CAFFE);
+                   ops::FrameworkType::CAFFE);
   RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0},
                    1, Padding::SAME, {},
                    {1, 3, 3, 3}, {3, 3, 3, 1},
@@ -157,7 +157,7 @@ void TestNHWCSimple3x3SAME_S1() {
                    {54,  66,  78,  126, 147, 168, 130, 146, 162,
                     198, 225, 252, 405, 450, 495, 366, 399, 432,
                     354, 378, 402, 630, 669, 708, 502, 530, 558},
-                   kernels::FrameworkType::TENSORFLOW);
+                   ops::FrameworkType::TENSORFLOW);
   RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0},
                    1, Padding::SAME, {2, 2},
                    {0}, {3, 3, 3, 1},
@@ -167,7 +167,7 @@ void TestNHWCSimple3x3SAME_S1() {
                    {54,  66,  78,  126, 147, 168, 130, 146, 162,
                     198, 225, 252, 405, 450, 495, 366, 399, 432,
                     354, 378, 402, 630, 669, 708, 502, 530, 558},
-                   kernels::FrameworkType::CAFFE);
+                   ops::FrameworkType::CAFFE);
 }
 
 template <DeviceType D>
@@ -185,7 +185,7 @@ void TestNHWCSimple3x3SAME_S2() {
                     1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1,
                     2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2,
                     1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1},
-                   kernels::FrameworkType::TENSORFLOW);
+                   ops::FrameworkType::TENSORFLOW);
   RunTestSimple<D>({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, {0, 0, 0},
                    2, Padding::SAME, {2, 2},
                    {0}, {3, 3, 3, 1},
@@ -198,7 +198,7 @@ void TestNHWCSimple3x3SAME_S2() {
                     1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1,
                     2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2,
                     1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1},
-                   kernels::FrameworkType::CAFFE);
+                   ops::FrameworkType::CAFFE);
   RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0},
                    2, Padding::SAME, {},
                    {1, 6, 6, 3}, {3, 3, 3, 1},
@@ -216,7 +216,7 @@ void TestNHWCSimple3x3SAME_S2() {
                     83, 94, 105, 116, 127, 138, 252, 276, 300, 142, 155, 168,
                     304, 332, 360, 168, 183, 198, 70, 77, 84, 91, 98, 105, 192,
                     207, 222, 104, 112, 120, 218, 235, 252, 117, 126, 135},
-                   kernels::FrameworkType::TENSORFLOW);
+                   ops::FrameworkType::TENSORFLOW);
   RunTestSimple<D>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0},
                    2, Padding::SAME, {2, 2},
                    {0}, {3, 3, 3, 1},
@@ -229,7 +229,7 @@ void TestNHWCSimple3x3SAME_S2() {
                     140, 151, 162, 78, 84, 90, 116, 127, 138, 252, 276, 300,
                     142, 155, 168, 304, 332, 360, 168, 183, 198, 91, 98, 105,
                     192, 207, 222, 104, 112, 120, 218, 235, 252, 117, 126, 135},
-                   kernels::FrameworkType::CAFFE);
+                   ops::FrameworkType::CAFFE);
 }
 
 template <DeviceType D>
@@ -246,7 +246,7 @@ void TestNHWCSimple3x3SAME_S2_1() {
                     18, 18, 18, 45, 45, 45, 27, 27, 27, 45, 45, 45, 18, 18, 18,
                     30, 30, 30, 75, 75, 75, 45, 45, 45, 75, 75, 75, 30, 30, 30,
                     12, 12, 12, 30, 30, 30, 18, 18, 18, 30, 30, 30, 12, 12, 12},
-                   kernels::FrameworkType::TENSORFLOW);
+                   ops::FrameworkType::TENSORFLOW);
 }
 
 template <DeviceType D>
@@ -271,7 +271,7 @@ void TestNHWCSimple3x3VALID_S2() {
                     1, 1, 1,
                     1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1,
                     1, 1, 1},
-                   kernels::FrameworkType::TENSORFLOW);
+                   ops::FrameworkType::TENSORFLOW);
 }
 
 template <DeviceType D>
@@ -288,7 +288,7 @@ void TestNHWCSimple3x3VALID_S1() {
                     366, 399, 432, 234, 252, 270, 146, 157, 168, 354, 378, 402,
                     630, 669, 708, 502, 530, 558, 294, 309, 324, 133, 140, 147,
                     306, 321, 336, 522, 546, 570, 398, 415, 432, 225, 234, 243},
-                   kernels::FrameworkType::TENSORFLOW);
+                   ops::FrameworkType::TENSORFLOW);
 }
 
 template <DeviceType D>
@@ -297,7 +297,7 @@ void TestNHWCSimple2x2SAME() {
                    {1, 2, 2, 1}, {3, 3, 1, 1},
                    {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
                    {1, 2, 2, 1}, {4.f, 4.f, 4.f, 4.f},
-                   kernels::FrameworkType::TENSORFLOW);
+                   ops::FrameworkType::TENSORFLOW);
 }
 
 template <DeviceType D>
@@ -308,7 +308,7 @@ void TestNHWCSimple2x2VALID() {
       {1, 5, 5, 1},
       {1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f, 2.f, 2.f, 4.f,
        2.f, 2.f, 1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f},
-      kernels::FrameworkType::TENSORFLOW);
+      ops::FrameworkType::TENSORFLOW);
 }
 }  // namespace
 
@@ -397,11 +397,11 @@ void TestComplexDeconvNxNS12(const int batch,
     std::vector<int> paddings;
     std::vector<int> output_shape;
 
-    kernels::FrameworkType model_type =
+    ops::FrameworkType model_type =
         padding < 0 ?
-        kernels::FrameworkType::TENSORFLOW : kernels::FrameworkType::CAFFE;
+        ops::FrameworkType::TENSORFLOW : ops::FrameworkType::CAFFE;
 
-    if (model_type == kernels::FrameworkType::TENSORFLOW) {
+    if (model_type == ops::FrameworkType::TENSORFLOW) {
       if (type == Padding::SAME) {
         out_h = (height - 1) * stride_h + 1;
         out_w = (width - 1) * stride_w + 1;
@@ -421,7 +421,7 @@ void TestComplexDeconvNxNS12(const int batch,
       paddings.push_back(padding);
     }
 
-    if (model_type == kernels::FrameworkType::CAFFE) {
+    if (model_type == ops::FrameworkType::CAFFE) {
       OpDefBuilder("Deconv2D", "Deconv2dTest")
           .Input("InputNCHW")
           .Input("Filter")
@@ -458,13 +458,13 @@ void TestComplexDeconvNxNS12(const int batch,
 
     // run on gpu
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
+                        ops::BufferType::CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
 
-    if (model_type == kernels::FrameworkType::CAFFE) {
+    if (model_type == ops::FrameworkType::CAFFE) {
       OpDefBuilder("Deconv2D", "Deconv2dTest")
           .Input("InputImage")
           .Input("FilterImage")
@@ -492,7 +492,7 @@ void TestComplexDeconvNxNS12(const int batch,
     net.RunOp(D);
 
     ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
                             1e-4);
   };
diff --git a/mace/kernels/depth_to_space.cc b/mace/ops/depth_to_space.cc
similarity index 97%
rename from mace/kernels/depth_to_space.cc
rename to mace/ops/depth_to_space.cc
index cd10b2b0..be7a2f82 100644
--- a/mace/kernels/depth_to_space.cc
+++ b/mace/ops/depth_to_space.cc
@@ -17,11 +17,11 @@
 
 #include "mace/core/operator.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/depth_to_space.h"
+#include "mace/ops/opencl/image/depth_to_space.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class DepthToSpaceOp : public Operation {
@@ -127,5 +127,5 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc
index 45bc6036..822bf8f0 100644
--- a/mace/ops/depth_to_space_benchmark.cc
+++ b/mace/ops/depth_to_space_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -44,7 +43,7 @@ void DepthToSpace(
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
         .Input("InputImage")
diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc
index fdce99c1..aa9b9c28 100644
--- a/mace/ops/depth_to_space_test.cc
+++ b/mace/ops/depth_to_space_test.cc
@@ -15,7 +15,6 @@
 #include <fstream>
 
 #include <vector>
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -47,7 +46,7 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
 
   } else {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
         .Input("InputImage")
         .Output("OutputImage")
@@ -59,7 +58,7 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
 
   if (D == DeviceType::GPU) {
     ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
-                                          kernels::BufferType::IN_OUT_CHANNEL);
+                                          ops::BufferType::IN_OUT_CHANNEL);
   }
   auto expected = net.CreateTensor<float>(expected_shape, expected_data);
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
@@ -136,7 +135,7 @@ void RandomTest(const int block_size,
                                                   NHWC);
 
   BufferToImage<D, T>(&net, "Input", "InputImg",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
 
   OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
       .Input("InputImg")
@@ -149,7 +148,7 @@ void RandomTest(const int block_size,
   net.RunOp(D);
 
   ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
 
   if (DataTypeToEnum<T>::value == DT_FLOAT) {
     ExpectTensorNear<float>(*net.GetTensor("Output"),
diff --git a/mace/kernels/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc
similarity index 98%
rename from mace/kernels/depthwise_conv2d.cc
rename to mace/ops/depthwise_conv2d.cc
index 74def6cf..76eee2f2 100644
--- a/mace/kernels/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -26,24 +26,24 @@
 
 #include "mace/core/future.h"
 #include "mace/core/operator.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/arm/depthwise_conv2d_neon.h"
-#include "mace/kernels/conv_pool_2d_base.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/arm/depthwise_conv2d_neon.h"
+#include "mace/ops/conv_pool_2d_base.h"
 #include "mace/public/mace.h"
 #include "mace/utils/quantize.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/depthwise_conv2d.h"
-#include "mace/kernels/opencl/buffer/depthwise_conv2d.h"
+#include "mace/ops/opencl/image/depthwise_conv2d.h"
+#include "mace/ops/opencl/buffer/depthwise_conv2d.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 class DepthwiseConv2dOpBase : public ConvPool2dOpBase {
  public:
   explicit DepthwiseConv2dOpBase(OpConstructContext *context)
       : ConvPool2dOpBase(context),
-        activation_(kernels::StringToActivationType(
+        activation_(ops::StringToActivationType(
             Operation::GetOptionalArg<std::string>("activation",
                                                   "NOOP"))),
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
@@ -532,5 +532,5 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc
index 3257e580..54f3e8b7 100644
--- a/mace/ops/depthwise_conv2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -14,9 +14,8 @@
 
 #include <algorithm>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -81,11 +80,11 @@ void DepthwiseConv2d(int iters,
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::DW_CONV2D_FILTER);
+                        ops::BufferType::DW_CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
     OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc
index 3089286c..dfb76b44 100644
--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -53,11 +53,11 @@ void SimpleValidTest() {
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::DW_CONV2D_FILTER);
+                        ops::BufferType::DW_CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
     OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -73,7 +73,7 @@ void SimpleValidTest() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
   } else {
     MACE_NOT_IMPLEMENTED;
@@ -150,11 +150,11 @@ void ComplexValidTest(index_t batch,
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::DW_CONV2D_FILTER);
+                        ops::BufferType::DW_CONV2D_FILTER);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
     OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -170,7 +170,7 @@ void ComplexValidTest(index_t batch,
 
     // Transfer output
     ImageToBuffer<D, T>(&net, "OutputImage", "Output",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
 
   } else {
     MACE_NOT_IMPLEMENTED;
@@ -290,11 +290,11 @@ void TestNxNS12(const index_t height, const index_t width) {
     expected->Copy(*net.GetOutput("Output"));
 
     BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
-                                      kernels::BufferType::IN_OUT_CHANNEL);
+                                      ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<DeviceType::GPU, T>(&net, "Filter", "FilterImage",
-                                      kernels::BufferType::DW_CONV2D_FILTER);
+                                      ops::BufferType::DW_CONV2D_FILTER);
     BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
-                                      kernels::BufferType::ARGUMENT);
+                                      ops::BufferType::ARGUMENT);
     OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -312,7 +312,7 @@ void TestNxNS12(const index_t height, const index_t width) {
 
     // Transfer output
     ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "DeviceOutput",
-                                          kernels::BufferType::IN_OUT_CHANNEL);
+                                          ops::BufferType::IN_OUT_CHANNEL);
 
     // Check
     if (DataTypeToEnum<T>::value == DT_FLOAT) {
diff --git a/mace/kernels/eltwise.cc b/mace/ops/eltwise.cc
similarity index 98%
rename from mace/kernels/eltwise.cc
rename to mace/ops/eltwise.cc
index e33006ea..bb7532cc 100644
--- a/mace/kernels/eltwise.cc
+++ b/mace/ops/eltwise.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/eltwise.h"
+#include "mace/ops/eltwise.h"
 
 #include <algorithm>
 #include <cmath>
@@ -26,11 +26,11 @@
 #include "mace/core/tensor.h"
 #include "mace/utils/quantize.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/eltwise.h"
+#include "mace/ops/opencl/image/eltwise.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 
 inline index_t GetIndex(const std::vector<index_t> &shape,
@@ -792,8 +792,8 @@ class EltwiseOp : public Operation {
  public:
   explicit EltwiseOp(OpConstructContext *context)
       : Operation(context),
-        type_(static_cast<kernels::EltwiseType>(Operation::GetOptionalArg<int>(
-            "type", static_cast<int>(kernels::EltwiseType::NONE)))),
+        type_(static_cast<ops::EltwiseType>(Operation::GetOptionalArg<int>(
+            "type", static_cast<int>(ops::EltwiseType::NONE)))),
         coeff_(Operation::GetRepeatedArgs<float>("coeff")),
         scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
         scalar_input_index_(Operation::GetOptionalArg<int32_t>(
@@ -934,8 +934,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
  public:
   explicit EltwiseOp(OpConstructContext *context)
       : Operation(context),
-        type_(static_cast<kernels::EltwiseType>(Operation::GetOptionalArg<int>(
-            "type", static_cast<int>(kernels::EltwiseType::NONE)))),
+        type_(static_cast<ops::EltwiseType>(Operation::GetOptionalArg<int>(
+            "type", static_cast<int>(ops::EltwiseType::NONE)))),
         coeff_(Operation::GetRepeatedArgs<float>("coeff")),
         scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
         scalar_input_index_(Operation::GetOptionalArg<int32_t>(
@@ -1076,9 +1076,9 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit EltwiseOp(OpConstructContext *context)
       : Operation(context) {
-    EltwiseType type = static_cast<kernels::EltwiseType>(
+    EltwiseType type = static_cast<ops::EltwiseType>(
         Operation::GetOptionalArg<int>(
-            "type", static_cast<int>(kernels::EltwiseType::NONE)));
+            "type", static_cast<int>(ops::EltwiseType::NONE)));
     std::vector<float> coeff = Operation::GetRepeatedArgs<float>("coeff");
     float scalar_input = Operation::GetOptionalArg<float>("scalar_input", 1.0);
     int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
@@ -1121,5 +1121,5 @@ void RegisterEltwise(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/eltwise.h b/mace/ops/eltwise.h
similarity index 86%
rename from mace/kernels/eltwise.h
rename to mace/ops/eltwise.h
index b71f4e42..31ee93f5 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/ops/eltwise.h
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_ELTWISE_H_
-#define MACE_KERNELS_ELTWISE_H_
+#ifndef MACE_OPS_ELTWISE_H_
+#define MACE_OPS_ELTWISE_H_
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 enum EltwiseType {
   SUM = 0,
@@ -35,7 +35,7 @@ enum EltwiseType {
 
 inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_ELTWISE_H_
+#endif  // MACE_OPS_ELTWISE_H_
diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc
index 4a8fa041..82fbc63f 100644
--- a/mace/ops/eltwise_benchmark.cc
+++ b/mace/ops/eltwise_benchmark.cc
@@ -14,9 +14,8 @@
 
 #include <string>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/kernels/eltwise.h"
+#include "mace/ops/eltwise.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -26,7 +25,7 @@ namespace test {
 namespace {
 template <DeviceType D, typename T>
 void EltwiseBenchmark(
-    int iters, kernels::EltwiseType type, int n, int h, int w, int c) {
+    int iters, ops::EltwiseType type, int n, int h, int w, int c) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -36,9 +35,9 @@ void EltwiseBenchmark(
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, half>(&net, "Input0", "InputImg0",
-                           kernels::BufferType::IN_OUT_CHANNEL);
+                           ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, half>(&net, "Input1", "InputImg1",
-                           kernels::BufferType::IN_OUT_CHANNEL);
+                           ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("Eltwise", "EltwiseTest")
         .Input("InputImg0")
         .Input("InputImg1")
@@ -84,7 +83,7 @@ void EltwiseBenchmark(
     mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     EltwiseBenchmark<DEVICE, TYPE>(                                           \
-        iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C);      \
+        iters, static_cast<ops::EltwiseType>(ELT_TYPE), N, H, W, C);      \
   }                                                                           \
   MACE_BENCHMARK(                                                             \
       MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc
index da9687ce..ac920ac0 100644
--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/eltwise.h"
-#include "mace/core/op_def_registry.h"
+#include <vector>
+
+#include "mace/ops/eltwise.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -24,7 +25,7 @@ class EltwiseOpTest : public OpsTestBase {};
 
 namespace {
 template <DeviceType D, typename T, typename DstType>
-void SimpleScalarScalar(const kernels::EltwiseType type,
+void SimpleScalarScalar(const ops::EltwiseType type,
                         const T input,
                         const float x,
                         const DstType output) {
@@ -40,7 +41,7 @@ void SimpleScalarScalar(const kernels::EltwiseType type,
         .AddIntArg("T", DataTypeToEnum<T>::v())
         .AddIntArg("type", static_cast<int>(type))
         .AddFloatArg("scalar_input", x)
-        .OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
+        .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
         .Output("Output")
         .Finalize(net.NewOperatorDef());
     // Run
@@ -55,7 +56,7 @@ void SimpleScalarScalar(const kernels::EltwiseType type,
 }
 
 template <DeviceType D, typename T, typename DstType>
-void SimpleTensorScalar(const kernels::EltwiseType type,
+void SimpleTensorScalar(const ops::EltwiseType type,
                         const std::vector<index_t> &shape,
                         const std::vector<T> &input,
                         const float x,
@@ -74,7 +75,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
         .AddIntArg("type", static_cast<int>(type))
         .AddFloatArg("scalar_input", x)
         .AddIntArg("data_format", DataFormat::NCHW)
-        .OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
+        .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
         .Output("TOutput")
         .Finalize(net.NewOperatorDef());
     // Run
@@ -82,7 +83,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
     net.TransformDataFormat<D, DstType>("TOutput", NCHW, "Output", NHWC);
   } else {
     BufferToImage<D, T>(&net, "Input", "InputImg",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("Eltwise", "EltwiseTest")
         .Input("InputImg")
         .AddIntArg("type", static_cast<int>(type))
@@ -94,7 +95,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
     net.RunOp(D);
 
     ImageToBuffer<D, DstType>(&net, "OutputImg", "Output",
-                              kernels::BufferType::IN_OUT_CHANNEL);
+                              ops::BufferType::IN_OUT_CHANNEL);
   }
 
   auto expected = net.CreateTensor<DstType>(shape, output);
@@ -103,7 +104,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
 }
 
 template <DeviceType D, typename T, typename DstType>
-void SimpleTensorEltwise(const kernels::EltwiseType type,
+void SimpleTensorEltwise(const ops::EltwiseType type,
                          const std::vector<index_t> &shape0,
                          const std::vector<T> &input0,
                          const std::vector<index_t> &shape1,
@@ -124,7 +125,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
             .AddIntArg("type", static_cast<int>(type))
             .AddFloatsArg("coeff", coeff)
             .AddIntArg("data_format", DataFormat::NCHW)
-            .OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
+            .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
             .Output("TOutput");
     if (shape0.size() > 1) {
       net.TransformDataFormat<D, T>("Input0", NHWC, "TInput0", NCHW);
@@ -145,9 +146,9 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
     net.TransformDataFormat<D, DstType>("TOutput", NCHW, "Output", NHWC);
   } else {
     BufferToImage<D, T>(&net, "Input0", "InputImg0",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Input1", "InputImg1",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("Eltwise", "EltwiseTest")
         .Input("InputImg0")
         .Input("InputImg1")
@@ -160,7 +161,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
     net.RunOp(D);
 
     ImageToBuffer<D, DstType>(&net, "OutputImg", "Output",
-                              kernels::BufferType::IN_OUT_CHANNEL);
+                              ops::BufferType::IN_OUT_CHANNEL);
   }
 
   std::vector<index_t> output_shape = shape0;
@@ -173,7 +174,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
 }
 
 template <DeviceType D, typename T, typename DstType>
-void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type,
+void TensorGeneralBroadcastEltwise(const ops::EltwiseType type,
                                    const std::vector<index_t> &shape0,
                                    const std::vector<T> &input0,
                                    const std::vector<index_t> &shape1,
@@ -196,7 +197,7 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type,
             .Input("Input1")
             .AddIntArg("type", static_cast<int>(type))
             .AddFloatsArg("coeff", coeff)
-            .OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
+            .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
             .Output("Output");
     op_builder.Finalize(net.NewOperatorDef());
 
@@ -204,9 +205,9 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type,
     net.RunOp(D);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input0", "InputImage0",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Input1", "InputImage1",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     auto op_builder =
         OpDefBuilder("Eltwise", "EltwiseTest")
             .AddIntArg("T", DataTypeToEnum<T>::v())
@@ -214,7 +215,7 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type,
             .Input("InputImage1")
             .AddIntArg("type", static_cast<int>(type))
             .AddFloatsArg("coeff", coeff)
-            .OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
+            .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
             .Output("OutputImage");
     op_builder.Finalize(net.NewOperatorDef());
 
@@ -222,7 +223,7 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type,
     net.RunOp(D);
 
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     MACE_NOT_IMPLEMENTED;
   }
@@ -234,249 +235,249 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type,
 
 TEST_F(EltwiseOpTest, CPUSimpleScalarScalar) {
   SimpleScalarScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUM, 1, 2, 3);
+      ops::EltwiseType::SUM, 1, 2, 3);
   SimpleScalarScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUB, 1, 2, -1);
+      ops::EltwiseType::SUB, 1, 2, -1);
   SimpleScalarScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::PROD, 1, 2, 2);
+      ops::EltwiseType::PROD, 1, 2, 2);
   SimpleScalarScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::DIV, 1, 2, 0.5);
+      ops::EltwiseType::DIV, 1, 2, 0.5);
   SimpleScalarScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MIN, 1, 2, 1);
+      ops::EltwiseType::MIN, 1, 2, 1);
   SimpleScalarScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MAX, 1, 2, 2);
+      ops::EltwiseType::MAX, 1, 2, 2);
   SimpleScalarScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::NEG, 1, 2, -1);
+      ops::EltwiseType::NEG, 1, 2, -1);
   SimpleScalarScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::ABS, -1, 3, 1);
+      ops::EltwiseType::ABS, -1, 3, 1);
   SimpleScalarScalar<DeviceType::CPU, int32_t, int32_t>(
-      kernels::EltwiseType::EQUAL, 1, 3, 0);
+      ops::EltwiseType::EQUAL, 1, 3, 0);
   SimpleScalarScalar<DeviceType::CPU, int32_t, int32_t>(
-      kernels::EltwiseType::EQUAL, 3, 3, 1);
+      ops::EltwiseType::EQUAL, 3, 3, 1);
 }
 
 TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) {
-  SimpleTensorScalar<DeviceType::CPU, float, float>(kernels::EltwiseType::SUM,
+  SimpleTensorScalar<DeviceType::CPU, float, float>(ops::EltwiseType::SUM,
                                                     {1, 1, 1, 1}, {1}, 1, {2});
   SimpleTensorScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
+      ops::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
       {0, 1, 2, 3, 4, 5});
   SimpleTensorScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 2,
+      ops::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 2,
       {2, 4, 6, 8, 10, 12});
   SimpleTensorScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}, 2,
+      ops::EltwiseType::DIV, {1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}, 2,
       {1, 2, 3, 4, 5, 6});
   SimpleTensorScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
+      ops::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
       {1, 1, 1, 1, 1, 1});
   SimpleTensorScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
+      ops::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
       {3, 3, 3, 4, 5, 6});
   SimpleTensorScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::NEG, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
+      ops::EltwiseType::NEG, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
       {-1, -2, -3, -4, -5, -6});
   SimpleTensorScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::ABS, {1, 1, 2, 3}, {-1, -2, -3, -4, -5, -6}, 3,
+      ops::EltwiseType::ABS, {1, 1, 2, 3}, {-1, -2, -3, -4, -5, -6}, 3,
       {1, 2, 3, 4, 5, 6});
   SimpleTensorScalar<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
+      ops::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
       {0, 1, 4, 9, 16, 25});
   SimpleTensorScalar<DeviceType::CPU, int32_t, int32_t>(
-      kernels::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
+      ops::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
       {0, 0, 1, 0, 0, 0});
 }
 
 TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) {
-  SimpleTensorScalar<DeviceType::GPU, float, float>(kernels::EltwiseType::SUM,
+  SimpleTensorScalar<DeviceType::GPU, float, float>(ops::EltwiseType::SUM,
                                                     {1, 1, 1, 1}, {1}, 1, {2});
   SimpleTensorScalar<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
+      ops::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
       {0, 1, 2, 3, 4, 5});
   SimpleTensorScalar<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 2,
+      ops::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 2,
       {2, 4, 6, 8, 10, 12});
   SimpleTensorScalar<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}, 2,
+      ops::EltwiseType::DIV, {1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}, 2,
       {1, 2, 3, 4, 5, 6});
   SimpleTensorScalar<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
+      ops::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
       {1, 1, 1, 1, 1, 1});
   SimpleTensorScalar<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
+      ops::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
       {3, 3, 3, 4, 5, 6});
   SimpleTensorScalar<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::NEG, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
+      ops::EltwiseType::NEG, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 3,
       {-1, -2, -3, -4, -5, -6});
   SimpleTensorScalar<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::ABS, {1, 1, 2, 3}, {-1, -2, -3, -4, -5, -6}, 3,
+      ops::EltwiseType::ABS, {1, 1, 2, 3}, {-1, -2, -3, -4, -5, -6}, 3,
       {1, 2, 3, 4, 5, 6});
   SimpleTensorScalar<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
+      ops::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, 1,
       {0, 1, 4, 9, 16, 25});
 }
 
 TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3},
+      ops::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3},
       {1, 2, 3}, {2, 4, 6, 5, 7, 9});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      ops::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3},
+      ops::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3},
       {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
+      ops::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
       {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      ops::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
+      ops::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
       {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
   SimpleTensorEltwise<DeviceType::CPU, int32_t, int32_t>(
-      kernels::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
+      ops::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
       {1, 1, 1, 3}, {1, 2, 3}, {1, 1, 1, 0, 0, 0});
 
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
+      ops::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
       {1, 2, 3}, {2, 4, 6, 5, 7, 9});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUB, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      ops::EltwiseType::SUB, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::PROD, {3}, {1, 2, 3}, {1, 2, 1, 3},
+      ops::EltwiseType::PROD, {3}, {1, 2, 3}, {1, 2, 1, 3},
       {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::DIV, {5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
+      ops::EltwiseType::DIV, {5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
       {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MIN, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      ops::EltwiseType::MIN, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      ops::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
   SimpleTensorEltwise<DeviceType::CPU, int32_t, int32_t>(
-      kernels::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
+      ops::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
       {1, 2, 3}, {1, 1, 1, 0, 0, 0});
 }
 
 TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3},
+      ops::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 3},
       {1, 2, 3}, {2, 4, 6, 5, 7, 9});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      ops::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3},
+      ops::EltwiseType::PROD, {1, 1, 1, 3}, {1, 2, 3}, {1, 2, 1, 3},
       {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
+      ops::EltwiseType::DIV, {1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
       {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      ops::EltwiseType::MIN, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
+      ops::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
       {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
 }
 
 TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) {
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
+      ops::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
       {1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
+      ops::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
       {1, 2, 3, 4, 5, 6}, {0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.1, 0.1});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5},
+      ops::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5},
       {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
+      ops::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
       {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 16, 25, 36});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3},
+      ops::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3},
       {1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
+      ops::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
       {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
   SimpleTensorEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, {1, 2, 1, 5},
+      ops::EltwiseType::SQR_DIFF, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
   SimpleTensorEltwise<DeviceType::CPU, int32_t, int32_t>(
-      kernels::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
+      ops::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
       {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1});
 }
 TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
+      ops::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
       {1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
+      ops::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3},
       {1, 2, 3, 4, 5, 6}, {0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.1, 0.1});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5},
+      ops::EltwiseType::SUB, {1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5},
       {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
+      ops::EltwiseType::PROD, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
       {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 16, 25, 36});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3},
+      ops::EltwiseType::DIV, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3},
       {1, 2, 3, 4, 5, 6}, {1, 1, 1, 1, 1, 1});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
+      ops::EltwiseType::MIN, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
       {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      ops::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
   SimpleTensorEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, {1, 2, 1, 5},
+      ops::EltwiseType::SQR_DIFF, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5},
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
 }
 
 namespace {
 template <typename T>
-void RandomTensorScalar(const kernels::EltwiseType type,
+void RandomTensorScalar(const ops::EltwiseType type,
                         const std::vector<index_t> &shape) {
   // Construct graph
   OpsTestNet net;
@@ -501,7 +502,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
   expected->Copy(*net.GetOutput("Output"));
 
   BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg",
-                                    kernels::BufferType::IN_OUT_CHANNEL);
+                                    ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Eltwise", "EltwiseTest")
       .Input("InputImg")
       .AddIntArg("type", static_cast<int>(type))
@@ -514,7 +515,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
   net.RunOp(DeviceType::GPU);
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
 
   if (DataTypeToEnum<T>::value == DT_FLOAT) {
     ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-5);
@@ -524,7 +525,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
 }
 
 template <typename T>
-void RandomTensorEltwise(const kernels::EltwiseType type,
+void RandomTensorEltwise(const ops::EltwiseType type,
                          const std::vector<index_t> &shape0,
                          const std::vector<index_t> &shape1,
                          const std::vector<float> &coeff = {}) {
@@ -556,9 +557,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
   expected->Copy(*net.GetOutput("Output"));
 
   BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0",
-                                    kernels::BufferType::IN_OUT_CHANNEL);
+                                    ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImg1",
-                                    kernels::BufferType::IN_OUT_CHANNEL);
+                                    ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Eltwise", "EltwiseTest")
       .Input("InputImg0")
       .Input("InputImg1")
@@ -572,7 +573,7 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
   net.RunOp(DeviceType::GPU);
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
 
   if (DataTypeToEnum<T>::value == DT_FLOAT) {
     ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-5);
@@ -597,7 +598,7 @@ void QuantizedSum(const std::vector<index_t> &shape) {
   OpDefBuilder("Eltwise", "EltwiseTest")
       .Input("TInput0")
       .Input("TInput1")
-      .AddIntArg("type", static_cast<int>(kernels::EltwiseType::SUM))
+      .AddIntArg("type", static_cast<int>(ops::EltwiseType::SUM))
       .AddIntArg("data_format", DataFormat::NCHW)
       .Output("TOutput")
       .Finalize(net.NewOperatorDef());
@@ -638,7 +639,7 @@ void QuantizedSum(const std::vector<index_t> &shape) {
       .Input("QuantizedInput0")
       .Input("QuantizedInput1")
       .Output("QuantizedOutput")
-      .AddIntArg("type", static_cast<int>(kernels::EltwiseType::SUM))
+      .AddIntArg("type", static_cast<int>(ops::EltwiseType::SUM))
       .AddIntArg("T", static_cast<int>(DT_UINT8))
       .Finalize(net.NewOperatorDef());
   net.Setup(DeviceType::CPU);
@@ -663,159 +664,159 @@ void QuantizedSum(const std::vector<index_t> &shape) {
 }  // namespace
 
 TEST_F(EltwiseOpTest, RandomTensorScalarFloat) {
-  RandomTensorScalar<float>(kernels::EltwiseType::SUM, {1, 32, 32, 16});
-  RandomTensorScalar<float>(kernels::EltwiseType::SUB, {3, 32, 32, 16});
-  RandomTensorScalar<float>(kernels::EltwiseType::PROD, {1, 31, 37, 17});
-  RandomTensorScalar<float>(kernels::EltwiseType::DIV, {3, 31, 37, 17});
-  RandomTensorScalar<float>(kernels::EltwiseType::MIN, {1, 32, 32, 16});
-  RandomTensorScalar<float>(kernels::EltwiseType::MAX, {3, 31, 37, 17});
-  RandomTensorScalar<float>(kernels::EltwiseType::NEG, {1, 32, 32, 32});
-  RandomTensorScalar<float>(kernels::EltwiseType::ABS, {3, 31, 37, 17});
-  RandomTensorScalar<float>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17});
+  RandomTensorScalar<float>(ops::EltwiseType::SUM, {1, 32, 32, 16});
+  RandomTensorScalar<float>(ops::EltwiseType::SUB, {3, 32, 32, 16});
+  RandomTensorScalar<float>(ops::EltwiseType::PROD, {1, 31, 37, 17});
+  RandomTensorScalar<float>(ops::EltwiseType::DIV, {3, 31, 37, 17});
+  RandomTensorScalar<float>(ops::EltwiseType::MIN, {1, 32, 32, 16});
+  RandomTensorScalar<float>(ops::EltwiseType::MAX, {3, 31, 37, 17});
+  RandomTensorScalar<float>(ops::EltwiseType::NEG, {1, 32, 32, 32});
+  RandomTensorScalar<float>(ops::EltwiseType::ABS, {3, 31, 37, 17});
+  RandomTensorScalar<float>(ops::EltwiseType::SQR_DIFF, {3, 31, 37, 17});
 }
 
 TEST_F(EltwiseOpTest, RandomTensorScalarHalf) {
-  RandomTensorScalar<half>(kernels::EltwiseType::SUM, {1, 32, 32, 16});
-  RandomTensorScalar<half>(kernels::EltwiseType::SUB, {3, 32, 32, 16});
-  RandomTensorScalar<half>(kernels::EltwiseType::PROD, {1, 31, 37, 17});
-  RandomTensorScalar<half>(kernels::EltwiseType::DIV, {3, 31, 37, 17});
-  RandomTensorScalar<half>(kernels::EltwiseType::MIN, {1, 32, 32, 16});
-  RandomTensorScalar<half>(kernels::EltwiseType::MAX, {3, 31, 37, 17});
-  RandomTensorScalar<half>(kernels::EltwiseType::NEG, {1, 32, 32, 32});
-  RandomTensorScalar<half>(kernels::EltwiseType::ABS, {3, 31, 37, 17});
-  RandomTensorScalar<half>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17});
+  RandomTensorScalar<half>(ops::EltwiseType::SUM, {1, 32, 32, 16});
+  RandomTensorScalar<half>(ops::EltwiseType::SUB, {3, 32, 32, 16});
+  RandomTensorScalar<half>(ops::EltwiseType::PROD, {1, 31, 37, 17});
+  RandomTensorScalar<half>(ops::EltwiseType::DIV, {3, 31, 37, 17});
+  RandomTensorScalar<half>(ops::EltwiseType::MIN, {1, 32, 32, 16});
+  RandomTensorScalar<half>(ops::EltwiseType::MAX, {3, 31, 37, 17});
+  RandomTensorScalar<half>(ops::EltwiseType::NEG, {1, 32, 32, 32});
+  RandomTensorScalar<half>(ops::EltwiseType::ABS, {3, 31, 37, 17});
+  RandomTensorScalar<half>(ops::EltwiseType::SQR_DIFF, {3, 31, 37, 17});
 }
 
 TEST_F(EltwiseOpTest, RandomTensorVecFloat) {
-  RandomTensorEltwise<float>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
+  RandomTensorEltwise<float>(ops::EltwiseType::SUM, {1, 32, 32, 16},
                              {1, 1, 1, 16});
-  RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {5, 32, 32, 16},
+  RandomTensorEltwise<float>(ops::EltwiseType::SUB, {5, 32, 32, 16},
                              {5, 1, 1, 16});
-  RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {5, 32, 32, 16},
+  RandomTensorEltwise<float>(ops::EltwiseType::SUB, {5, 32, 32, 16},
                              {1, 1, 1, 16});
-  RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {5, 1, 1, 16},
+  RandomTensorEltwise<float>(ops::EltwiseType::SUB, {5, 1, 1, 16},
                              {5, 32, 32, 16});
-  RandomTensorEltwise<float>(kernels::EltwiseType::PROD, {1, 31, 37, 17},
+  RandomTensorEltwise<float>(ops::EltwiseType::PROD, {1, 31, 37, 17},
                              {1, 1, 1, 17});
-  RandomTensorEltwise<float>(kernels::EltwiseType::PROD, {1, 1, 1, 17},
+  RandomTensorEltwise<float>(ops::EltwiseType::PROD, {1, 1, 1, 17},
                              {1, 31, 37, 17});
-  RandomTensorEltwise<float>(kernels::EltwiseType::DIV, {3, 1, 1, 17},
+  RandomTensorEltwise<float>(ops::EltwiseType::DIV, {3, 1, 1, 17},
                              {3, 31, 37, 17});
-  RandomTensorEltwise<float>(kernels::EltwiseType::MIN, {1, 1, 1, 16},
+  RandomTensorEltwise<float>(ops::EltwiseType::MIN, {1, 1, 1, 16},
                              {1, 32, 32, 16});
-  RandomTensorEltwise<float>(kernels::EltwiseType::MAX, {5, 31, 37, 17},
+  RandomTensorEltwise<float>(ops::EltwiseType::MAX, {5, 31, 37, 17},
                              {5, 1, 1, 17});
-  RandomTensorEltwise<float>(kernels::EltwiseType::SQR_DIFF, {5, 31, 37, 17},
+  RandomTensorEltwise<float>(ops::EltwiseType::SQR_DIFF, {5, 31, 37, 17},
                              {5, 1, 1, 17});
 }
 
 TEST_F(EltwiseOpTest, RandomTensorVecHalf) {
-  RandomTensorEltwise<half>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
+  RandomTensorEltwise<half>(ops::EltwiseType::SUM, {1, 32, 32, 16},
                             {1, 1, 1, 16});
-  RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
+  RandomTensorEltwise<half>(ops::EltwiseType::SUB, {3, 32, 32, 16},
                             {3, 1, 1, 16});
-  RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
+  RandomTensorEltwise<half>(ops::EltwiseType::SUB, {3, 32, 32, 16},
                             {1, 1, 1, 16});
-  RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 1, 1, 16},
+  RandomTensorEltwise<half>(ops::EltwiseType::SUB, {3, 1, 1, 16},
                             {3, 32, 32, 16});
-  RandomTensorEltwise<half>(kernels::EltwiseType::PROD, {1, 1, 1, 17},
+  RandomTensorEltwise<half>(ops::EltwiseType::PROD, {1, 1, 1, 17},
                             {1, 31, 37, 17});
-  RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
+  RandomTensorEltwise<half>(ops::EltwiseType::DIV, {5, 31, 37, 17},
                             {5, 1, 1, 17});
-  RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
+  RandomTensorEltwise<half>(ops::EltwiseType::DIV, {5, 31, 37, 17},
                             {1, 1, 1, 17});
-  RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 1, 1, 17},
+  RandomTensorEltwise<half>(ops::EltwiseType::DIV, {5, 1, 1, 17},
                             {5, 31, 37, 17});
-  RandomTensorEltwise<half>(kernels::EltwiseType::MIN, {1, 1, 1, 16},
+  RandomTensorEltwise<half>(ops::EltwiseType::MIN, {1, 1, 1, 16},
                             {1, 32, 32, 16});
-  RandomTensorEltwise<half>(kernels::EltwiseType::MAX, {3, 31, 37, 17},
+  RandomTensorEltwise<half>(ops::EltwiseType::MAX, {3, 31, 37, 17},
                             {3, 1, 1, 17});
-  RandomTensorEltwise<half>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
+  RandomTensorEltwise<half>(ops::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
                             {3, 1, 1, 17});
 }
 
 TEST_F(EltwiseOpTest, RandomTensorTensorFloat) {
-  RandomTensorEltwise<float>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
+  RandomTensorEltwise<float>(ops::EltwiseType::SUM, {1, 32, 32, 16},
                              {1, 32, 32, 16});
-  RandomTensorEltwise<float>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
+  RandomTensorEltwise<float>(ops::EltwiseType::SUB, {3, 32, 32, 16},
                              {3, 32, 32, 16});
-  RandomTensorEltwise<float>(kernels::EltwiseType::PROD, {1, 31, 37, 17},
+  RandomTensorEltwise<float>(ops::EltwiseType::PROD, {1, 31, 37, 17},
                              {1, 31, 37, 17});
-  RandomTensorEltwise<float>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
+  RandomTensorEltwise<float>(ops::EltwiseType::DIV, {5, 31, 37, 17},
                              {5, 31, 37, 17});
-  RandomTensorEltwise<float>(kernels::EltwiseType::MIN, {1, 32, 32, 16},
+  RandomTensorEltwise<float>(ops::EltwiseType::MIN, {1, 32, 32, 16},
                              {1, 32, 32, 16});
-  RandomTensorEltwise<float>(kernels::EltwiseType::MAX, {3, 31, 37, 17},
+  RandomTensorEltwise<float>(ops::EltwiseType::MAX, {3, 31, 37, 17},
                              {3, 31, 37, 17});
-  RandomTensorEltwise<float>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
+  RandomTensorEltwise<float>(ops::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
                              {3, 31, 37, 17});
 }
 
 TEST_F(EltwiseOpTest, RandomTensorTensorHalf) {
-  RandomTensorEltwise<half>(kernels::EltwiseType::SUM, {1, 32, 32, 16},
+  RandomTensorEltwise<half>(ops::EltwiseType::SUM, {1, 32, 32, 16},
                             {1, 32, 32, 16});
-  RandomTensorEltwise<half>(kernels::EltwiseType::SUB, {3, 32, 32, 16},
+  RandomTensorEltwise<half>(ops::EltwiseType::SUB, {3, 32, 32, 16},
                             {3, 32, 32, 16});
-  RandomTensorEltwise<half>(kernels::EltwiseType::PROD, {1, 31, 37, 17},
+  RandomTensorEltwise<half>(ops::EltwiseType::PROD, {1, 31, 37, 17},
                             {1, 31, 37, 17});
-  RandomTensorEltwise<half>(kernels::EltwiseType::DIV, {5, 31, 37, 17},
+  RandomTensorEltwise<half>(ops::EltwiseType::DIV, {5, 31, 37, 17},
                             {5, 31, 37, 17});
-  RandomTensorEltwise<half>(kernels::EltwiseType::MIN, {1, 32, 32, 16},
+  RandomTensorEltwise<half>(ops::EltwiseType::MIN, {1, 32, 32, 16},
                             {1, 32, 32, 16});
-  RandomTensorEltwise<half>(kernels::EltwiseType::MAX, {3, 31, 37, 17},
+  RandomTensorEltwise<half>(ops::EltwiseType::MAX, {3, 31, 37, 17},
                             {3, 31, 37, 17});
-  RandomTensorEltwise<half>(kernels::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
+  RandomTensorEltwise<half>(ops::EltwiseType::SQR_DIFF, {3, 31, 37, 17},
                             {3, 31, 37, 17});
 }
 
 TEST_F(EltwiseOpTest, TensorGeneralBroadcastCPU) {
   TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
+      ops::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
       {1, 2}, {1, 1, 2, 3}, {2, 3, 4, 6, 7, 8});
   TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
+      ops::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
       {1, 2}, {1, 1, 2, 3}, {0, 1, 2, 2, 3, 4});
   TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
+      ops::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
       {1, 1, 2, 1}, {1, 2}, {1, 1, 2, 3}, {1, 2, 3, 8, 10, 12});
   TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
+      ops::EltwiseType::DIV, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
       {1, 2}, {1, 1, 2, 3}, {1, 2, 3, 2, 2.5, 3});
   TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
+      ops::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
       {1, 2}, {1, 1, 2, 3}, {1, 1, 1, 2, 2, 2});
   TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
+      ops::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
       {1, 2}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
   TensorGeneralBroadcastEltwise<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
+      ops::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
       {1, 1, 2, 1}, {1, 2}, {1, 1, 2, 3}, {0, 1, 4, 4, 9, 16});
   TensorGeneralBroadcastEltwise<DeviceType::CPU, int32_t, int32_t>(
-      kernels::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
+      ops::EltwiseType::EQUAL, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
       {1, 1, 2, 1}, {1, 2}, {1, 1, 2, 3}, {1, 0, 0, 0, 0, 0});
 }
 
 TEST_F(EltwiseOpTest, TensorGeneralBroadcastGPU) {
   TensorGeneralBroadcastEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
+      ops::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
       {1, 2}, {1, 1, 2, 3}, {2, 3, 4, 6, 7, 8});
   TensorGeneralBroadcastEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
+      ops::EltwiseType::SUB, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
       {1, 2}, {1, 1, 2, 3}, {0, 1, 2, 2, 3, 4});
   TensorGeneralBroadcastEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
+      ops::EltwiseType::PROD, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
       {1, 1, 2, 1}, {1, 2}, {1, 1, 2, 3}, {1, 2, 3, 8, 10, 12});
   TensorGeneralBroadcastEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::DIV, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
+      ops::EltwiseType::DIV, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
       {1, 2}, {1, 1, 2, 3}, {1, 2, 3, 2, 2.5, 3});
   TensorGeneralBroadcastEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
+      ops::EltwiseType::MIN, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
       {1, 2}, {1, 1, 2, 3}, {1, 1, 1, 2, 2, 2});
   TensorGeneralBroadcastEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
+      ops::EltwiseType::MAX, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 1},
       {1, 2}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
   TensorGeneralBroadcastEltwise<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
+      ops::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
       {1, 1, 2, 1}, {1, 2}, {1, 1, 2, 3}, {0, 1, 4, 4, 9, 16});
 }
 
diff --git a/mace/kernels/expand_dims.cc b/mace/ops/expand_dims.cc
similarity index 98%
rename from mace/kernels/expand_dims.cc
rename to mace/ops/expand_dims.cc
index 5dc58436..a912e0c9 100644
--- a/mace/kernels/expand_dims.cc
+++ b/mace/ops/expand_dims.cc
@@ -16,7 +16,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class ExpandDimsOp;
@@ -67,5 +67,5 @@ void RegisterExpandDims(OpRegistryBase *op_registry) {
                    DeviceType::CPU, uint8_t);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/expand_dims_test.cc b/mace/ops/expand_dims_test.cc
index ac3312ea..35acb4f1 100644
--- a/mace/ops/expand_dims_test.cc
+++ b/mace/ops/expand_dims_test.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "gmock/gmock.h"
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/fill.cc b/mace/ops/fill.cc
similarity index 97%
rename from mace/kernels/fill.cc
rename to mace/ops/fill.cc
index 0cd20930..b554c003 100644
--- a/mace/kernels/fill.cc
+++ b/mace/ops/fill.cc
@@ -16,7 +16,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class FillOp;
@@ -66,5 +66,5 @@ void RegisterFill(OpRegistryBase *op_registry) {
                    DeviceType::CPU, float);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/fill_test.cc b/mace/ops/fill_test.cc
index 8ecbed5d..5fde9968 100644
--- a/mace/ops/fill_test.cc
+++ b/mace/ops/fill_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/fixpoint.h b/mace/ops/fixpoint.h
similarity index 92%
rename from mace/kernels/fixpoint.h
rename to mace/ops/fixpoint.h
index 47f0a8d8..1d0ef0b9 100644
--- a/mace/kernels/fixpoint.h
+++ b/mace/ops/fixpoint.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_FIXPOINT_H_
-#define MACE_KERNELS_FIXPOINT_H_
+#ifndef MACE_OPS_FIXPOINT_H_
+#define MACE_OPS_FIXPOINT_H_
 
 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
@@ -23,7 +23,7 @@
 #include "mace/core/types.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 inline uint8_t FindMax(const uint8_t *xs, const index_t size) {
   uint8_t max_value = 0;
@@ -57,8 +57,8 @@ inline uint8_t FindMax(const uint8_t *xs, const index_t size) {
 }
 
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_FIXPOINT_H_
+#endif  // MACE_OPS_FIXPOINT_H_
 
diff --git a/mace/kernels/fixpoint_test.cc b/mace/ops/fixpoint_test.cc
similarity index 94%
rename from mace/kernels/fixpoint_test.cc
rename to mace/ops/fixpoint_test.cc
index 8b926cd9..83189695 100644
--- a/mace/kernels/fixpoint_test.cc
+++ b/mace/ops/fixpoint_test.cc
@@ -17,10 +17,10 @@
 #include <vector>
 #include <algorithm>
 
-#include "mace/kernels/fixpoint.h"
+#include "mace/ops/fixpoint.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace test {
 
 namespace {
@@ -49,6 +49,6 @@ TEST(FixpointTest, FindMax) {
 }
 
 }  // namespace test
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc
index a19d7d77..35e69cd1 100644
--- a/mace/ops/folded_batch_norm_test.cc
+++ b/mace/ops/folded_batch_norm_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -62,11 +61,11 @@ void Simple() {
     net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Scale", "ScaleImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
     BufferToImage<D, float>(&net, "Offset", "OffsetImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
 
     OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
         .Input("InputImage")
@@ -79,7 +78,7 @@ void Simple() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   }
 
   // Check
@@ -134,11 +133,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
 
   OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
       .Input("InputImage")
@@ -152,7 +151,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
   net.Sync();
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                           1e-5, 1e-4);
 }
@@ -196,11 +195,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
-                                       kernels::BufferType::IN_OUT_CHANNEL);
+                                       ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
 
   OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
       .Input("InputImage")
@@ -215,7 +214,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
   net.Sync();
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                           1e-2, 1e-2);
 }
@@ -259,11 +258,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
-                                        kernels::BufferType::ARGUMENT);
+                                        ops::BufferType::ARGUMENT);
 
   OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
       .Input("InputImage")
@@ -276,7 +275,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
   net.RunOp(DeviceType::GPU);
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                           1e-5, 1e-4);
 }
@@ -320,11 +319,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
-                                       kernels::BufferType::IN_OUT_CHANNEL);
+                                       ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
   BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
-                                       kernels::BufferType::ARGUMENT);
+                                       ops::BufferType::ARGUMENT);
 
   OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
       .Input("InputImage")
@@ -338,7 +337,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
   net.RunOp(DeviceType::GPU);
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                           1e-2, 1e-2);
 }
diff --git a/mace/kernels/fully_connected.cc b/mace/ops/fully_connected.cc
similarity index 96%
rename from mace/kernels/fully_connected.cc
rename to mace/ops/fully_connected.cc
index a7b74c69..c5a8872b 100644
--- a/mace/kernels/fully_connected.cc
+++ b/mace/ops/fully_connected.cc
@@ -19,22 +19,22 @@
 #include "mace/core/future.h"
 #include "mace/core/operator.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/gemm.h"
-#include "mace/kernels/gemmlowp_util.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/gemm.h"
+#include "mace/ops/gemmlowp_util.h"
 
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/fully_connected.h"
+#include "mace/ops/opencl/image/fully_connected.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 class FullyConnectedOpBase : public Operation {
  public:
   explicit FullyConnectedOpBase(OpConstructContext *context)
       : Operation(context),
-        activation_(kernels::StringToActivationType(
+        activation_(ops::StringToActivationType(
             Operation::GetOptionalArg<std::string>("activation",
                                                   "NOOP"))),
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
@@ -229,5 +229,5 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc
index 66af8792..6b75e60d 100644
--- a/mace/ops/fully_connected_benchmark.cc
+++ b/mace/ops/fully_connected_benchmark.cc
@@ -14,7 +14,6 @@
 
 #include <string>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -48,13 +47,13 @@ void FCBenchmark(
       .Output("Output")
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
-    kernels::BufferType weight_type = kernels::BufferType::WEIGHT_WIDTH;
+    ops::BufferType weight_type = ops::BufferType::WEIGHT_WIDTH;
     BufferToImage<D, T>(&net, "Weight", "WeightImage",
                         weight_type);
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
 
     OpDefBuilder("FullyConnected", "FullyConnectedTest")
         .Input("InputImage")
diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc
index d075aac2..0fd98848 100644
--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -14,7 +14,6 @@
 
 #include <fstream>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -52,11 +51,11 @@ void Simple(const std::vector<index_t> &input_shape,
     net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Weight", "WeightImage",
-                            kernels::BufferType::WEIGHT_WIDTH);
+                            ops::BufferType::WEIGHT_WIDTH);
     BufferToImage<D, float>(&net, "Bias", "BiasImage",
-                            kernels::BufferType::ARGUMENT);
+                            ops::BufferType::ARGUMENT);
 
     OpDefBuilder("FullyConnected", "FullyConnectedTest")
         .Input("InputImage")
@@ -69,7 +68,7 @@ void Simple(const std::vector<index_t> &input_shape,
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     MACE_NOT_IMPLEMENTED;
   }
@@ -160,11 +159,11 @@ void Random(const index_t batch,
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
-                                    kernels::BufferType::IN_OUT_CHANNEL);
+                                    ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::GPU, T>(&net, "Weight", "WeightImage",
-                                    kernels::BufferType::WEIGHT_WIDTH);
+                                    ops::BufferType::WEIGHT_WIDTH);
   BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
-                                    kernels::BufferType::ARGUMENT);
+                                    ops::BufferType::ARGUMENT);
 
   OpDefBuilder("FullyConnected", "FullyConnectedTest")
       .Input("InputImage")
@@ -178,7 +177,7 @@ void Random(const index_t batch,
   net.RunOp(DeviceType::GPU);
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
   if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
     ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-1,
                             1e-1);
diff --git a/mace/kernels/gather.cc b/mace/ops/gather.cc
similarity index 98%
rename from mace/kernels/gather.cc
rename to mace/ops/gather.cc
index ff947e82..f8ceb543 100644
--- a/mace/kernels/gather.cc
+++ b/mace/ops/gather.cc
@@ -17,7 +17,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class GatherOp;
@@ -100,5 +100,5 @@ void RegisterGather(OpRegistryBase *op_registry) {
                    DeviceType::CPU, float);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/gather_benchmark.cc b/mace/ops/gather_benchmark.cc
index 8a0cd123..5e52875c 100644
--- a/mace/ops/gather_benchmark.cc
+++ b/mace/ops/gather_benchmark.cc
@@ -14,7 +14,6 @@
 
 #include <string>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
diff --git a/mace/ops/gather_test.cc b/mace/ops/gather_test.cc
index c716b12a..2da0338b 100644
--- a/mace/ops/gather_test.cc
+++ b/mace/ops/gather_test.cc
@@ -14,7 +14,6 @@
 
 #include <fstream>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/gemm.cc b/mace/ops/gemm.cc
similarity index 99%
rename from mace/kernels/gemm.cc
rename to mace/ops/gemm.cc
index 5043a104..a4d2b8a8 100644
--- a/mace/kernels/gemm.cc
+++ b/mace/ops/gemm.cc
@@ -18,7 +18,7 @@
 
 #include "mace/core/tensor.h"
 #include "mace/core/runtime/cpu/cpu_runtime.h"
-#include "mace/kernels/gemm.h"
+#include "mace/ops/gemm.h"
 
 /**
  * Gemm does fast matrix multiplications with batch.
@@ -40,7 +40,7 @@
 #endif
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 namespace {
 inline void GemmBlock(const float *A,
@@ -1540,5 +1540,5 @@ void Gemv(const float *m_ptr,
 #endif
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/gemm.h b/mace/ops/gemm.h
similarity index 94%
rename from mace/kernels/gemm.h
rename to mace/ops/gemm.h
index 17096bf5..78f044e0 100644
--- a/mace/kernels/gemm.h
+++ b/mace/ops/gemm.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_GEMM_H_
-#define MACE_KERNELS_GEMM_H_
+#ifndef MACE_OPS_GEMM_H_
+#define MACE_OPS_GEMM_H_
 
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
@@ -25,7 +25,7 @@
 // Gemv function does fast matrix-vector multiplications with batch.
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 // Gemm calculates A[batch, height, K] dot B[batch, K, width] within each batch,
 // and output to C[batch, height, width].
@@ -72,7 +72,7 @@ void Transpose(const float *src,
                index_t stride_w,
                float *dst);
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_GEMM_H_
+#endif  // MACE_OPS_GEMM_H_
diff --git a/mace/kernels/gemm_test.cc b/mace/ops/gemm_test.cc
similarity index 84%
rename from mace/kernels/gemm_test.cc
rename to mace/ops/gemm_test.cc
index 0942247d..dcb8483b 100644
--- a/mace/kernels/gemm_test.cc
+++ b/mace/ops/gemm_test.cc
@@ -18,8 +18,8 @@
 #include <random>
 
 #include "mace/core/types.h"
-#include "mace/kernels/gemm.h"
-#include "mace/kernels/sgemm.h"
+#include "mace/ops/gemm.h"
+#include "mace/ops/sgemm.h"
 
 namespace mace {
 
@@ -44,9 +44,9 @@ void GemmTest(index_t batch,
                 [&gen, &nd] { return nd(gen); });
   std::generate(B.get(), B.get() + batch * K * M,
                 [&gen, &nd] { return nd(gen); });
-  kernels::Gemm(A.get(), B.get(), batch, N, K, M, C.get(), transpose_a,
+  ops::Gemm(A.get(), B.get(), batch, N, K, M, C.get(), transpose_a,
                 transpose_b);
-  kernels::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get(), transpose_a,
+  ops::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get(), transpose_a,
                    transpose_b);
 
   for (int i = 0; i < batch * N * M; ++i) {
@@ -66,8 +66,8 @@ void GemvTest(index_t batch, index_t N, index_t M) {
 
   std::generate(A.get(), A.get() + N * M, [&gen, &nd] { return nd(gen); });
   std::generate(B.get(), B.get() + batch * M, [&gen, &nd] { return nd(gen); });
-  kernels::Gemv(A.get(), B.get(), batch, M, N, C.get());
-  kernels::GemvRef(A.get(), B.get(), batch, M, N, C_ref.get());
+  ops::Gemv(A.get(), B.get(), batch, M, N, C.get());
+  ops::GemvRef(A.get(), B.get(), batch, M, N, C_ref.get());
 
   for (int i = 0; i < batch * N; ++i) {
     EXPECT_NEAR(C_ref[i], C[i], 0.1);
@@ -93,48 +93,48 @@ void SGemmTest(index_t batch,
                 [&gen, &nd] { return nd(gen); });
   std::generate(B.get(), B.get() + batch * K * M,
                 [&gen, &nd] { return nd(gen); });
-  kernels::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get(), transpose_a,
+  ops::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get(), transpose_a,
                    transpose_b);
 
-  kernels::MatrixMap<const float> matrix_a;
-  kernels::MatrixMap<const float> matrix_b;
+  ops::MatrixMap<const float> matrix_a;
+  ops::MatrixMap<const float> matrix_b;
 
   if (!transpose_a) {
     matrix_a =
-        kernels::MatrixMap<const float>(batch,
+        ops::MatrixMap<const float>(batch,
                                         N,
                                         K,
-                                        kernels::RowMajor,
+                                        ops::RowMajor,
                                         A.get());
   } else {
     matrix_a =
-        kernels::MatrixMap<const float>(batch,
+        ops::MatrixMap<const float>(batch,
                                         K,
                                         N,
-                                        kernels::RowMajor,
+                                        ops::RowMajor,
                                         A.get());
     matrix_a = matrix_a.transpose();
   }
 
   if (!transpose_b) {
     matrix_b =
-        kernels::MatrixMap<const float>(batch,
+        ops::MatrixMap<const float>(batch,
                                         K,
                                         M,
-                                        kernels::RowMajor,
+                                        ops::RowMajor,
                                         B.get());
   } else {
     matrix_b =
-        kernels::MatrixMap<const float>(batch,
+        ops::MatrixMap<const float>(batch,
                                         M,
                                         K,
-                                        kernels::RowMajor,
+                                        ops::RowMajor,
                                         B.get());
     matrix_b = matrix_b.transpose();
   }
-  kernels::MatrixMap<float> matrix_c(batch, N, M, kernels::RowMajor, C.get());
+  ops::MatrixMap<float> matrix_c(batch, N, M, ops::RowMajor, C.get());
 
-  kernels::SGemm sgemm;
+  ops::SGemm sgemm;
   sgemm(matrix_a, matrix_b, &matrix_c);
 
   for (int i = 0; i < N * M; ++i) {
diff --git a/mace/kernels/gemmlowp_util.h b/mace/ops/gemmlowp_util.h
similarity index 96%
rename from mace/kernels/gemmlowp_util.h
rename to mace/ops/gemmlowp_util.h
index 8a0148e1..e46e6ed8 100644
--- a/mace/kernels/gemmlowp_util.h
+++ b/mace/ops/gemmlowp_util.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_GEMMLOWP_UTIL_H_
-#define MACE_KERNELS_GEMMLOWP_UTIL_H_
+#ifndef MACE_OPS_GEMMLOWP_UTIL_H_
+#define MACE_OPS_GEMMLOWP_UTIL_H_
 
 #include <tuple>
 
@@ -75,4 +75,4 @@ struct GemmlowpOutputPipeline {
 };
 }  // namespace mace
 
-#endif  // MACE_KERNELS_GEMMLOWP_UTIL_H_
+#endif  // MACE_OPS_GEMMLOWP_UTIL_H_
diff --git a/mace/kernels/identity.cc b/mace/ops/identity.cc
similarity index 97%
rename from mace/kernels/identity.cc
rename to mace/ops/identity.cc
index 1fba94bd..54d92e56 100644
--- a/mace/kernels/identity.cc
+++ b/mace/ops/identity.cc
@@ -16,7 +16,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class IdentityOp : public Operation {
@@ -46,5 +46,5 @@ void RegisterIdentity(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/identity_test.cc b/mace/ops/identity_test.cc
index 1ef8848d..3787777b 100644
--- a/mace/ops/identity_test.cc
+++ b/mace/ops/identity_test.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "gmock/gmock.h"
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/infer_conv2d_shape.cc b/mace/ops/infer_conv2d_shape.cc
similarity index 97%
rename from mace/kernels/infer_conv2d_shape.cc
rename to mace/ops/infer_conv2d_shape.cc
index 0e80aa61..fbd1d1b9 100644
--- a/mace/kernels/infer_conv2d_shape.cc
+++ b/mace/ops/infer_conv2d_shape.cc
@@ -14,10 +14,10 @@
 
 
 #include "mace/core/operator.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class InferConv2dShapeOp : public Operation {
@@ -111,5 +111,5 @@ void RegisterInferConv2dShape(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/infer_conv2d_shape_test.cc b/mace/ops/infer_conv2d_shape_test.cc
index 735a599c..ab48817b 100644
--- a/mace/ops/infer_conv2d_shape_test.cc
+++ b/mace/ops/infer_conv2d_shape_test.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 
 namespace mace {
 namespace ops {
diff --git a/mace/kernels/local_response_norm.cc b/mace/ops/local_response_norm.cc
similarity index 98%
rename from mace/kernels/local_response_norm.cc
rename to mace/ops/local_response_norm.cc
index 6a51ccb3..16828baa 100644
--- a/mace/kernels/local_response_norm.cc
+++ b/mace/ops/local_response_norm.cc
@@ -17,7 +17,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class LocalResponseNormOp;
@@ -89,5 +89,5 @@ void RegisterLocalResponseNorm(OpRegistryBase *op_registry) {
                    LocalResponseNormOp, DeviceType::CPU, float);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/local_response_norm_benchmark.cc b/mace/ops/local_response_norm_benchmark.cc
index 893b65d1..b917c495 100644
--- a/mace/ops/local_response_norm_benchmark.cc
+++ b/mace/ops/local_response_norm_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
diff --git a/mace/ops/local_response_norm_test.cc b/mace/ops/local_response_norm_test.cc
index 55adcedd..aa3780ca 100644
--- a/mace/ops/local_response_norm_test.cc
+++ b/mace/ops/local_response_norm_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/lstm_cell.cc b/mace/ops/lstm_cell.cc
similarity index 95%
rename from mace/kernels/lstm_cell.cc
rename to mace/ops/lstm_cell.cc
index be7f50d9..19abafe0 100644
--- a/mace/kernels/lstm_cell.cc
+++ b/mace/ops/lstm_cell.cc
@@ -16,10 +16,10 @@
 #include <memory>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/opencl/image/lstm_cell.h"
+#include "mace/ops/opencl/image/lstm_cell.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class LSTMCellOp;
@@ -66,5 +66,5 @@ void RegisterLSTMCell(OpRegistryBase *op_registry) {
                    DeviceType::GPU, half);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/lstmcell_benchmark.cc b/mace/ops/lstmcell_benchmark.cc
index a1972e72..b8840bba 100644
--- a/mace/ops/lstmcell_benchmark.cc
+++ b/mace/ops/lstmcell_benchmark.cc
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/lstmcell_test_util.h"
 #include "mace/ops/ops_test_util.h"
@@ -50,15 +48,15 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) {
                    "PreCellCPU", forget_add, "CellCPU", "OutputCPU");
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "PreOutput", "PreOutputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Weight", "WeightImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
+                        ops::BufferType::ARGUMENT);
     BufferToImage<D, T>(&net, "PreCell", "PreCellImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("LSTMCell", "LSTMCellTest")
         .Input("InputImage")
diff --git a/mace/ops/lstmcell_test.cc b/mace/ops/lstmcell_test.cc
index 5b26c677..99dea59c 100644
--- a/mace/ops/lstmcell_test.cc
+++ b/mace/ops/lstmcell_test.cc
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
-#include "mace/kernels/eltwise.h"
+#include "mace/ops/eltwise.h"
 #include "mace/ops/lstmcell_test_util.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -53,15 +52,15 @@ void TestLSTMCell(const uint32_t &batch,
 
   // Run on GPU
   BufferToImage<D, T>(&net, "Input", "InputImage",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<D, T>(&net, "PreOutput", "PreOutputImage",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<D, T>(&net, "Weight", "WeightImage",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                      kernels::BufferType::ARGUMENT);
+                      ops::BufferType::ARGUMENT);
   BufferToImage<D, T>(&net, "PreCell", "PreCellImage",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
 
   OpDefBuilder("LSTMCell", "LSTMCellTest")
       .Input("InputImage")
@@ -78,9 +77,9 @@ void TestLSTMCell(const uint32_t &batch,
   net.RunOp(D);
 
   ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   ImageToBuffer<D, float>(&net, "CellImage", "Cell",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
 
 
   Tensor expected_cell, expected_output;
diff --git a/mace/ops/lstmcell_test_util.h b/mace/ops/lstmcell_test_util.h
index bbd523c9..a7d7a19d 100644
--- a/mace/ops/lstmcell_test_util.h
+++ b/mace/ops/lstmcell_test_util.h
@@ -17,8 +17,7 @@
 
 #include <string>
 
-#include "mace/core/op_def_registry.h"
-#include "mace/kernels/eltwise.h"
+#include "mace/ops/eltwise.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -79,7 +78,7 @@ void LSTMCellCPU(OpsTestNet *net,
       .Input("InputSigmoid")
       .Input("NewInputTanh")
       .AddIntArg("T", DataTypeToEnum<T>::v())
-      .AddIntArg("type", static_cast<int>(kernels::EltwiseType::PROD))
+      .AddIntArg("type", static_cast<int>(ops::EltwiseType::PROD))
       .Output("RememberMul")
       .Finalize(net->AddNewOperatorDef());
 
@@ -87,7 +86,7 @@ void LSTMCellCPU(OpsTestNet *net,
       .Input("SplitOutput2")
       .AddFloatArg("scalar_input", forget_add_name)
       .AddIntArg("T", DataTypeToEnum<T>::v())
-      .AddIntArg("type", static_cast<int>(kernels::EltwiseType::SUM))
+      .AddIntArg("type", static_cast<int>(ops::EltwiseType::SUM))
       .Output("ForgetAdd")
       .Finalize(net->AddNewOperatorDef());
 
@@ -101,7 +100,7 @@ void LSTMCellCPU(OpsTestNet *net,
       .Input("ForgetSigmoid")
       .Input(pre_cell_name)
       .AddIntArg("T", DataTypeToEnum<T>::v())
-      .AddIntArg("type", static_cast<int>(kernels::EltwiseType::PROD))
+      .AddIntArg("type", static_cast<int>(ops::EltwiseType::PROD))
       .Output("ForgetMulPreCell")
       .Finalize(net->AddNewOperatorDef());
 
@@ -109,7 +108,7 @@ void LSTMCellCPU(OpsTestNet *net,
       .Input("RememberMul")
       .Input("ForgetMulPreCell")
       .AddIntArg("T", DataTypeToEnum<T>::v())
-      .AddIntArg("type", static_cast<int>(kernels::EltwiseType::SUM))
+      .AddIntArg("type", static_cast<int>(ops::EltwiseType::SUM))
       .Output(cell_name)
       .Finalize(net->AddNewOperatorDef());
 
@@ -129,7 +128,7 @@ void LSTMCellCPU(OpsTestNet *net,
       .Input("OutputSigmoid")
       .Input("CellTanh")
       .AddIntArg("T", DataTypeToEnum<T>::v())
-      .AddIntArg("type", static_cast<int>(kernels::EltwiseType::PROD))
+      .AddIntArg("type", static_cast<int>(ops::EltwiseType::PROD))
       .Output(output_name)
       .Finalize(net->AddNewOperatorDef());
 }
diff --git a/mace/kernels/matmul.cc b/mace/ops/matmul.cc
similarity index 98%
rename from mace/kernels/matmul.cc
rename to mace/ops/matmul.cc
index 8ef93a29..8608657e 100644
--- a/mace/kernels/matmul.cc
+++ b/mace/ops/matmul.cc
@@ -21,16 +21,16 @@
 
 #include "mace/core/operator.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/gemm.h"
-#include "mace/kernels/gemmlowp_util.h"
-#include "mace/kernels/sgemm.h"
+#include "mace/ops/gemm.h"
+#include "mace/ops/gemmlowp_util.h"
+#include "mace/ops/sgemm.h"
 #include "mace/utils/utils.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/matmul.h"
+#include "mace/ops/opencl/image/matmul.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 class MatMulOpBase : public Operation {
  public:
@@ -353,5 +353,5 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc
index c553e33d..2d62d86a 100644
--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -12,16 +12,278 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <Eigen/Dense>
+#include <algorithm>
 #include <string>
+#include <tuple>
+#include <vector>
 
-#include "mace/core/op_def_registry.h"
+#include "public/gemmlowp.h"
 #include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/gemm.h"
+#include "mace/ops/sgemm.h"
 #include "mace/ops/ops_test_util.h"
 
+namespace gemmlowp {
+
+template<typename tScalar, MapOrder tOrder>
+class Matrix : public MatrixMap<tScalar, tOrder> {  // map that owns its data
+ public:
+  typedef MatrixMap<tScalar, tOrder> Map;
+  typedef MatrixMap<const tScalar, tOrder> ConstMap;
+  typedef typename Map::Scalar Scalar;
+  static const MapOrder Order = tOrder;
+  using Map::cols_;
+  using Map::data_;
+  using Map::kOrder;
+  using Map::rows_;
+  using Map::stride_;
+
+ public:
+  Matrix() : Map(nullptr, 0, 0, 0) {}
+
+  Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }
+
+  Matrix(const Matrix &other) : Map(nullptr, 0, 0, 0) { *this = other; }
+  // Deep copy: take other's shape, then copy all of its elements.
+  Matrix &operator=(const Matrix &other) {
+    Resize(other.rows_, other.cols_);
+    std::memcpy(data_, other.data_, size() * sizeof(Scalar));
+    return *this;
+  }
+  // Equal iff shapes match and all size() * sizeof(Scalar) bytes match.
+  friend bool operator==(const Matrix &a, const Matrix &b) {
+    return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
+        !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
+  }
+  // Sets the shape and a dense stride, then points data_ at resized storage.
+  void Resize(int rows, int cols) {
+    rows_ = rows;
+    cols_ = cols;
+    stride_ = kOrder == gemmlowp::MapOrder::ColMajor ? rows : cols;
+    storage.resize(size());
+    data_ = storage.data();
+  }
+  // Number of elements (not bytes).
+  int size() const { return rows_ * cols_; }
+  // Mutable and read-only MatrixMap views over the same buffer.
+  Map &map() { return *static_cast<Map *>(this); }
+
+  ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }
+
+ protected:
+  std::vector<Scalar> storage;
+};
+
+template<typename MatrixType>
+void MakeZero(MatrixType *m) {  // fills every entry of *m with 128
+  for (int c = 0; c < m->cols(); c++) {
+    for (int r = 0; r < m->rows(); r++) {
+      (*m)(r, c) = 128;  // NOTE(review): presumably 0 after a -128 offset
+    }
+  }
+}
+
+}  // namespace gemmlowp
+
 namespace mace {
 namespace ops {
 namespace test {
 
+// Benchmark raw matrix multiplication: MACE Gemm/SGemm, Eigen and gemmlowp
+
+namespace {
+
+// Times Gemm() on zero-filled float buffers: (m, k) x (k, n) -> (m, n).
+void MatmulBenchmark_Mace(int iters, int m, int k, int n) {
+  mace::testing::StopTiming();  // keep setup/warm-up out of the timing
+  std::vector<float> lhs(m * k);
+  std::vector<float> rhs(k * n);
+  std::vector<float> result(m * n);
+  // Warm up: one untimed call before the timer is restarted.
+  Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data());
+  mace::testing::StartTiming();
+  while (iters--) {  // timed loop: one full product per iteration
+    Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data());
+  }
+}
+
+void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) {
+  mace::testing::StopTiming();  // keep setup/warm-up out of the timing
+  std::vector<float> lhs(m * k);
+  std::vector<float> rhs(k * n);
+  std::vector<float> result(m * n);
+  // (m, k) x (k, n) float product via ops::SGemm over row-major MatrixMaps.
+  ops::MatrixMap<const float> matrix_lhs(1, m, k, RowMajor, lhs.data(),
+                                             true);
+  ops::MatrixMap<const float> matrix_rhs(1, k, n, RowMajor, rhs.data(),
+                                             true);
+  ops::MatrixMap<float> matrix_result(1, m, n, RowMajor, result.data());
+  // NOTE(review): the leading 1 is presumably the batch count -- confirm.
+  ops::SGemm sgemm;
+  // Warm up: one untimed call.
+  sgemm(matrix_lhs, matrix_rhs, &matrix_result);
+
+  mace::testing::StartTiming();
+  while (iters--) {  // timed loop: one full product per iteration
+    sgemm(matrix_lhs, matrix_rhs, &matrix_result);
+  }
+}
+
+void MatmulBenchmark_Eigen(int iters, int m, int k, int n) {
+  mace::testing::StopTiming();  // keep setup/warm-up out of the timing
+  Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(m, k);
+  Eigen::MatrixXf rhs = Eigen::MatrixXf::Random(k, n);
+  Eigen::MatrixXf result = Eigen::MatrixXf::Zero(m, n);
+  // Warm up: one untimed Eigen product of the random (m, k) x (k, n) inputs.
+  result = lhs * rhs;
+  mace::testing::StartTiming();
+  while (iters--) {  // timed loop: one full product per iteration
+    result = lhs * rhs;
+  }
+}
+
+void MatmulBenchmark_gemmlowp_uint8(int iters, int rows, int depth, int cols) {
+  mace::testing::StopTiming();
+  // (rows, depth) x (depth, cols) uint8 GEMM with a uint8 result.
+  gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::RowMajor> lhs;
+  gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::ColMajor> rhs;
+  gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::ColMajor> result;
+  lhs.Resize(rows, depth);
+  rhs.Resize(depth, cols);
+  result.Resize(rows, cols);
+  gemmlowp::MakeZero(&lhs);
+  gemmlowp::MakeZero(&rhs);
+  gemmlowp::MakeZero(&result);
+  // Output pipeline: fixed-point quantize-down, then saturating uint8 cast.
+  gemmlowp::OutputStageQuantizeDownInt32ByFixedPoint quantize_down_stage;
+  quantize_down_stage.result_offset_after_shift = 128;
+  quantize_down_stage.result_fixedpoint_multiplier = 1234567890;
+  quantize_down_stage.result_shift = 16;
+  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+  const auto output_pipeline =
+      std::make_tuple(quantize_down_stage, saturating_cast_stage);
+  // Fetch the gemmlowp context from the test CPU runtime.
+  auto gemm_context =
+      mace::ops::test::OpTestContext::Get()
+          ->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
+  MACE_CHECK_NOTNULL(gemm_context);
+
+  using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
+  // Warm up: one untimed call; the two -128 args are the lhs/rhs offsets.
+  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
+      gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      -128, output_pipeline);
+
+  mace::testing::StartTiming();
+  while (iters--) {  // timed loop: one full GEMM per iteration
+    gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
+                                     BitDepthParams>(
+        gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+        -128, output_pipeline);
+  }
+}
+
+void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
+  mace::testing::StopTiming();
+  // (rows, depth) x (depth, cols) uint8 GEMM with raw int32 results.
+  gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::RowMajor> lhs;
+  gemmlowp::Matrix<std::uint8_t, gemmlowp::MapOrder::ColMajor> rhs;
+  gemmlowp::Matrix<std::int32_t, gemmlowp::MapOrder::ColMajor> result;
+  lhs.Resize(rows, depth);
+  rhs.Resize(depth, cols);
+  result.Resize(rows, cols);
+  gemmlowp::MakeZero(&lhs);
+  gemmlowp::MakeZero(&rhs);
+  gemmlowp::MakeZero(&result);
+  // Empty output pipeline: no output stage is applied to the accumulators.
+  const auto output_pipeline = std::make_tuple();
+  // Fetch the gemmlowp context from the test CPU runtime.
+  auto gemm_context =
+      mace::ops::test::OpTestContext::Get()
+          ->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
+  MACE_CHECK_NOTNULL(gemm_context);
+
+  using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
+  // Warm up: one untimed call; the two -128 args are the lhs/rhs offsets.
+  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
+      gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      -128, output_pipeline);
+
+  mace::testing::StartTiming();
+  while (iters--) {  // timed loop: one full GEMM per iteration
+    gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
+                                     BitDepthParams>(
+        gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+        -128, output_pipeline);
+  }
+}
+
+}  // namespace
+// Registers benchmark FUNC for (M, K) x (K, N); bytes count both inputs.
+#define MACE_BM_MATMUL_FUNC(M, K, N, FUNC, TYPE)                   \
+  static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \
+    const int64_t macc = static_cast<int64_t>(iters) * M * K * N;  \
+    const int64_t tot = static_cast<int64_t>(iters) * (M + N) * K; \
+    mace::testing::MaccProcessed(macc);                            \
+    mace::testing::BytesProcessed(tot * sizeof(TYPE));             \
+    MatmulBenchmark_##FUNC(iters, M, K, N);                        \
+  }                                                                \
+  MACE_BENCHMARK(MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC)
+// Registers one (M, K, N) case for each of the five MatmulBenchmark_* funcs.
+#define MACE_BM_MATMUL(M, K, N)                          \
+  MACE_BM_MATMUL_FUNC(M, K, N, Mace, float);             \
+  MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float);       \
+  MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float);            \
+  MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \
+  MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_int32, uint8_t);
+
+// Embedding size 384
+MACE_BM_MATMUL(7, 384, 384);
+MACE_BM_MATMUL(7, 384, 1536);
+MACE_BM_MATMUL(7, 1536, 384);
+
+MACE_BM_MATMUL(15, 384, 384);
+MACE_BM_MATMUL(15, 384, 1536);
+MACE_BM_MATMUL(15, 1536, 384);
+
+MACE_BM_MATMUL(1, 256, 256);
+MACE_BM_MATMUL(1, 256, 1536);
+MACE_BM_MATMUL(1, 1536, 256);
+MACE_BM_MATMUL(256, 256, 1);
+MACE_BM_MATMUL(1536, 256, 1);
+MACE_BM_MATMUL(256, 1536, 1);
+MACE_BM_MATMUL(29792, 256, 1);
+MACE_BM_MATMUL(1, 256, 29792);
+MACE_BM_MATMUL(2, 256, 256);
+MACE_BM_MATMUL(2, 256, 1536);
+MACE_BM_MATMUL(2, 1536, 256);
+MACE_BM_MATMUL(3, 256, 256);
+MACE_BM_MATMUL(3, 256, 1536);
+MACE_BM_MATMUL(3, 1536, 256);
+MACE_BM_MATMUL(4, 256, 256);
+MACE_BM_MATMUL(4, 256, 1536);
+MACE_BM_MATMUL(4, 1536, 256);
+MACE_BM_MATMUL(8, 256, 256);
+MACE_BM_MATMUL(8, 256, 1536);
+MACE_BM_MATMUL(8, 1536, 256);
+MACE_BM_MATMUL(10, 256, 256);
+MACE_BM_MATMUL(10, 256, 1536);
+MACE_BM_MATMUL(10, 1536, 256);
+MACE_BM_MATMUL(15, 256, 256);
+MACE_BM_MATMUL(15, 256, 1536);
+MACE_BM_MATMUL(15, 1536, 256);
+
+// Embedding size 128
+MACE_BM_MATMUL(1, 128, 1536);
+MACE_BM_MATMUL(1, 128, 44678);
+
+// MobileNet
+MACE_BM_MATMUL(128, 128, 3136);
+MACE_BM_MATMUL(256, 256, 784);
+MACE_BM_MATMUL(512, 512, 196);
+MACE_BM_MATMUL(1024, 1024, 49);
+
 namespace {
 template <DeviceType D, typename T>
 void MatMulBenchmark(
@@ -41,9 +303,9 @@ void MatMulBenchmark(
   }
   if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "A", "AImage",
-                        kernels::BufferType::IN_OUT_WIDTH);
+                        ops::BufferType::IN_OUT_WIDTH);
     BufferToImage<D, T>(&net, "B", "BImage",
-                        kernels::BufferType::IN_OUT_HEIGHT);
+                        ops::BufferType::IN_OUT_HEIGHT);
 
     OpDefBuilder("MatMul", "MatMulBM")
         .Input("AImage")
@@ -137,7 +399,7 @@ void MatMulTransposeBenchmark(
   }                                                                            \
   MACE_BENCHMARK(MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
 
-#define MACE_BM_MATMUL(N, H, C, W)                 \
+#define MACE_BM_MATMUL_OP(N, H, C, W)              \
   MACE_BM_MATMUL_MACRO(N, H, C, W, float, CPU);    \
   MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU);    \
   MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU);     \
@@ -158,17 +420,17 @@ void MatMulTransposeBenchmark(
   MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);     \
   MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
 
-MACE_BM_MATMUL(1, 128, 128, 49);
-MACE_BM_MATMUL(2, 128, 128, 49);
-MACE_BM_MATMUL(3, 128, 128, 49);
-MACE_BM_MATMUL(4, 128, 128, 49);
-MACE_BM_MATMUL(16, 32, 128, 49);
-MACE_BM_MATMUL(16, 32, 128, 961);
-MACE_BM_MATMUL(16, 32, 128, 3969);
-MACE_BM_MATMUL(16, 128, 128, 49);
-MACE_BM_MATMUL(16, 49, 128, 128);
-MACE_BM_MATMUL(16, 128, 128, 961);
-MACE_BM_MATMUL(16, 128, 128, 3969);
+MACE_BM_MATMUL_OP(1, 128, 128, 49);
+MACE_BM_MATMUL_OP(2, 128, 128, 49);
+MACE_BM_MATMUL_OP(3, 128, 128, 49);
+MACE_BM_MATMUL_OP(4, 128, 128, 49);
+MACE_BM_MATMUL_OP(16, 32, 128, 49);
+MACE_BM_MATMUL_OP(16, 32, 128, 961);
+MACE_BM_MATMUL_OP(16, 32, 128, 3969);
+MACE_BM_MATMUL_OP(16, 128, 128, 49);
+MACE_BM_MATMUL_OP(16, 49, 128, 128);
+MACE_BM_MATMUL_OP(16, 128, 128, 961);
+MACE_BM_MATMUL_OP(16, 128, 128, 3969);
 
 MACE_BM_MATMUL_TRANPOSE(16, 32, 128, 49);
 MACE_BM_MATMUL_TRANPOSE(16, 32, 128, 961);
diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc
index e31d8616..d2d95874 100644
--- a/mace/ops/matmul_test.cc
+++ b/mace/ops/matmul_test.cc
@@ -14,7 +14,6 @@
 
 #include <fstream>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -39,9 +38,9 @@ void Simple(const std::vector<index_t> &A_shape,
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "A", "AImage",
-                            kernels::BufferType::IN_OUT_WIDTH);
+                            ops::BufferType::IN_OUT_WIDTH);
     BufferToImage<D, float>(&net, "B", "BImage",
-                            kernels::BufferType::IN_OUT_HEIGHT);
+                            ops::BufferType::IN_OUT_HEIGHT);
 
     OpDefBuilder("MatMul", "MatMulTest")
         .Input("AImage")
@@ -53,7 +52,7 @@ void Simple(const std::vector<index_t> &A_shape,
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_HEIGHT);
+                            ops::BufferType::IN_OUT_HEIGHT);
   } else {
     OpDefBuilder("MatMul", "MatMulTest")
         .Input("A")
@@ -130,9 +129,9 @@ void Complex(const std::vector<index_t> &batch,
 
   // Run on opencl
   BufferToImage<DeviceType::GPU, T>(&net, "A", "AImage",
-                                    kernels::BufferType::IN_OUT_WIDTH);
+                                    ops::BufferType::IN_OUT_WIDTH);
   BufferToImage<DeviceType::GPU, T>(&net, "B", "BImage",
-                                    kernels::BufferType::IN_OUT_HEIGHT);
+                                    ops::BufferType::IN_OUT_HEIGHT);
 
   OpDefBuilder("MatMul", "MatMulTest")
       .Input("AImage")
@@ -144,7 +143,7 @@ void Complex(const std::vector<index_t> &batch,
   net.RunOp(DeviceType::GPU);
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
-                                        kernels::BufferType::IN_OUT_HEIGHT);
+                                        ops::BufferType::IN_OUT_HEIGHT);
 
   // run cpu
   std::vector<index_t> shape_a = batch;
diff --git a/mace/kernels/memory_benchmark.cc b/mace/ops/memory_benchmark.cc
similarity index 98%
rename from mace/kernels/memory_benchmark.cc
rename to mace/ops/memory_benchmark.cc
index 5d9ab1f4..e3bb30a8 100644
--- a/mace/kernels/memory_benchmark.cc
+++ b/mace/ops/memory_benchmark.cc
@@ -19,7 +19,7 @@
 #include "mace/core/testing/test_benchmark.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace test {
 
 // Test the speed of different access order of a NHWC buffer
@@ -107,5 +107,5 @@ MACE_BM_MEMORY_ACCESS(10, 64, 1024, 64, NHCW);
 MACE_BM_MEMORY_ACCESS(10, 64, 1024, 64, NWCH);
 
 }  // namespace test
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/activation.h b/mace/ops/opencl/activation.h
similarity index 85%
rename from mace/kernels/opencl/activation.h
rename to mace/ops/opencl/activation.h
index 35f1785c..7a20ff16 100644
--- a/mace/kernels/opencl/activation.h
+++ b/mace/ops/opencl/activation.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_ACTIVATION_H_
-#define MACE_KERNELS_OPENCL_ACTIVATION_H_
+#ifndef MACE_OPS_OPENCL_ACTIVATION_H_
+#define MACE_OPS_OPENCL_ACTIVATION_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -23,7 +23,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLActivationKernel {
  public:
   virtual MaceStatus Compute(
@@ -34,7 +34,7 @@ class OpenCLActivationKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLActivationKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_ACTIVATION_H_
+#endif  // MACE_OPS_OPENCL_ACTIVATION_H_
diff --git a/mace/kernels/opencl/addn.h b/mace/ops/opencl/addn.h
similarity index 86%
rename from mace/kernels/opencl/addn.h
rename to mace/ops/opencl/addn.h
index 908ff113..12e8888c 100644
--- a/mace/kernels/opencl/addn.h
+++ b/mace/ops/opencl/addn.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_ADDN_H_
-#define MACE_KERNELS_OPENCL_ADDN_H_
+#ifndef MACE_OPS_OPENCL_ADDN_H_
+#define MACE_OPS_OPENCL_ADDN_H_
 
 #include <vector>
 
@@ -25,7 +25,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 
 class OpenCLAddNKernel {
  public:
@@ -36,7 +36,7 @@ class OpenCLAddNKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLAddNKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_ADDN_H_
+#endif  // MACE_OPS_OPENCL_ADDN_H_
diff --git a/mace/kernels/opencl/batch_norm.h b/mace/ops/opencl/batch_norm.h
similarity index 85%
rename from mace/kernels/opencl/batch_norm.h
rename to mace/ops/opencl/batch_norm.h
index b97dfe6e..da3353d2 100644
--- a/mace/kernels/opencl/batch_norm.h
+++ b/mace/ops/opencl/batch_norm.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_BATCH_NORM_H_
-#define MACE_KERNELS_OPENCL_BATCH_NORM_H_
+#ifndef MACE_OPS_OPENCL_BATCH_NORM_H_
+#define MACE_OPS_OPENCL_BATCH_NORM_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -23,7 +23,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLBatchNormKernel {
  public:
   virtual MaceStatus Compute(
@@ -37,7 +37,7 @@ class OpenCLBatchNormKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBatchNormKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BATCH_NORM_H_
+#endif  // MACE_OPS_OPENCL_BATCH_NORM_H_
diff --git a/mace/kernels/opencl/batch_to_space.h b/mace/ops/opencl/batch_to_space.h
similarity index 86%
rename from mace/kernels/opencl/batch_to_space.h
rename to mace/ops/opencl/batch_to_space.h
index 9f155336..d07e79ee 100644
--- a/mace/kernels/opencl/batch_to_space.h
+++ b/mace/ops/opencl/batch_to_space.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
-#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
+#ifndef MACE_OPS_OPENCL_BATCH_TO_SPACE_H_
+#define MACE_OPS_OPENCL_BATCH_TO_SPACE_H_
 
 #include <vector>
 
@@ -26,7 +26,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLBatchToSpaceKernel {
  public:
   virtual MaceStatus Compute(
@@ -39,7 +39,7 @@ class OpenCLBatchToSpaceKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBatchToSpaceKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
+#endif  // MACE_OPS_OPENCL_BATCH_TO_SPACE_H_
diff --git a/mace/kernels/opencl/bias_add.h b/mace/ops/opencl/bias_add.h
similarity index 85%
rename from mace/kernels/opencl/bias_add.h
rename to mace/ops/opencl/bias_add.h
index 1a0a1050..67e5b3aa 100644
--- a/mace/kernels/opencl/bias_add.h
+++ b/mace/ops/opencl/bias_add.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_BIAS_ADD_H_
-#define MACE_KERNELS_OPENCL_BIAS_ADD_H_
+#ifndef MACE_OPS_OPENCL_BIAS_ADD_H_
+#define MACE_OPS_OPENCL_BIAS_ADD_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -23,7 +23,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLBiasAddKernel {
  public:
   virtual MaceStatus Compute(
@@ -34,7 +34,7 @@ class OpenCLBiasAddKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBiasAddKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BIAS_ADD_H_
+#endif  // MACE_OPS_OPENCL_BIAS_ADD_H_
diff --git a/mace/kernels/opencl/buffer/buffer_inverse_transform.h b/mace/ops/opencl/buffer/buffer_inverse_transform.h
similarity index 85%
rename from mace/kernels/opencl/buffer/buffer_inverse_transform.h
rename to mace/ops/opencl/buffer/buffer_inverse_transform.h
index 29e63143..647f2514 100644
--- a/mace/kernels/opencl/buffer/buffer_inverse_transform.h
+++ b/mace/ops/opencl/buffer/buffer_inverse_transform.h
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
-#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
+#define MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
 
-#include "mace/kernels/opencl/buffer_inverse_transform.h"
+#include "mace/ops/opencl/buffer_inverse_transform.h"
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 
@@ -65,7 +65,7 @@ MaceStatus BufferInverseTransform<T>::Compute(OpContext *context,
 
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
diff --git a/mace/kernels/opencl/buffer/buffer_transform.cc b/mace/ops/opencl/buffer/buffer_transform.cc
similarity index 98%
rename from mace/kernels/opencl/buffer/buffer_transform.cc
rename to mace/ops/opencl/buffer/buffer_transform.cc
index 7e5897a5..29f467e2 100644
--- a/mace/kernels/opencl/buffer/buffer_transform.cc
+++ b/mace/ops/opencl/buffer/buffer_transform.cc
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/opencl/buffer/buffer_transform.h"
+#include "mace/ops/opencl/buffer/buffer_transform.h"
 
 #include <vector>
 #include <set>
 #include <string>
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 
@@ -237,5 +237,5 @@ MaceStatus TransformArgument(
 
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/buffer/buffer_transform.h b/mace/ops/opencl/buffer/buffer_transform.h
similarity index 90%
rename from mace/kernels/opencl/buffer/buffer_transform.h
rename to mace/ops/opencl/buffer/buffer_transform.h
index 4a2213e4..4919bb09 100644
--- a/mace/kernels/opencl/buffer/buffer_transform.h
+++ b/mace/ops/opencl/buffer/buffer_transform.h
@@ -12,19 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
-#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
+#define MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
 
-#include "mace/kernels/opencl/buffer_transform.h"
+#include "mace/ops/opencl/buffer_transform.h"
 
 #include <vector>
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 
@@ -101,7 +101,7 @@ MaceStatus BufferTransform<T>::Compute(OpContext *context,
 
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
diff --git a/mace/kernels/opencl/buffer/buffer_type_transform.cc b/mace/ops/opencl/buffer/buffer_type_transform.cc
similarity index 97%
rename from mace/kernels/opencl/buffer/buffer_type_transform.cc
rename to mace/ops/opencl/buffer/buffer_type_transform.cc
index 4f78f83a..d1d52fe4 100644
--- a/mace/kernels/opencl/buffer/buffer_type_transform.cc
+++ b/mace/ops/opencl/buffer/buffer_type_transform.cc
@@ -15,10 +15,10 @@
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 
@@ -93,5 +93,5 @@ MaceStatus BufferTypeTransform(
 
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/buffer/conv_2d.h b/mace/ops/opencl/buffer/conv_2d.h
similarity index 95%
rename from mace/kernels/opencl/buffer/conv_2d.h
rename to mace/ops/opencl/buffer/conv_2d.h
index 8e7ee8b5..e50d8e5c 100644
--- a/mace/kernels/opencl/buffer/conv_2d.h
+++ b/mace/ops/opencl/buffer/conv_2d.h
@@ -11,20 +11,20 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
-#define MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_CONV_2D_H_
+#define MACE_OPS_OPENCL_BUFFER_CONV_2D_H_
 
-#include "mace/kernels/opencl/conv_2d.h"
+#include "mace/ops/opencl/conv_2d.h"
 
 #include <functional>
 #include <memory>
 #include <vector>
 
-#include "mace/kernels/opencl/buffer/utils.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/buffer/utils.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 namespace conv2d {
@@ -102,7 +102,7 @@ MaceStatus Conv2dKernel<T>::Compute(
   std::vector<index_t> output_shape(4);
   std::vector<int> paddings(2);
   if (padding_data.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
+    ops::CalcNHWCPaddingAndOutputSize(
         input->shape().data(), filter->shape().data(), dilations, strides,
         padding_type, output_shape.data(), paddings.data());
   } else {
@@ -211,7 +211,7 @@ MaceStatus Conv2dKernel<T>::Compute(
 
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_CONV_2D_H_
diff --git a/mace/kernels/opencl/buffer/conv_2d_1x1.cc b/mace/ops/opencl/buffer/conv_2d_1x1.cc
similarity index 97%
rename from mace/kernels/opencl/buffer/conv_2d_1x1.cc
rename to mace/ops/opencl/buffer/conv_2d_1x1.cc
index cbe12466..62e77b17 100644
--- a/mace/kernels/opencl/buffer/conv_2d_1x1.cc
+++ b/mace/ops/opencl/buffer/conv_2d_1x1.cc
@@ -14,11 +14,11 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 namespace conv2d {
@@ -122,5 +122,5 @@ MaceStatus Conv2d1x1(OpContext *context,
 }  // namespace conv2d
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/buffer/conv_2d_general.cc b/mace/ops/opencl/buffer/conv_2d_general.cc
similarity index 97%
rename from mace/kernels/opencl/buffer/conv_2d_general.cc
rename to mace/ops/opencl/buffer/conv_2d_general.cc
index 17506a8b..f9cc804d 100644
--- a/mace/kernels/opencl/buffer/conv_2d_general.cc
+++ b/mace/ops/opencl/buffer/conv_2d_general.cc
@@ -14,11 +14,11 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 namespace conv2d {
@@ -136,5 +136,5 @@ MaceStatus Conv2dGeneral(OpContext *context,
 }  // namespace conv2d
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/buffer/depthwise_conv2d.cc b/mace/ops/opencl/buffer/depthwise_conv2d.cc
similarity index 98%
rename from mace/kernels/opencl/buffer/depthwise_conv2d.cc
rename to mace/ops/opencl/buffer/depthwise_conv2d.cc
index 9ff27690..0ba4526c 100644
--- a/mace/kernels/opencl/buffer/depthwise_conv2d.cc
+++ b/mace/ops/opencl/buffer/depthwise_conv2d.cc
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/opencl/buffer/depthwise_conv2d.h"
+#include "mace/ops/opencl/buffer/depthwise_conv2d.h"
 
 #include <set>
 #include <string>
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 namespace depthwise {
@@ -133,5 +133,5 @@ MaceStatus DepthwiseConv2d(OpContext *context,
 }  // namespace depthwise
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/buffer/depthwise_conv2d.h b/mace/ops/opencl/buffer/depthwise_conv2d.h
similarity index 94%
rename from mace/kernels/opencl/buffer/depthwise_conv2d.h
rename to mace/ops/opencl/buffer/depthwise_conv2d.h
index b5e26c40..2d6ce0c8 100644
--- a/mace/kernels/opencl/buffer/depthwise_conv2d.h
+++ b/mace/ops/opencl/buffer/depthwise_conv2d.h
@@ -11,20 +11,20 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
-#define MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
+#define MACE_OPS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
 
-#include "mace/kernels/opencl/depthwise_conv2d.h"
+#include "mace/ops/opencl/depthwise_conv2d.h"
 
 #include <functional>
 #include <memory>
 #include <vector>
 
-#include "mace/kernels/opencl/buffer/utils.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/buffer/utils.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 namespace depthwise {
@@ -95,7 +95,7 @@ MaceStatus DepthwiseConv2dKernel<T>::Compute(
   std::vector<index_t> output_shape(4);
   std::vector<int> paddings(2);
   if (padding_data.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
+    ops::CalcNHWCPaddingAndOutputSize(
         input->shape().data(), fake_filter_shape.data(), dilations, strides,
         padding_type, output_shape.data(), paddings.data());
   } else {
@@ -182,7 +182,7 @@ MaceStatus DepthwiseConv2dKernel<T>::Compute(
 
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
diff --git a/mace/kernels/opencl/buffer/pooling.h b/mace/ops/opencl/buffer/pooling.h
similarity index 95%
rename from mace/kernels/opencl/buffer/pooling.h
rename to mace/ops/opencl/buffer/pooling.h
index a4433d13..4684d687 100644
--- a/mace/kernels/opencl/buffer/pooling.h
+++ b/mace/ops/opencl/buffer/pooling.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
-#define MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_POOLING_H_
+#define MACE_OPS_OPENCL_BUFFER_POOLING_H_
 
-#include "mace/kernels/opencl/pooling.h"
+#include "mace/ops/opencl/pooling.h"
 
 #include <functional>
 #include <memory>
@@ -22,11 +22,11 @@
 #include <string>
 #include <vector>
 
-#include "mace/kernels/opencl/buffer/utils.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/buffer/utils.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 
@@ -76,7 +76,7 @@ MaceStatus PoolingKernel<T>::Compute(
 
   std::vector<int> paddings(2);
   if (padding_data.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
+    ops::CalcNHWCPaddingAndOutputSize(
         input->shape().data(), filter_shape.data(), dilations, strides,
         padding_type, output_shape.data(), paddings.data());
   } else {
@@ -205,7 +205,7 @@ MaceStatus PoolingKernel<T>::Compute(
 
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_POOLING_H_
diff --git a/mace/kernels/opencl/buffer/softmax.h b/mace/ops/opencl/buffer/softmax.h
similarity index 93%
rename from mace/kernels/opencl/buffer/softmax.h
rename to mace/ops/opencl/buffer/softmax.h
index 502899d8..3147a935 100644
--- a/mace/kernels/opencl/buffer/softmax.h
+++ b/mace/ops/opencl/buffer/softmax.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
-#define MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_SOFTMAX_H_
+#define MACE_OPS_OPENCL_BUFFER_SOFTMAX_H_
 
-#include "mace/kernels/opencl/softmax.h"
+#include "mace/ops/opencl/softmax.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 template <typename T>
@@ -119,7 +119,7 @@ MaceStatus SoftmaxKernel<T>::Compute(
 
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_SOFTMAX_H_
diff --git a/mace/kernels/opencl/buffer/utils.cc b/mace/ops/opencl/buffer/utils.cc
similarity index 96%
rename from mace/kernels/opencl/buffer/utils.cc
rename to mace/ops/opencl/buffer/utils.cc
index a6d5502a..b4214a0a 100644
--- a/mace/kernels/opencl/buffer/utils.cc
+++ b/mace/ops/opencl/buffer/utils.cc
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/opencl/buffer/utils.h"
+#include "mace/ops/opencl/buffer/utils.h"
 
 #include <set>
 #include <string>
 #include <vector>
 
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 
@@ -93,5 +93,5 @@ MaceStatus PadInput(OpContext *context,
 
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/buffer/utils.h b/mace/ops/opencl/buffer/utils.h
similarity index 86%
rename from mace/kernels/opencl/buffer/utils.h
rename to mace/ops/opencl/buffer/utils.h
index f783691f..c1df8acc 100644
--- a/mace/kernels/opencl/buffer/utils.h
+++ b/mace/ops/opencl/buffer/utils.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
-#define MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_UTILS_H_
+#define MACE_OPS_OPENCL_BUFFER_UTILS_H_
 
 #include "mace/core/future.h"
 #include "mace/core/op_context.h"
@@ -21,7 +21,7 @@
 #include "mace/public/mace.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace buffer {
 
@@ -36,6 +36,6 @@ MaceStatus PadInput(OpContext *context,
 
 }  // namespace buffer
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
-#endif  // MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_UTILS_H_
diff --git a/mace/kernels/opencl/buffer_inverse_transform.h b/mace/ops/opencl/buffer_inverse_transform.h
similarity index 81%
rename from mace/kernels/opencl/buffer_inverse_transform.h
rename to mace/ops/opencl/buffer_inverse_transform.h
index 0c785910..7f52a64f 100644
--- a/mace/kernels/opencl/buffer_inverse_transform.h
+++ b/mace/ops/opencl/buffer_inverse_transform.h
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
-#define MACE_KERNELS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
+#define MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
 
-#include "mace/kernels/opencl/common.h"
+#include "mace/ops/opencl/common.h"
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
 
@@ -24,7 +24,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLBufferInverseTransformKernel {
  public:
   virtual MaceStatus Compute(OpContext *context,
@@ -35,7 +35,7 @@ class OpenCLBufferInverseTransformKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferInverseTransformKernel)
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
diff --git a/mace/kernels/opencl/buffer_transform.h b/mace/ops/opencl/buffer_transform.h
similarity index 82%
rename from mace/kernels/opencl/buffer_transform.h
rename to mace/ops/opencl/buffer_transform.h
index cc53ef77..45808d40 100644
--- a/mace/kernels/opencl/buffer_transform.h
+++ b/mace/ops/opencl/buffer_transform.h
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_BUFFER_TRANSFORM_H_
-#define MACE_KERNELS_OPENCL_BUFFER_TRANSFORM_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_
+#define MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_
 
-#include "mace/kernels/opencl/common.h"
+#include "mace/ops/opencl/common.h"
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
 
@@ -24,7 +24,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLBufferTransformKernel {
  public:
   virtual MaceStatus Compute(OpContext *context,
@@ -35,7 +35,7 @@ class OpenCLBufferTransformKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel)
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_BUFFER_TRANSFORM_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_
diff --git a/mace/kernels/opencl/channel_shuffle.h b/mace/ops/opencl/channel_shuffle.h
similarity index 83%
rename from mace/kernels/opencl/channel_shuffle.h
rename to mace/ops/opencl/channel_shuffle.h
index 5a5da027..e6243fcd 100644
--- a/mace/kernels/opencl/channel_shuffle.h
+++ b/mace/ops/opencl/channel_shuffle.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_CHANNEL_SHUFFLE_H_
-#define MACE_KERNELS_OPENCL_CHANNEL_SHUFFLE_H_
+#ifndef MACE_OPS_OPENCL_CHANNEL_SHUFFLE_H_
+#define MACE_OPS_OPENCL_CHANNEL_SHUFFLE_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -23,7 +23,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLChannelShuffleKernel {
  public:
   virtual MaceStatus Compute(
@@ -33,7 +33,7 @@ class OpenCLChannelShuffleKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLChannelShuffleKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_CHANNEL_SHUFFLE_H_
+#endif  // MACE_OPS_OPENCL_CHANNEL_SHUFFLE_H_
diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/ops/opencl/cl/activation.cl
similarity index 100%
rename from mace/kernels/opencl/cl/activation.cl
rename to mace/ops/opencl/cl/activation.cl
diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/ops/opencl/cl/addn.cl
similarity index 100%
rename from mace/kernels/opencl/cl/addn.cl
rename to mace/ops/opencl/cl/addn.cl
diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/ops/opencl/cl/batch_norm.cl
similarity index 100%
rename from mace/kernels/opencl/cl/batch_norm.cl
rename to mace/ops/opencl/cl/batch_norm.cl
diff --git a/mace/kernels/opencl/cl/batch_to_space.cl b/mace/ops/opencl/cl/batch_to_space.cl
similarity index 100%
rename from mace/kernels/opencl/cl/batch_to_space.cl
rename to mace/ops/opencl/cl/batch_to_space.cl
diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/ops/opencl/cl/bias_add.cl
similarity index 100%
rename from mace/kernels/opencl/cl/bias_add.cl
rename to mace/ops/opencl/cl/bias_add.cl
diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/ops/opencl/cl/buffer_to_image.cl
similarity index 100%
rename from mace/kernels/opencl/cl/buffer_to_image.cl
rename to mace/ops/opencl/cl/buffer_to_image.cl
diff --git a/mace/kernels/opencl/cl/buffer_transform.cl b/mace/ops/opencl/cl/buffer_transform.cl
similarity index 100%
rename from mace/kernels/opencl/cl/buffer_transform.cl
rename to mace/ops/opencl/cl/buffer_transform.cl
diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/ops/opencl/cl/channel_shuffle.cl
similarity index 100%
rename from mace/kernels/opencl/cl/channel_shuffle.cl
rename to mace/ops/opencl/cl/channel_shuffle.cl
diff --git a/mace/kernels/opencl/cl/common.h b/mace/ops/opencl/cl/common.h
similarity index 96%
rename from mace/kernels/opencl/cl/common.h
rename to mace/ops/opencl/cl/common.h
index abfdd978..069130d4 100644
--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/ops/opencl/cl/common.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_CL_COMMON_H_
-#define MACE_KERNELS_OPENCL_CL_COMMON_H_
+#ifndef MACE_OPS_OPENCL_CL_COMMON_H_
+#define MACE_OPS_OPENCL_CL_COMMON_H_
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
@@ -126,4 +126,4 @@ inline void check_out_of_range_for_buffer(__private const int length,
 }
 
 
-#endif  // MACE_KERNELS_OPENCL_CL_COMMON_H_
+#endif  // MACE_OPS_OPENCL_CL_COMMON_H_
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/ops/opencl/cl/concat.cl
similarity index 100%
rename from mace/kernels/opencl/cl/concat.cl
rename to mace/ops/opencl/cl/concat.cl
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/ops/opencl/cl/conv_2d.cl
similarity index 100%
rename from mace/kernels/opencl/cl/conv_2d.cl
rename to mace/ops/opencl/cl/conv_2d.cl
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/ops/opencl/cl/conv_2d_1x1.cl
similarity index 100%
rename from mace/kernels/opencl/cl/conv_2d_1x1.cl
rename to mace/ops/opencl/cl/conv_2d_1x1.cl
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl b/mace/ops/opencl/cl/conv_2d_1x1_buffer.cl
similarity index 100%
rename from mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl
rename to mace/ops/opencl/cl/conv_2d_1x1_buffer.cl
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/ops/opencl/cl/conv_2d_3x3.cl
similarity index 100%
rename from mace/kernels/opencl/cl/conv_2d_3x3.cl
rename to mace/ops/opencl/cl/conv_2d_3x3.cl
diff --git a/mace/kernels/opencl/cl/conv_2d_buffer.cl b/mace/ops/opencl/cl/conv_2d_buffer.cl
similarity index 100%
rename from mace/kernels/opencl/cl/conv_2d_buffer.cl
rename to mace/ops/opencl/cl/conv_2d_buffer.cl
diff --git a/mace/kernels/opencl/cl/crop.cl b/mace/ops/opencl/cl/crop.cl
similarity index 100%
rename from mace/kernels/opencl/cl/crop.cl
rename to mace/ops/opencl/cl/crop.cl
diff --git a/mace/kernels/opencl/cl/deconv_2d.cl b/mace/ops/opencl/cl/deconv_2d.cl
similarity index 100%
rename from mace/kernels/opencl/cl/deconv_2d.cl
rename to mace/ops/opencl/cl/deconv_2d.cl
diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/ops/opencl/cl/depth_to_space.cl
similarity index 100%
rename from mace/kernels/opencl/cl/depth_to_space.cl
rename to mace/ops/opencl/cl/depth_to_space.cl
diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/ops/opencl/cl/depthwise_conv2d.cl
similarity index 100%
rename from mace/kernels/opencl/cl/depthwise_conv2d.cl
rename to mace/ops/opencl/cl/depthwise_conv2d.cl
diff --git a/mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl b/mace/ops/opencl/cl/depthwise_conv2d_buffer.cl
similarity index 100%
rename from mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl
rename to mace/ops/opencl/cl/depthwise_conv2d_buffer.cl
diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/ops/opencl/cl/eltwise.cl
similarity index 100%
rename from mace/kernels/opencl/cl/eltwise.cl
rename to mace/ops/opencl/cl/eltwise.cl
diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/ops/opencl/cl/fully_connected.cl
similarity index 100%
rename from mace/kernels/opencl/cl/fully_connected.cl
rename to mace/ops/opencl/cl/fully_connected.cl
diff --git a/mace/kernels/opencl/cl/lstmcell.cl b/mace/ops/opencl/cl/lstmcell.cl
similarity index 100%
rename from mace/kernels/opencl/cl/lstmcell.cl
rename to mace/ops/opencl/cl/lstmcell.cl
diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/ops/opencl/cl/matmul.cl
similarity index 100%
rename from mace/kernels/opencl/cl/matmul.cl
rename to mace/ops/opencl/cl/matmul.cl
diff --git a/mace/kernels/opencl/cl/pad.cl b/mace/ops/opencl/cl/pad.cl
similarity index 100%
rename from mace/kernels/opencl/cl/pad.cl
rename to mace/ops/opencl/cl/pad.cl
diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/ops/opencl/cl/pooling.cl
similarity index 100%
rename from mace/kernels/opencl/cl/pooling.cl
rename to mace/ops/opencl/cl/pooling.cl
diff --git a/mace/kernels/opencl/cl/pooling_buffer.cl b/mace/ops/opencl/cl/pooling_buffer.cl
similarity index 100%
rename from mace/kernels/opencl/cl/pooling_buffer.cl
rename to mace/ops/opencl/cl/pooling_buffer.cl
diff --git a/mace/kernels/opencl/cl/reduce_mean.cl b/mace/ops/opencl/cl/reduce_mean.cl
similarity index 100%
rename from mace/kernels/opencl/cl/reduce_mean.cl
rename to mace/ops/opencl/cl/reduce_mean.cl
diff --git a/mace/kernels/opencl/cl/resize_bicubic.cl b/mace/ops/opencl/cl/resize_bicubic.cl
similarity index 100%
rename from mace/kernels/opencl/cl/resize_bicubic.cl
rename to mace/ops/opencl/cl/resize_bicubic.cl
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/ops/opencl/cl/resize_bilinear.cl
similarity index 100%
rename from mace/kernels/opencl/cl/resize_bilinear.cl
rename to mace/ops/opencl/cl/resize_bilinear.cl
diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/ops/opencl/cl/softmax.cl
similarity index 100%
rename from mace/kernels/opencl/cl/softmax.cl
rename to mace/ops/opencl/cl/softmax.cl
diff --git a/mace/kernels/opencl/cl/softmax_buffer.cl b/mace/ops/opencl/cl/softmax_buffer.cl
similarity index 100%
rename from mace/kernels/opencl/cl/softmax_buffer.cl
rename to mace/ops/opencl/cl/softmax_buffer.cl
diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/ops/opencl/cl/space_to_batch.cl
similarity index 100%
rename from mace/kernels/opencl/cl/space_to_batch.cl
rename to mace/ops/opencl/cl/space_to_batch.cl
diff --git a/mace/kernels/opencl/cl/space_to_depth.cl b/mace/ops/opencl/cl/space_to_depth.cl
similarity index 100%
rename from mace/kernels/opencl/cl/space_to_depth.cl
rename to mace/ops/opencl/cl/space_to_depth.cl
diff --git a/mace/kernels/opencl/cl/split.cl b/mace/ops/opencl/cl/split.cl
similarity index 100%
rename from mace/kernels/opencl/cl/split.cl
rename to mace/ops/opencl/cl/split.cl
diff --git a/mace/kernels/opencl/cl/sqrdiff_mean.cl b/mace/ops/opencl/cl/sqrdiff_mean.cl
similarity index 100%
rename from mace/kernels/opencl/cl/sqrdiff_mean.cl
rename to mace/ops/opencl/cl/sqrdiff_mean.cl
diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/ops/opencl/cl/winograd_transform.cl
similarity index 100%
rename from mace/kernels/opencl/cl/winograd_transform.cl
rename to mace/ops/opencl/cl/winograd_transform.cl
diff --git a/mace/kernels/opencl/common.h b/mace/ops/opencl/common.h
similarity index 84%
rename from mace/kernels/opencl/common.h
rename to mace/ops/opencl/common.h
index 176f58ed..0a238960 100644
--- a/mace/kernels/opencl/common.h
+++ b/mace/ops/opencl/common.h
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_COMMON_H_
-#define MACE_KERNELS_OPENCL_COMMON_H_
+#ifndef MACE_OPS_OPENCL_COMMON_H_
+#define MACE_OPS_OPENCL_COMMON_H_
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 enum BufferType {
   CONV2D_FILTER = 0,
@@ -30,6 +30,6 @@ enum BufferType {
   WEIGHT_WIDTH = 8,
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
-#endif  // MACE_KERNELS_OPENCL_COMMON_H_
+#endif  // MACE_OPS_OPENCL_COMMON_H_
diff --git a/mace/kernels/opencl/concat.h b/mace/ops/opencl/concat.h
similarity index 86%
rename from mace/kernels/opencl/concat.h
rename to mace/ops/opencl/concat.h
index 78ef14d9..f6b83434 100644
--- a/mace/kernels/opencl/concat.h
+++ b/mace/ops/opencl/concat.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_CONCAT_H_
-#define MACE_KERNELS_OPENCL_CONCAT_H_
+#ifndef MACE_OPS_OPENCL_CONCAT_H_
+#define MACE_OPS_OPENCL_CONCAT_H_
 
 #include <vector>
 
@@ -25,7 +25,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLConcatKernel {
  public:
   virtual MaceStatus Compute(
@@ -35,7 +35,7 @@ class OpenCLConcatKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLConcatKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_CONCAT_H_
+#endif  // MACE_OPS_OPENCL_CONCAT_H_
diff --git a/mace/kernels/opencl/conv_2d.h b/mace/ops/opencl/conv_2d.h
similarity index 82%
rename from mace/kernels/opencl/conv_2d.h
rename to mace/ops/opencl/conv_2d.h
index d5ff40ed..cf0911f7 100644
--- a/mace/kernels/opencl/conv_2d.h
+++ b/mace/ops/opencl/conv_2d.h
@@ -12,18 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_CONV_2D_H_
-#define MACE_KERNELS_OPENCL_CONV_2D_H_
+#ifndef MACE_OPS_OPENCL_CONV_2D_H_
+#define MACE_OPS_OPENCL_CONV_2D_H_
 
 #include <vector>
 
-#include "mace/kernels/activation.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/conv_pool_2d_util.h"
 
 namespace mace {
 class OpContext;
 
-namespace kernels {
+namespace ops {
 class OpenCLConv2dKernel {
  public:
   virtual MaceStatus Compute(
@@ -41,7 +41,7 @@ class OpenCLConv2dKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLConv2dKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_CONV_2D_H_
+#endif  // MACE_OPS_OPENCL_CONV_2D_H_
diff --git a/mace/kernels/opencl/crop.h b/mace/ops/opencl/crop.h
similarity index 86%
rename from mace/kernels/opencl/crop.h
rename to mace/ops/opencl/crop.h
index d59f67f5..f46f4f12 100644
--- a/mace/kernels/opencl/crop.h
+++ b/mace/ops/opencl/crop.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_CROP_H_
-#define MACE_KERNELS_OPENCL_CROP_H_
+#ifndef MACE_OPS_OPENCL_CROP_H_
+#define MACE_OPS_OPENCL_CROP_H_
 
 #include <vector>
 
@@ -25,7 +25,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLCropKernel {
  public:
   virtual MaceStatus Compute(
@@ -35,7 +35,7 @@ class OpenCLCropKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLCropKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_CROP_H_
+#endif  // MACE_OPS_OPENCL_CROP_H_
diff --git a/mace/kernels/opencl/deconv_2d.h b/mace/ops/opencl/deconv_2d.h
similarity index 84%
rename from mace/kernels/opencl/deconv_2d.h
rename to mace/ops/opencl/deconv_2d.h
index c601acfe..69bc6f97 100644
--- a/mace/kernels/opencl/deconv_2d.h
+++ b/mace/ops/opencl/deconv_2d.h
@@ -12,19 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_DECONV_2D_H_
-#define MACE_KERNELS_OPENCL_DECONV_2D_H_
+#ifndef MACE_OPS_OPENCL_DECONV_2D_H_
+#define MACE_OPS_OPENCL_DECONV_2D_H_
 
 #include <vector>
 
-#include "mace/kernels/activation.h"
+#include "mace/ops/activation.h"
 
 namespace mace {
 
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLDeconv2dKernel {
  public:
   virtual MaceStatus Compute(
@@ -40,7 +40,7 @@ class OpenCLDeconv2dKernel {
       Tensor *output) = 0;
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDeconv2dKernel);
 };
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_DECONV_2D_H_
+#endif  // MACE_OPS_OPENCL_DECONV_2D_H_
diff --git a/mace/kernels/opencl/depth_to_space.h b/mace/ops/opencl/depth_to_space.h
similarity index 83%
rename from mace/kernels/opencl/depth_to_space.h
rename to mace/ops/opencl/depth_to_space.h
index 02585911..c5fce383 100644
--- a/mace/kernels/opencl/depth_to_space.h
+++ b/mace/ops/opencl/depth_to_space.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_DEPTH_TO_SPACE_H_
-#define MACE_KERNELS_OPENCL_DEPTH_TO_SPACE_H_
+#ifndef MACE_OPS_OPENCL_DEPTH_TO_SPACE_H_
+#define MACE_OPS_OPENCL_DEPTH_TO_SPACE_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -22,7 +22,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 
 class OpenCLDepthToSpaceKernel {
  public:
@@ -33,7 +33,7 @@ class OpenCLDepthToSpaceKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDepthToSpaceKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_DEPTH_TO_SPACE_H_
+#endif  // MACE_OPS_OPENCL_DEPTH_TO_SPACE_H_
diff --git a/mace/kernels/opencl/depthwise_conv2d.h b/mace/ops/opencl/depthwise_conv2d.h
similarity index 81%
rename from mace/kernels/opencl/depthwise_conv2d.h
rename to mace/ops/opencl/depthwise_conv2d.h
index 24d08a20..b993e120 100644
--- a/mace/kernels/opencl/depthwise_conv2d.h
+++ b/mace/ops/opencl/depthwise_conv2d.h
@@ -12,19 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_DEPTHWISE_CONV2D_H_
-#define MACE_KERNELS_OPENCL_DEPTHWISE_CONV2D_H_
+#ifndef MACE_OPS_OPENCL_DEPTHWISE_CONV2D_H_
+#define MACE_OPS_OPENCL_DEPTHWISE_CONV2D_H_
 
 #include <vector>
 
-#include "mace/kernels/activation.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/conv_pool_2d_util.h"
 
 namespace mace {
 
 class OpContext;
 
-namespace kernels {
+namespace ops {
 class OpenCLDepthwiseConv2dKernel {
  public:
   virtual MaceStatus Compute(
@@ -42,7 +42,7 @@ class OpenCLDepthwiseConv2dKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDepthwiseConv2dKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_DEPTHWISE_CONV2D_H_
+#endif  // MACE_OPS_OPENCL_DEPTHWISE_CONV2D_H_
diff --git a/mace/kernels/opencl/eltwise.h b/mace/ops/opencl/eltwise.h
similarity index 85%
rename from mace/kernels/opencl/eltwise.h
rename to mace/ops/opencl/eltwise.h
index 83a94feb..abd38045 100644
--- a/mace/kernels/opencl/eltwise.h
+++ b/mace/ops/opencl/eltwise.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_ELTWISE_H_
-#define MACE_KERNELS_OPENCL_ELTWISE_H_
+#ifndef MACE_OPS_OPENCL_ELTWISE_H_
+#define MACE_OPS_OPENCL_ELTWISE_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -23,7 +23,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLEltwiseKernel {
  public:
   virtual MaceStatus Compute(
@@ -34,7 +34,7 @@ class OpenCLEltwiseKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLEltwiseKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_ELTWISE_H_
+#endif  // MACE_OPS_OPENCL_ELTWISE_H_
diff --git a/mace/kernels/opencl/fully_connected.h b/mace/ops/opencl/fully_connected.h
similarity index 82%
rename from mace/kernels/opencl/fully_connected.h
rename to mace/ops/opencl/fully_connected.h
index 7982d468..952c5b9c 100644
--- a/mace/kernels/opencl/fully_connected.h
+++ b/mace/ops/opencl/fully_connected.h
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_FULLY_CONNECTED_H_
-#define MACE_KERNELS_OPENCL_FULLY_CONNECTED_H_
+#ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_
+#define MACE_OPS_OPENCL_FULLY_CONNECTED_H_
 
-#include "mace/kernels/activation.h"
+#include "mace/ops/activation.h"
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -25,7 +25,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLFullyConnectedKernel {
  public:
   virtual MaceStatus Compute(
@@ -39,7 +39,7 @@ class OpenCLFullyConnectedKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLFullyConnectedKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_FULLY_CONNECTED_H_
+#endif  // MACE_OPS_OPENCL_FULLY_CONNECTED_H_
diff --git a/mace/kernels/opencl/helper.cc b/mace/ops/opencl/helper.cc
similarity index 99%
rename from mace/kernels/opencl/helper.cc
rename to mace/ops/opencl/helper.cc
index aa3daadb..7eb392a8 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/ops/opencl/helper.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 #include <algorithm>
 #include <string>
@@ -22,7 +22,7 @@
 #include "mace/utils/utils.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 namespace {
 // [(C + 3) / 4 * W, N * H]
@@ -489,5 +489,5 @@ MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
   return MaceStatus::MACE_SUCCESS;
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/helper.h b/mace/ops/opencl/helper.h
similarity index 85%
rename from mace/kernels/opencl/helper.h
rename to mace/ops/opencl/helper.h
index 0126d2ac..d4b5aa51 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/ops/opencl/helper.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_HELPER_H_
-#define MACE_KERNELS_OPENCL_HELPER_H_
+#ifndef MACE_OPS_OPENCL_HELPER_H_
+#define MACE_OPS_OPENCL_HELPER_H_
 
 #include <memory>
 #include <string>
@@ -25,30 +25,30 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/types.h"
-#include "mace/kernels/opencl/common.h"
+#include "mace/ops/opencl/common.h"
 #include "mace/utils/utils.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 // oorc for 'Out Of Range Check'
 #define MACE_OUT_OF_RANGE_DEFINITION           \
   std::shared_ptr<BufferBase> oorc_flag;
 
-#define MACE_OUT_OF_RANGE_CONFIG              \
-  if (runtime->IsOutOfRangeCheckEnabled()) {           \
-    built_options.emplace("-DOUT_OF_RANGE_CHECK");     \
+#define MACE_OUT_OF_RANGE_CONFIG                   \
+  if (runtime->IsOutOfRangeCheckEnabled()) {       \
+    built_options.emplace("-DOUT_OF_RANGE_CHECK"); \
   }
 
-#define MACE_OUT_OF_RANGE_INIT(kernel)                 \
-  if (runtime->IsOutOfRangeCheckEnabled()) {           \
-    oorc_flag = std::move(std::unique_ptr<Buffer>(     \
-        new Buffer((context)->device()->allocator())));         \
-    MACE_RETURN_IF_ERROR((oorc_flag)->Allocate(sizeof(int)));    \
-    oorc_flag->Map(nullptr);                           \
-    *(oorc_flag->mutable_data<int>()) = 0;            \
-    oorc_flag->UnMap();                                \
-    (kernel).setArg(0,                             \
-    *(static_cast<cl::Buffer *>(oorc_flag->buffer())));\
+#define MACE_OUT_OF_RANGE_INIT(kernel)                       \
+  if (runtime->IsOutOfRangeCheckEnabled()) {                 \
+    oorc_flag = std::move(std::unique_ptr<Buffer>(           \
+        new Buffer((context)->device()->allocator())));      \
+    MACE_RETURN_IF_ERROR((oorc_flag)->Allocate(sizeof(int)));\
+    oorc_flag->Map(nullptr);                                 \
+    *(oorc_flag->mutable_data<int>()) = 0;                   \
+    oorc_flag->UnMap();                                      \
+    (kernel).setArg(0,                                       \
+    *(static_cast<cl::Buffer *>(oorc_flag->buffer())));      \
   }
 
 #define MACE_OUT_OF_RANGE_SET_ARGS(kernel)             \
@@ -67,7 +67,7 @@ namespace kernels {
 #define MACE_OUT_OF_RANGE_VALIDATION                                    \
   if (runtime->IsOutOfRangeCheckEnabled()) {                            \
     oorc_flag->Map(nullptr);                                            \
-    int *kerror_code = oorc_flag->mutable_data<int>();                \
+    int *kerror_code = oorc_flag->mutable_data<int>();                  \
     MACE_CHECK(*kerror_code == 0, "Kernel error code: ", *kerror_code); \
     oorc_flag->UnMap();                                                 \
   }
@@ -173,6 +173,6 @@ std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
                                        const uint32_t *gws,
                                        const uint32_t kwg_size);
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
-#endif  // MACE_KERNELS_OPENCL_HELPER_H_
+#endif  // MACE_OPS_OPENCL_HELPER_H_
diff --git a/mace/kernels/opencl/image/activation.h b/mace/ops/opencl/image/activation.h
similarity index 93%
rename from mace/kernels/opencl/image/activation.h
rename to mace/ops/opencl/image/activation.h
index b1633076..93944b5b 100644
--- a/mace/kernels/opencl/image/activation.h
+++ b/mace/ops/opencl/image/activation.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_
-#define MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_ACTIVATION_H_
+#define MACE_OPS_OPENCL_IMAGE_ACTIVATION_H_
 
-#include "mace/kernels/opencl/activation.h"
+#include "mace/ops/opencl/activation.h"
 
 #include <memory>
 #include <set>
@@ -23,11 +23,11 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -142,7 +142,7 @@ MaceStatus ActivationKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_ACTIVATION_H_
diff --git a/mace/kernels/opencl/image/addn.h b/mace/ops/opencl/image/addn.h
similarity index 94%
rename from mace/kernels/opencl/image/addn.h
rename to mace/ops/opencl/image/addn.h
index 8f50d140..bde9c6b0 100644
--- a/mace/kernels/opencl/image/addn.h
+++ b/mace/ops/opencl/image/addn.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_ADDN_H_
-#define MACE_KERNELS_OPENCL_IMAGE_ADDN_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_ADDN_H_
+#define MACE_OPS_OPENCL_IMAGE_ADDN_H_
 
-#include "mace/kernels/opencl/addn.h"
+#include "mace/ops/opencl/addn.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -129,7 +129,7 @@ MaceStatus AddNKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_ADDN_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_ADDN_H_
diff --git a/mace/kernels/opencl/image/batch_norm.h b/mace/ops/opencl/image/batch_norm.h
similarity index 94%
rename from mace/kernels/opencl/image/batch_norm.h
rename to mace/ops/opencl/image/batch_norm.h
index 9414f28b..5685c514 100644
--- a/mace/kernels/opencl/image/batch_norm.h
+++ b/mace/ops/opencl/image/batch_norm.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_
-#define MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_BATCH_NORM_H_
+#define MACE_OPS_OPENCL_IMAGE_BATCH_NORM_H_
 
-#include "mace/kernels/opencl/batch_norm.h"
+#include "mace/ops/opencl/batch_norm.h"
 
 #include <memory>
 #include <vector>
@@ -23,11 +23,11 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -156,7 +156,7 @@ MaceStatus BatchNormKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_BATCH_NORM_H_
diff --git a/mace/kernels/opencl/image/batch_to_space.h b/mace/ops/opencl/image/batch_to_space.h
similarity index 93%
rename from mace/kernels/opencl/image/batch_to_space.h
rename to mace/ops/opencl/image/batch_to_space.h
index 8d984270..b92c9a4e 100644
--- a/mace/kernels/opencl/image/batch_to_space.h
+++ b/mace/ops/opencl/image/batch_to_space.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
-#define MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
+#define MACE_OPS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
 
-#include "mace/kernels/opencl/batch_to_space.h"
+#include "mace/ops/opencl/batch_to_space.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -124,7 +124,7 @@ MaceStatus BatchToSpaceKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
diff --git a/mace/kernels/opencl/image/bias_add.h b/mace/ops/opencl/image/bias_add.h
similarity index 93%
rename from mace/kernels/opencl/image/bias_add.h
rename to mace/ops/opencl/image/bias_add.h
index 2180df11..25e2392e 100644
--- a/mace/kernels/opencl/image/bias_add.h
+++ b/mace/ops/opencl/image/bias_add.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_
-#define MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_BIAS_ADD_H_
+#define MACE_OPS_OPENCL_IMAGE_BIAS_ADD_H_
 
-#include "mace/kernels/opencl/bias_add.h"
+#include "mace/ops/opencl/bias_add.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -125,7 +125,7 @@ MaceStatus BiasAddKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_BIAS_ADD_H_
diff --git a/mace/kernels/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h
similarity index 95%
rename from mace/kernels/opencl/image/buffer_to_image.h
rename to mace/ops/opencl/image/buffer_to_image.h
index 208c33fa..64319721 100644
--- a/mace/kernels/opencl/image/buffer_to_image.h
+++ b/mace/ops/opencl/image/buffer_to_image.h
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
-#define MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
+#define MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
 
-#include "mace/kernels/opencl/buffer_transform.h"
+#include "mace/ops/opencl/buffer_transform.h"
 
 #include <set>
 #include <string>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -204,7 +204,7 @@ MaceStatus BufferToImage<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
diff --git a/mace/kernels/opencl/image/channel_shuffle.h b/mace/ops/opencl/image/channel_shuffle.h
similarity index 93%
rename from mace/kernels/opencl/image/channel_shuffle.h
rename to mace/ops/opencl/image/channel_shuffle.h
index 8d351c0a..53acbf15 100644
--- a/mace/kernels/opencl/image/channel_shuffle.h
+++ b/mace/ops/opencl/image/channel_shuffle.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
-#define MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
+#define MACE_OPS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
 
-#include "mace/kernels/opencl/channel_shuffle.h"
+#include "mace/ops/opencl/channel_shuffle.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -115,7 +115,7 @@ MaceStatus ChannelShuffleKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
diff --git a/mace/kernels/opencl/image/concat.cc b/mace/ops/opencl/image/concat.cc
similarity index 98%
rename from mace/kernels/opencl/image/concat.cc
rename to mace/ops/opencl/image/concat.cc
index 9fc6dd48..aab72c54 100644
--- a/mace/kernels/opencl/image/concat.cc
+++ b/mace/ops/opencl/image/concat.cc
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/opencl/image/concat.h"
+#include "mace/ops/opencl/image/concat.h"
 
 #include <algorithm>
 #include <set>
 #include <string>
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 namespace concat {
@@ -207,5 +207,5 @@ MaceStatus ConcatN(OpContext *context,
 }  // namespace concat
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/image/concat.h b/mace/ops/opencl/image/concat.h
similarity index 93%
rename from mace/kernels/opencl/image/concat.h
rename to mace/ops/opencl/image/concat.h
index 4041cc3e..f12ad25c 100644
--- a/mace/kernels/opencl/image/concat.h
+++ b/mace/ops/opencl/image/concat.h
@@ -11,20 +11,20 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_
-#define MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_CONCAT_H_
+#define MACE_OPS_OPENCL_IMAGE_CONCAT_H_
 
-#include "mace/kernels/opencl/concat.h"
+#include "mace/ops/opencl/concat.h"
 
 #include <memory>
 #include <vector>
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 namespace concat {
@@ -114,7 +114,7 @@ MaceStatus ConcatKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_CONCAT_H_
diff --git a/mace/kernels/opencl/image/conv_2d.h b/mace/ops/opencl/image/conv_2d.h
similarity index 95%
rename from mace/kernels/opencl/image/conv_2d.h
rename to mace/ops/opencl/image/conv_2d.h
index 415beac4..224432e8 100644
--- a/mace/kernels/opencl/image/conv_2d.h
+++ b/mace/ops/opencl/image/conv_2d.h
@@ -11,20 +11,20 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_
-#define MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_CONV_2D_H_
+#define MACE_OPS_OPENCL_IMAGE_CONV_2D_H_
 
-#include "mace/kernels/opencl/conv_2d.h"
+#include "mace/ops/opencl/conv_2d.h"
 
 #include <memory>
 #include <vector>
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -137,7 +137,7 @@ MaceStatus Conv2dKernel<T>::Compute(
   std::vector<index_t> output_shape(4);
   std::vector<int> paddings(2);
   if (padding_data.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
+    ops::CalcNHWCPaddingAndOutputSize(
         input->shape().data(), filter->shape().data(), dilations, strides,
         padding_type, output_shape.data(), paddings.data());
   } else {
@@ -170,7 +170,7 @@ MaceStatus Conv2dKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_CONV_2D_H_
diff --git a/mace/kernels/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc
similarity index 98%
rename from mace/kernels/opencl/image/conv_2d_1x1.cc
rename to mace/ops/opencl/image/conv_2d_1x1.cc
index 36f8ba34..74a7ddc9 100644
--- a/mace/kernels/opencl/image/conv_2d_1x1.cc
+++ b/mace/ops/opencl/image/conv_2d_1x1.cc
@@ -14,11 +14,11 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -176,5 +176,5 @@ extern MaceStatus Conv2dOpenclK1x1(OpContext *context,
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc
similarity index 98%
rename from mace/kernels/opencl/image/conv_2d_3x3.cc
rename to mace/ops/opencl/image/conv_2d_3x3.cc
index f2f94c03..42a2a81e 100644
--- a/mace/kernels/opencl/image/conv_2d_3x3.cc
+++ b/mace/ops/opencl/image/conv_2d_3x3.cc
@@ -14,12 +14,12 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/opencl/helper.h"
 #include "mace/utils/utils.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -163,5 +163,5 @@ extern MaceStatus Conv2dOpenclK3x3(OpContext *context,
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc
similarity index 98%
rename from mace/kernels/opencl/image/conv_2d_general.cc
rename to mace/ops/opencl/image/conv_2d_general.cc
index 8221814e..9b577c2b 100644
--- a/mace/kernels/opencl/image/conv_2d_general.cc
+++ b/mace/ops/opencl/image/conv_2d_general.cc
@@ -14,12 +14,12 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/kernels/activation.h"
+#include "mace/ops/opencl/helper.h"
+#include "mace/ops/activation.h"
 #include "mace/utils/utils.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -176,5 +176,5 @@ extern MaceStatus Conv2dOpencl(OpContext *context,
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/image/crop.h b/mace/ops/opencl/image/crop.h
similarity index 96%
rename from mace/kernels/opencl/image/crop.h
rename to mace/ops/opencl/image/crop.h
index 7ab8ce1c..95a989a1 100644
--- a/mace/kernels/opencl/image/crop.h
+++ b/mace/ops/opencl/image/crop.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_CROP_H_
-#define MACE_KERNELS_OPENCL_IMAGE_CROP_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_CROP_H_
+#define MACE_OPS_OPENCL_IMAGE_CROP_H_
 
-#include "mace/kernels/opencl/crop.h"
+#include "mace/ops/opencl/crop.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -188,7 +188,7 @@ MaceStatus CropKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_CROP_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_CROP_H_
diff --git a/mace/kernels/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h
similarity index 96%
rename from mace/kernels/opencl/image/deconv_2d.h
rename to mace/ops/opencl/image/deconv_2d.h
index eae5978a..c4dfa2bf 100644
--- a/mace/kernels/opencl/image/deconv_2d.h
+++ b/mace/ops/opencl/image/deconv_2d.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_
-#define MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_DECONV_2D_H_
+#define MACE_OPS_OPENCL_IMAGE_DECONV_2D_H_
 
-#include "mace/kernels/opencl/deconv_2d.h"
+#include "mace/ops/opencl/deconv_2d.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -182,7 +182,7 @@ MaceStatus Deconv2dKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_DECONV_2D_H_
diff --git a/mace/kernels/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h
similarity index 94%
rename from mace/kernels/opencl/image/depth_to_space.h
rename to mace/ops/opencl/image/depth_to_space.h
index 0a961d53..2ed253df 100644
--- a/mace/kernels/opencl/image/depth_to_space.h
+++ b/mace/ops/opencl/image/depth_to_space.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
-#define MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
+#define MACE_OPS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
 
-#include "mace/kernels/opencl/depth_to_space.h"
+#include "mace/ops/opencl/depth_to_space.h"
 
 #include <memory>
 #include <vector>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -138,7 +138,7 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
diff --git a/mace/kernels/opencl/image/depthwise_conv2d.cc b/mace/ops/opencl/image/depthwise_conv2d.cc
similarity index 98%
rename from mace/kernels/opencl/image/depthwise_conv2d.cc
rename to mace/ops/opencl/image/depthwise_conv2d.cc
index 57953960..02409ebe 100644
--- a/mace/kernels/opencl/image/depthwise_conv2d.cc
+++ b/mace/ops/opencl/image/depthwise_conv2d.cc
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/opencl/image/depthwise_conv2d.h"
+#include "mace/ops/opencl/image/depthwise_conv2d.h"
 
 #include <algorithm>
 #include <set>
 #include <string>
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 namespace depthwise {
@@ -189,5 +189,5 @@ MaceStatus DepthwiseConv2d(OpContext *context,
 }  // namespace depthwise
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/image/depthwise_conv2d.h b/mace/ops/opencl/image/depthwise_conv2d.h
similarity index 92%
rename from mace/kernels/opencl/image/depthwise_conv2d.h
rename to mace/ops/opencl/image/depthwise_conv2d.h
index 7bfa9ede..e818b039 100644
--- a/mace/kernels/opencl/image/depthwise_conv2d.h
+++ b/mace/ops/opencl/image/depthwise_conv2d.h
@@ -11,20 +11,20 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
-#define MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
+#define MACE_OPS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
 
-#include "mace/kernels/opencl/depthwise_conv2d.h"
+#include "mace/ops/opencl/depthwise_conv2d.h"
 
 #include <memory>
 #include <vector>
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 namespace depthwise {
@@ -101,7 +101,7 @@ MaceStatus DepthwiseConv2dKernel<T>::Compute(
   std::vector<index_t> output_shape(4);
   std::vector<int> paddings(2);
   if (padding_data.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
+    ops::CalcNHWCPaddingAndOutputSize(
         input->shape().data(), fake_filter_shape.data(), dilations, strides,
         padding_type, output_shape.data(), paddings.data());
   } else {
@@ -124,7 +124,7 @@ MaceStatus DepthwiseConv2dKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
diff --git a/mace/kernels/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h
similarity index 95%
rename from mace/kernels/opencl/image/eltwise.h
rename to mace/ops/opencl/image/eltwise.h
index d2352602..25235a44 100644
--- a/mace/kernels/opencl/image/eltwise.h
+++ b/mace/ops/opencl/image/eltwise.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_
-#define MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_ELTWISE_H_
+#define MACE_OPS_OPENCL_IMAGE_ELTWISE_H_
 
-#include "mace/kernels/opencl/eltwise.h"
+#include "mace/ops/opencl/eltwise.h"
 
 #include <memory>
 #include <utility>
@@ -24,11 +24,11 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/eltwise.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/eltwise.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -185,7 +185,7 @@ MaceStatus EltwiseKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_ELTWISE_H_
diff --git a/mace/kernels/opencl/image/fully_connected.h b/mace/ops/opencl/image/fully_connected.h
similarity index 95%
rename from mace/kernels/opencl/image/fully_connected.h
rename to mace/ops/opencl/image/fully_connected.h
index 605c9ee9..2d8fbb88 100644
--- a/mace/kernels/opencl/image/fully_connected.h
+++ b/mace/ops/opencl/image/fully_connected.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_
-#define MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_FULLY_CONNECTED_H_
+#define MACE_OPS_OPENCL_IMAGE_FULLY_CONNECTED_H_
 
-#include "mace/kernels/opencl/fully_connected.h"
+#include "mace/ops/opencl/fully_connected.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -184,7 +184,7 @@ MaceStatus FullyConnectedKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_FULLY_CONNECTED_H_
diff --git a/mace/kernels/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h
similarity index 95%
rename from mace/kernels/opencl/image/image_to_buffer.h
rename to mace/ops/opencl/image/image_to_buffer.h
index da8667f0..9aa65f0e 100644
--- a/mace/kernels/opencl/image/image_to_buffer.h
+++ b/mace/ops/opencl/image/image_to_buffer.h
@@ -12,19 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
-#define MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
+#define MACE_OPS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
 
 #include <set>
 #include <string>
 #include <vector>
 
 #include "mace/core/op_context.h"
-#include "mace/kernels/opencl/buffer_inverse_transform.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/buffer_inverse_transform.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -185,7 +185,7 @@ MaceStatus ImageToBuffer<T>::Compute(OpContext *context,
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
diff --git a/mace/kernels/opencl/image/lstm_cell.h b/mace/ops/opencl/image/lstm_cell.h
similarity index 94%
rename from mace/kernels/opencl/image/lstm_cell.h
rename to mace/ops/opencl/image/lstm_cell.h
index 00b07356..967c4bf4 100644
--- a/mace/kernels/opencl/image/lstm_cell.h
+++ b/mace/ops/opencl/image/lstm_cell.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_
-#define MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_LSTM_CELL_H_
+#define MACE_OPS_OPENCL_IMAGE_LSTM_CELL_H_
 
-#include "mace/kernels/opencl/lstm_cell.h"
+#include "mace/ops/opencl/lstm_cell.h"
 
 #include <memory>
 #include <vector>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -135,7 +135,7 @@ MaceStatus LSTMCellKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_LSTM_CELL_H_
diff --git a/mace/kernels/opencl/image/matmul.h b/mace/ops/opencl/image/matmul.h
similarity index 93%
rename from mace/kernels/opencl/image/matmul.h
rename to mace/ops/opencl/image/matmul.h
index aa688646..899df5a5 100644
--- a/mace/kernels/opencl/image/matmul.h
+++ b/mace/ops/opencl/image/matmul.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_
-#define MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_MATMUL_H_
+#define MACE_OPS_OPENCL_IMAGE_MATMUL_H_
 
-#include "mace/kernels/opencl/matmul.h"
+#include "mace/ops/opencl/matmul.h"
 
 #include <functional>
 #include <memory>
@@ -24,10 +24,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -122,7 +122,7 @@ MaceStatus MatMulKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_MATMUL_H_
diff --git a/mace/kernels/opencl/image/pad.h b/mace/ops/opencl/image/pad.h
similarity index 94%
rename from mace/kernels/opencl/image/pad.h
rename to mace/ops/opencl/image/pad.h
index b9673e9e..c96d964a 100644
--- a/mace/kernels/opencl/image/pad.h
+++ b/mace/ops/opencl/image/pad.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_PAD_H_
-#define MACE_KERNELS_OPENCL_IMAGE_PAD_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_PAD_H_
+#define MACE_OPS_OPENCL_IMAGE_PAD_H_
 
-#include "mace/kernels/opencl/pad.h"
+#include "mace/ops/opencl/pad.h"
 
 #include <memory>
 #include <vector>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -130,7 +130,7 @@ MaceStatus PadKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_PAD_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_PAD_H_
diff --git a/mace/kernels/opencl/image/pooling.h b/mace/ops/opencl/image/pooling.h
similarity index 95%
rename from mace/kernels/opencl/image/pooling.h
rename to mace/ops/opencl/image/pooling.h
index 769f3cf8..1384b54b 100644
--- a/mace/kernels/opencl/image/pooling.h
+++ b/mace/ops/opencl/image/pooling.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_POOLING_H_
-#define MACE_KERNELS_OPENCL_IMAGE_POOLING_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_POOLING_H_
+#define MACE_OPS_OPENCL_IMAGE_POOLING_H_
 
-#include "mace/kernels/opencl/pooling.h"
+#include "mace/ops/opencl/pooling.h"
 
 #include <algorithm>
 #include <memory>
@@ -24,10 +24,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 namespace pooling {
@@ -97,7 +97,7 @@ MaceStatus PoolingKernel<T>::Compute(
 
   std::vector<int> paddings(2);
   if (padding_data.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
+    ops::CalcNHWCPaddingAndOutputSize(
         input->shape().data(), filter_shape.data(), dilations, strides,
         padding_type, output_shape.data(), paddings.data());
   } else {
@@ -181,7 +181,7 @@ MaceStatus PoolingKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_POOLING_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_POOLING_H_
diff --git a/mace/kernels/opencl/image/reduce_mean.h b/mace/ops/opencl/image/reduce_mean.h
similarity index 95%
rename from mace/kernels/opencl/image/reduce_mean.h
rename to mace/ops/opencl/image/reduce_mean.h
index 7d7c5fba..ca5daa5f 100644
--- a/mace/kernels/opencl/image/reduce_mean.h
+++ b/mace/ops/opencl/image/reduce_mean.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_
-#define MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_REDUCE_MEAN_H_
+#define MACE_OPS_OPENCL_IMAGE_REDUCE_MEAN_H_
 
-#include "mace/kernels/opencl/reduce_mean.h"
+#include "mace/ops/opencl/reduce_mean.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -171,7 +171,7 @@ MaceStatus ReduceMeanKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_REDUCE_MEAN_H_
diff --git a/mace/kernels/opencl/image/resize_bicubic.h b/mace/ops/opencl/image/resize_bicubic.h
similarity index 91%
rename from mace/kernels/opencl/image/resize_bicubic.h
rename to mace/ops/opencl/image/resize_bicubic.h
index 20d062ac..e801e59f 100644
--- a/mace/kernels/opencl/image/resize_bicubic.h
+++ b/mace/ops/opencl/image/resize_bicubic.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
-#define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
+#define MACE_OPS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
 
-#include "mace/kernels/opencl/resize_bicubic.h"
+#include "mace/ops/opencl/resize_bicubic.h"
 
 #include <algorithm>
 #include <memory>
@@ -24,11 +24,11 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/kernels/resize_bicubic.h"
+#include "mace/ops/opencl/helper.h"
+#include "mace/ops/resize_bicubic.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 namespace resize_bicubic {
@@ -116,7 +116,7 @@ MaceStatus ResizeBicubicKernel<T>::Compute(
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
     built_options.emplace(
         MakeString("-DTABLE_SIZE=",
-                   mace::kernels::resize_bicubic::kTableSize));
+                   mace::ops::resize_bicubic::kTableSize));
     MACE_RETURN_IF_ERROR(
         runtime->BuildKernel("resize_bicubic",
                              kernel_name,
@@ -138,10 +138,10 @@ MaceStatus ResizeBicubicKernel<T>::Compute(
     MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
     float height_scale =
-        mace::kernels::resize_bicubic::CalculateResizeScale(
+        mace::ops::resize_bicubic::CalculateResizeScale(
             in_height, out_height, align_corners_);
     float width_scale =
-        mace::kernels::resize_bicubic::CalculateResizeScale(
+        mace::ops::resize_bicubic::CalculateResizeScale(
             in_width, out_width, align_corners_);
 
     uint32_t idx = 0;
@@ -172,7 +172,7 @@ MaceStatus ResizeBicubicKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
diff --git a/mace/kernels/opencl/image/resize_bilinear.h b/mace/ops/opencl/image/resize_bilinear.h
similarity index 92%
rename from mace/kernels/opencl/image/resize_bilinear.h
rename to mace/ops/opencl/image/resize_bilinear.h
index d34b7d50..7af9a5f6 100644
--- a/mace/kernels/opencl/image/resize_bilinear.h
+++ b/mace/ops/opencl/image/resize_bilinear.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
-#define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
+#define MACE_OPS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
 
-#include "mace/kernels/opencl/resize_bilinear.h"
+#include "mace/ops/opencl/resize_bilinear.h"
 
 #include <algorithm>
 #include <memory>
@@ -24,11 +24,11 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/kernels/resize_bilinear.h"
+#include "mace/ops/opencl/helper.h"
+#include "mace/ops/resize_bilinear.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 namespace resize_bilinear {
@@ -139,11 +139,11 @@ MaceStatus ResizeBilinearKernel<T>::Compute(
     MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
     float height_scale =
-        mace::kernels::resize_bilinear::CalculateResizeScale(in_height,
+        mace::ops::resize_bilinear::CalculateResizeScale(in_height,
                                                              out_height,
                                                              align_corners_);
     float width_scale =
-        mace::kernels::resize_bilinear::CalculateResizeScale(in_width,
+        mace::ops::resize_bilinear::CalculateResizeScale(in_width,
                                                              out_width,
                                                              align_corners_);
 
@@ -175,7 +175,7 @@ MaceStatus ResizeBilinearKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
diff --git a/mace/kernels/opencl/image/softmax.h b/mace/ops/opencl/image/softmax.h
similarity index 94%
rename from mace/kernels/opencl/image/softmax.h
rename to mace/ops/opencl/image/softmax.h
index cf2dd5b4..ffd5ec89 100644
--- a/mace/kernels/opencl/image/softmax.h
+++ b/mace/ops/opencl/image/softmax.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
-#define MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_SOFTMAX_H_
+#define MACE_OPS_OPENCL_IMAGE_SOFTMAX_H_
 
-#include "mace/kernels/opencl/softmax.h"
+#include "mace/ops/opencl/softmax.h"
 
 #include <algorithm>
 #include <memory>
@@ -24,10 +24,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 namespace softmax {
@@ -145,7 +145,7 @@ MaceStatus SoftmaxKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_SOFTMAX_H_
diff --git a/mace/kernels/opencl/image/space_to_batch.h b/mace/ops/opencl/image/space_to_batch.h
similarity index 93%
rename from mace/kernels/opencl/image/space_to_batch.h
rename to mace/ops/opencl/image/space_to_batch.h
index 0a20e6f6..9924f02f 100644
--- a/mace/kernels/opencl/image/space_to_batch.h
+++ b/mace/ops/opencl/image/space_to_batch.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
-#define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
+#define MACE_OPS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
 
-#include "mace/kernels/opencl/space_to_batch.h"
+#include "mace/ops/opencl/space_to_batch.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -122,7 +122,7 @@ MaceStatus SpaceToBatchKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
diff --git a/mace/kernels/opencl/image/space_to_depth.h b/mace/ops/opencl/image/space_to_depth.h
similarity index 94%
rename from mace/kernels/opencl/image/space_to_depth.h
rename to mace/ops/opencl/image/space_to_depth.h
index 2e3f2a74..961d1606 100644
--- a/mace/kernels/opencl/image/space_to_depth.h
+++ b/mace/ops/opencl/image/space_to_depth.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
-#define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
+#define MACE_OPS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
 
-#include "mace/kernels/opencl/space_to_depth.h"
+#include "mace/ops/opencl/space_to_depth.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -132,7 +132,7 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
diff --git a/mace/kernels/opencl/image/split.h b/mace/ops/opencl/image/split.h
similarity index 95%
rename from mace/kernels/opencl/image/split.h
rename to mace/ops/opencl/image/split.h
index ee7fab71..12286a6d 100644
--- a/mace/kernels/opencl/image/split.h
+++ b/mace/ops/opencl/image/split.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
-#define MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_SPLIT_H_
+#define MACE_OPS_OPENCL_IMAGE_SPLIT_H_
 
-#include "mace/kernels/opencl/split.h"
+#include "mace/ops/opencl/split.h"
 
 #include <algorithm>
 #include <memory>
@@ -24,10 +24,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -146,7 +146,7 @@ MaceStatus SplitKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_SPLIT_H_
diff --git a/mace/kernels/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h
similarity index 95%
rename from mace/kernels/opencl/image/sqrdiff_mean.h
rename to mace/ops/opencl/image/sqrdiff_mean.h
index 3d86b05d..791566f8 100644
--- a/mace/kernels/opencl/image/sqrdiff_mean.h
+++ b/mace/ops/opencl/image/sqrdiff_mean.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_SQRDIFF_MEAN_H_
-#define MACE_KERNELS_OPENCL_IMAGE_SQRDIFF_MEAN_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_SQRDIFF_MEAN_H_
+#define MACE_OPS_OPENCL_IMAGE_SQRDIFF_MEAN_H_
 
-#include "mace/kernels/opencl/sqrdiff_mean.h"
+#include "mace/ops/opencl/sqrdiff_mean.h"
 
 #include <memory>
 #include <set>
@@ -23,10 +23,10 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -168,7 +168,7 @@ MaceStatus SqrDiffMeanKernel<T>::Compute(
 
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_SQRDIFF_MEAN_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_SQRDIFF_MEAN_H_
diff --git a/mace/kernels/opencl/image/winograd_transform.h b/mace/ops/opencl/image/winograd_transform.h
similarity index 96%
rename from mace/kernels/opencl/image/winograd_transform.h
rename to mace/ops/opencl/image/winograd_transform.h
index f00e5556..2d9b6c0a 100644
--- a/mace/kernels/opencl/image/winograd_transform.h
+++ b/mace/ops/opencl/image/winograd_transform.h
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
-#define MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
+#ifndef MACE_OPS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
+#define MACE_OPS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
 
-#include "mace/kernels/opencl/winograd_transform.h"
+#include "mace/ops/opencl/winograd_transform.h"
 
 #include <memory>
 #include <vector>
@@ -23,12 +23,12 @@
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/conv_pool_2d_util.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/conv_pool_2d_util.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace opencl {
 namespace image {
 
@@ -103,7 +103,7 @@ MaceStatus WinogradTransformKernel<T>::Compute(
   std::vector<index_t> filter_shape = {1, input_tensor->dim(3), 3, 3};
   std::vector<int> paddings(2);
   if (paddings_.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
+    ops::CalcNHWCPaddingAndOutputSize(
         input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
         strides_.data(), padding_type_, output_shape.data(), paddings.data());
   } else {
@@ -319,7 +319,7 @@ MaceStatus WinogradInverseTransformKernel<T>::Compute(
 }
 }  // namespace image
 }  // namespace opencl
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
+#endif  // MACE_OPS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
diff --git a/mace/kernels/opencl/lstm_cell.h b/mace/ops/opencl/lstm_cell.h
similarity index 86%
rename from mace/kernels/opencl/lstm_cell.h
rename to mace/ops/opencl/lstm_cell.h
index 0ce1d26f..611201be 100644
--- a/mace/kernels/opencl/lstm_cell.h
+++ b/mace/ops/opencl/lstm_cell.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_LSTM_CELL_H_
-#define MACE_KERNELS_OPENCL_LSTM_CELL_H_
+#ifndef MACE_OPS_OPENCL_LSTM_CELL_H_
+#define MACE_OPS_OPENCL_LSTM_CELL_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -23,7 +23,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLLSTMCellKernel {
  public:
   virtual MaceStatus Compute(
@@ -38,7 +38,7 @@ class OpenCLLSTMCellKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLLSTMCellKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_LSTM_CELL_H_
+#endif  // MACE_OPS_OPENCL_LSTM_CELL_H_
diff --git a/mace/kernels/opencl/matmul.h b/mace/ops/opencl/matmul.h
similarity index 86%
rename from mace/kernels/opencl/matmul.h
rename to mace/ops/opencl/matmul.h
index e971328e..b23a0c40 100644
--- a/mace/kernels/opencl/matmul.h
+++ b/mace/ops/opencl/matmul.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_MATMUL_H_
-#define MACE_KERNELS_OPENCL_MATMUL_H_
+#ifndef MACE_OPS_OPENCL_MATMUL_H_
+#define MACE_OPS_OPENCL_MATMUL_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -23,7 +23,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLMatMulKernel {
  public:
   virtual MaceStatus Compute(
@@ -36,7 +36,7 @@ class OpenCLMatMulKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLMatMulKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_MATMUL_H_
+#endif  // MACE_OPS_OPENCL_MATMUL_H_
diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/ops/opencl/out_of_range_check_test.cc
similarity index 98%
rename from mace/kernels/opencl/out_of_range_check_test.cc
rename to mace/ops/opencl/out_of_range_check_test.cc
index 957026b2..f63d1db0 100644
--- a/mace/kernels/opencl/out_of_range_check_test.cc
+++ b/mace/ops/opencl/out_of_range_check_test.cc
@@ -21,10 +21,10 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/ops/opencl/helper.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace {
 
 MaceStatus BufferToImageOpImpl(OpContext *context,
@@ -160,5 +160,5 @@ TEST(OutOfRangeCheckTest, RandomTest) {
                   != MaceStatus::MACE_SUCCESS);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/opencl/pad.h b/mace/ops/opencl/pad.h
similarity index 86%
rename from mace/kernels/opencl/pad.h
rename to mace/ops/opencl/pad.h
index ec91a446..9c68cee0 100644
--- a/mace/kernels/opencl/pad.h
+++ b/mace/ops/opencl/pad.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_PAD_H_
-#define MACE_KERNELS_OPENCL_PAD_H_
+#ifndef MACE_OPS_OPENCL_PAD_H_
+#define MACE_OPS_OPENCL_PAD_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -22,7 +22,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLPadKernel {
  public:
   virtual MaceStatus Compute(
@@ -32,7 +32,7 @@ class OpenCLPadKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLPadKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_PAD_H_
+#endif  // MACE_OPS_OPENCL_PAD_H_
diff --git a/mace/kernels/opencl/pooling.h b/mace/ops/opencl/pooling.h
similarity index 82%
rename from mace/kernels/opencl/pooling.h
rename to mace/ops/opencl/pooling.h
index ce3c8b54..fc41a474 100644
--- a/mace/kernels/opencl/pooling.h
+++ b/mace/ops/opencl/pooling.h
@@ -12,19 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_POOLING_H_
-#define MACE_KERNELS_OPENCL_POOLING_H_
+#ifndef MACE_OPS_OPENCL_POOLING_H_
+#define MACE_OPS_OPENCL_POOLING_H_
 
 #include <vector>
 
-#include "mace/kernels/pooling.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/pooling.h"
+#include "mace/ops/conv_pool_2d_util.h"
 
 namespace mace {
 
 class OpContext;
 class Tensor;
-namespace kernels {
+namespace ops {
 class OpenCLPoolingKernel {
  public:
   virtual MaceStatus Compute(
@@ -40,7 +40,7 @@ class OpenCLPoolingKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLPoolingKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_POOLING_H_
+#endif  // MACE_OPS_OPENCL_POOLING_H_
diff --git a/mace/kernels/opencl/reduce_mean.h b/mace/ops/opencl/reduce_mean.h
similarity index 84%
rename from mace/kernels/opencl/reduce_mean.h
rename to mace/ops/opencl/reduce_mean.h
index 1960aac5..9e279a2a 100644
--- a/mace/kernels/opencl/reduce_mean.h
+++ b/mace/ops/opencl/reduce_mean.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_REDUCE_MEAN_H_
-#define MACE_KERNELS_OPENCL_REDUCE_MEAN_H_
+#ifndef MACE_OPS_OPENCL_REDUCE_MEAN_H_
+#define MACE_OPS_OPENCL_REDUCE_MEAN_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -23,7 +23,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLReduceMeanKernel {
  public:
   virtual MaceStatus Compute(
@@ -33,7 +33,7 @@ class OpenCLReduceMeanKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLReduceMeanKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_REDUCE_MEAN_H_
+#endif  // MACE_OPS_OPENCL_REDUCE_MEAN_H_
diff --git a/mace/kernels/opencl/resize_bicubic.h b/mace/ops/opencl/resize_bicubic.h
similarity index 84%
rename from mace/kernels/opencl/resize_bicubic.h
rename to mace/ops/opencl/resize_bicubic.h
index bfb6f8b5..2d7d3be1 100644
--- a/mace/kernels/opencl/resize_bicubic.h
+++ b/mace/ops/opencl/resize_bicubic.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_RESIZE_BICUBIC_H_
-#define MACE_KERNELS_OPENCL_RESIZE_BICUBIC_H_
+#ifndef MACE_OPS_OPENCL_RESIZE_BICUBIC_H_
+#define MACE_OPS_OPENCL_RESIZE_BICUBIC_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -24,7 +24,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLResizeBicubicKernel {
  public:
   virtual MaceStatus Compute(
@@ -33,7 +33,7 @@ class OpenCLResizeBicubicKernel {
       Tensor *output) = 0;
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLResizeBicubicKernel);
 };
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_RESIZE_BICUBIC_H_
+#endif  // MACE_OPS_OPENCL_RESIZE_BICUBIC_H_
diff --git a/mace/kernels/opencl/resize_bilinear.h b/mace/ops/opencl/resize_bilinear.h
similarity index 84%
rename from mace/kernels/opencl/resize_bilinear.h
rename to mace/ops/opencl/resize_bilinear.h
index f60fb282..37d60a72 100644
--- a/mace/kernels/opencl/resize_bilinear.h
+++ b/mace/ops/opencl/resize_bilinear.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_RESIZE_BILINEAR_H_
-#define MACE_KERNELS_OPENCL_RESIZE_BILINEAR_H_
+#ifndef MACE_OPS_OPENCL_RESIZE_BILINEAR_H_
+#define MACE_OPS_OPENCL_RESIZE_BILINEAR_H_
 
 #include "mace/core/types.h"
 #include "mace/public/mace.h"
@@ -24,7 +24,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLResizeBilinearKernel {
  public:
   virtual MaceStatus Compute(
@@ -33,7 +33,7 @@ class OpenCLResizeBilinearKernel {
       Tensor *output) = 0;
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLResizeBilinearKernel);
 };
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_RESIZE_BILINEAR_H_
+#endif  // MACE_OPS_OPENCL_RESIZE_BILINEAR_H_
diff --git a/mace/kernels/opencl/softmax.h b/mace/ops/opencl/softmax.h
similarity index 85%
rename from mace/kernels/opencl/softmax.h
rename to mace/ops/opencl/softmax.h
index 308b606e..958cae36 100644
--- a/mace/kernels/opencl/softmax.h
+++ b/mace/ops/opencl/softmax.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_SOFTMAX_H_
-#define MACE_KERNELS_OPENCL_SOFTMAX_H_
+#ifndef MACE_OPS_OPENCL_SOFTMAX_H_
+#define MACE_OPS_OPENCL_SOFTMAX_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -23,7 +23,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLSoftmaxKernel {
  public:
   virtual MaceStatus Compute(
@@ -33,7 +33,7 @@ class OpenCLSoftmaxKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSoftmaxKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_SOFTMAX_H_
+#endif  // MACE_OPS_OPENCL_SOFTMAX_H_
diff --git a/mace/kernels/opencl/space_to_batch.h b/mace/ops/opencl/space_to_batch.h
similarity index 86%
rename from mace/kernels/opencl/space_to_batch.h
rename to mace/ops/opencl/space_to_batch.h
index 22d308ac..0b530ab6 100644
--- a/mace/kernels/opencl/space_to_batch.h
+++ b/mace/ops/opencl/space_to_batch.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
-#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
+#ifndef MACE_OPS_OPENCL_SPACE_TO_BATCH_H_
+#define MACE_OPS_OPENCL_SPACE_TO_BATCH_H_
 
 #include <vector>
 
@@ -26,7 +26,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLSpaceToBatchKernel {
  public:
   virtual MaceStatus Compute(
@@ -39,7 +39,7 @@ class OpenCLSpaceToBatchKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSpaceToBatchKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
+#endif  // MACE_OPS_OPENCL_SPACE_TO_BATCH_H_
diff --git a/mace/kernels/opencl/space_to_depth.h b/mace/ops/opencl/space_to_depth.h
similarity index 83%
rename from mace/kernels/opencl/space_to_depth.h
rename to mace/ops/opencl/space_to_depth.h
index ea6b16c1..44d04975 100644
--- a/mace/kernels/opencl/space_to_depth.h
+++ b/mace/ops/opencl/space_to_depth.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_SPACE_TO_DEPTH_H_
-#define MACE_KERNELS_OPENCL_SPACE_TO_DEPTH_H_
+#ifndef MACE_OPS_OPENCL_SPACE_TO_DEPTH_H_
+#define MACE_OPS_OPENCL_SPACE_TO_DEPTH_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -23,7 +23,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLSpaceToDepthKernel {
  public:
   virtual MaceStatus Compute(
@@ -33,7 +33,7 @@ class OpenCLSpaceToDepthKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSpaceToDepthKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_SPACE_TO_DEPTH_H_
+#endif  // MACE_OPS_OPENCL_SPACE_TO_DEPTH_H_
diff --git a/mace/kernels/opencl/split.h b/mace/ops/opencl/split.h
similarity index 86%
rename from mace/kernels/opencl/split.h
rename to mace/ops/opencl/split.h
index c5cacd6f..3daae7d7 100644
--- a/mace/kernels/opencl/split.h
+++ b/mace/ops/opencl/split.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_SPLIT_H_
-#define MACE_KERNELS_OPENCL_SPLIT_H_
+#ifndef MACE_OPS_OPENCL_SPLIT_H_
+#define MACE_OPS_OPENCL_SPLIT_H_
 
 #include <vector>
 
@@ -25,7 +25,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLSplitKernel {
  public:
   virtual MaceStatus Compute(
@@ -35,7 +35,7 @@ class OpenCLSplitKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSplitKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_SPLIT_H_
+#endif  // MACE_OPS_OPENCL_SPLIT_H_
diff --git a/mace/kernels/opencl/sqrdiff_mean.h b/mace/ops/opencl/sqrdiff_mean.h
similarity index 84%
rename from mace/kernels/opencl/sqrdiff_mean.h
rename to mace/ops/opencl/sqrdiff_mean.h
index c2d5d197..2814aa6d 100644
--- a/mace/kernels/opencl/sqrdiff_mean.h
+++ b/mace/ops/opencl/sqrdiff_mean.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_SQRDIFF_MEAN_H_
-#define MACE_KERNELS_OPENCL_SQRDIFF_MEAN_H_
+#ifndef MACE_OPS_OPENCL_SQRDIFF_MEAN_H_
+#define MACE_OPS_OPENCL_SQRDIFF_MEAN_H_
 
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
@@ -22,7 +22,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 class OpenCLSqrDiffMeanKernel {
  public:
   virtual MaceStatus Compute(
@@ -33,7 +33,7 @@ class OpenCLSqrDiffMeanKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSqrDiffMeanKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_SQRDIFF_MEAN_H_
+#endif  // MACE_OPS_OPENCL_SQRDIFF_MEAN_H_
diff --git a/mace/kernels/opencl/winograd_transform.h b/mace/ops/opencl/winograd_transform.h
similarity index 86%
rename from mace/kernels/opencl/winograd_transform.h
rename to mace/ops/opencl/winograd_transform.h
index d706e89b..f150481a 100644
--- a/mace/kernels/opencl/winograd_transform.h
+++ b/mace/ops/opencl/winograd_transform.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPENCL_WINOGRAD_TRANSFORM_H_
-#define MACE_KERNELS_OPENCL_WINOGRAD_TRANSFORM_H_
+#ifndef MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
+#define MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
 
 #include <vector>
 
@@ -24,7 +24,7 @@ namespace mace {
 class OpContext;
 class Tensor;
 
-namespace kernels {
+namespace ops {
 
 class OpenCLWinogradTransformKernel {
  public:
@@ -44,7 +44,7 @@ class OpenCLWinogradInverseTransformKernel {
   MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradInverseTransformKernel);
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPENCL_WINOGRAD_TRANSFORM_H_
+#endif  // MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
diff --git a/mace/ops/ops_def_register.cc b/mace/ops/ops_def_register.cc
deleted file mode 100644
index 46ee5184..00000000
--- a/mace/ops/ops_def_register.cc
+++ /dev/null
@@ -1,373 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/ops/ops_def_register.h"
-
-#include <vector>
-
-namespace mace {
-namespace ops {
-
-void RegisterOpDefs(OpDefRegistryBase *op_def_registry) {
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Activation")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("AddN")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("ArgMax")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("BatchNorm")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("BatchToSpaceND")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("BiasAdd")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("BufferInverseTransform")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("BufferTransform")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Cast")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("ChannelShuffle")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Concat")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Conv2D")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Crop")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Deconv2D")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("DepthToSpace")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("DepthwiseConv2d")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Dequantize")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Eltwise")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("ExpandDims")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Fill")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("FullyConnected")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Gather")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Identity")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("InferConv2dShape")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("LocalResponseNorm")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("LSTMCell")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("MatMul")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Pad")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Pooling")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Quantize")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("ReduceMean")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Reshape")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("ResizeBicubic")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("ResizeBilinear")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Reverse")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("ScalarMath")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Shape")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Softmax")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("SpaceToBatchND")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("SpaceToDepth")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Split")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("SqrDiffMean")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Squeeze")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Stack")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("StridedSlice")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Transpose")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("Unstack")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::CPU, DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("WinogradInverseTransform")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::GPU};
-          }));
-
-  MACE_REGISTER_OP_DEF(
-      op_def_registry,
-      OpRegistrationBuilder("WinogradTransform")
-          .SetDevicePlaceFunc([]() -> std::vector<DeviceType> {
-            return {DeviceType::GPU};
-          }));
-}
-}  // namespace ops
-
-
-OpDefRegistry::OpDefRegistry() : OpDefRegistryBase() {
-  ops::RegisterOpDefs(this);
-}
-
-}  // namespace mace
diff --git a/mace/ops/ops_def_register.h b/mace/ops/ops_def_register.h
deleted file mode 100644
index 5b2d6acb..00000000
--- a/mace/ops/ops_def_register.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_OPS_OPS_DEF_REGISTER_H_
-#define MACE_OPS_OPS_DEF_REGISTER_H_
-
-#include "mace/core/op_def_registry.h"
-
-namespace mace {
-
-class OpDefRegistry : public OpDefRegistryBase {
- public:
-  OpDefRegistry();
-  ~OpDefRegistry() override = default;
-};
-
-}  // namespace mace
-
-#endif  // MACE_OPS_OPS_DEF_REGISTER_H_
diff --git a/mace/kernels/ops_register.cc b/mace/ops/ops_registry.cc
similarity index 67%
rename from mace/kernels/ops_register.cc
rename to mace/ops/ops_registry.cc
index 4dba8910..e330d66a 100644
--- a/mace/kernels/ops_register.cc
+++ b/mace/ops/ops_registry.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/ops_register.h"
+#include "mace/ops/ops_registry.h"
 
 namespace mace {
 
-namespace kernels {
+namespace ops {
 // Keep in lexicographical order
 extern void RegisterActivation(OpRegistryBase *op_registry);
 extern void RegisterAddN(OpRegistryBase *op_registry);
@@ -62,70 +62,70 @@ extern void RegisterStack(OpRegistryBase *op_registry);
 extern void RegisterStridedSlice(OpRegistryBase *op_registry);
 extern void RegisterTranspose(OpRegistryBase *op_registry);
 extern void RegisterUnstack(OpRegistryBase *op_registry);
+
 #ifdef MACE_ENABLE_OPENCL
 extern void RegisterBufferTransform(OpRegistryBase *op_registry);
 extern void RegisterBufferInverseTransform(OpRegistryBase *op_registry);
 extern void RegisterLSTMCell(OpRegistryBase *op_registry);
 extern void RegisterWinogradInverseTransform(OpRegistryBase *op_registry);
 extern void RegisterWinogradTransform(OpRegistryBase *op_registry);
-
 #endif  // MACE_ENABLE_OPENCL
-}  // namespace kernels
+}  // namespace ops
 
 
 OpRegistry::OpRegistry() : OpRegistryBase() {
   // Keep in lexicographical order
-  kernels::RegisterActivation(this);
-  kernels::RegisterAddN(this);
-  kernels::RegisterArgMax(this);
-  kernels::RegisterBatchNorm(this);
-  kernels::RegisterBatchToSpaceND(this);
-  kernels::RegisterBiasAdd(this);
-  kernels::RegisterCast(this);
-  kernels::RegisterChannelShuffle(this);
-  kernels::RegisterConcat(this);
-  kernels::RegisterConv2D(this);
-  kernels::RegisterCrop(this);
-  kernels::RegisterDeconv2D(this);
-  kernels::RegisterDepthToSpace(this);
-  kernels::RegisterDepthwiseConv2d(this);
-  kernels::RegisterDequantize(this);
-  kernels::RegisterEltwise(this);
-  kernels::RegisterExpandDims(this);
-  kernels::RegisterFill(this);
-  kernels::RegisterFullyConnected(this);
-  kernels::RegisterGather(this);
-  kernels::RegisterIdentity(this);
-  kernels::RegisterInferConv2dShape(this);
-  kernels::RegisterLocalResponseNorm(this);
-  kernels::RegisterMatMul(this);
-  kernels::RegisterPad(this);
-  kernels::RegisterPooling(this);
-  kernels::RegisterQuantize(this);
-  kernels::RegisterReduceMean(this);
-  kernels::RegisterReshape(this);
-  kernels::RegisterResizeBicubic(this);
-  kernels::RegisterResizeBilinear(this);
-  kernels::RegisterReverse(this);
-  kernels::RegisterScalarMath(this);
-  kernels::RegisterShape(this);
-  kernels::RegisterSoftmax(this);
-  kernels::RegisterSpaceToBatchND(this);
-  kernels::RegisterSpaceToDepth(this);
-  kernels::RegisterSplit(this);
-  kernels::RegisterStack(this);
-  kernels::RegisterStridedSlice(this);
-  kernels::RegisterSqrDiffMean(this);
-  kernels::RegisterSqueeze(this);
-  kernels::RegisterTranspose(this);
-  kernels::RegisterUnstack(this);
-#ifdef MACE_ENABLE_OPENCL
-  kernels::RegisterBufferTransform(this);
-  kernels::RegisterBufferInverseTransform(this);
-  kernels::RegisterLSTMCell(this);
-  kernels::RegisterWinogradInverseTransform(this);
-  kernels::RegisterWinogradTransform(this);
+  ops::RegisterActivation(this);
+  ops::RegisterAddN(this);
+  ops::RegisterArgMax(this);
+  ops::RegisterBatchNorm(this);
+  ops::RegisterBatchToSpaceND(this);
+  ops::RegisterBiasAdd(this);
+  ops::RegisterCast(this);
+  ops::RegisterChannelShuffle(this);
+  ops::RegisterConcat(this);
+  ops::RegisterConv2D(this);
+  ops::RegisterCrop(this);
+  ops::RegisterDeconv2D(this);
+  ops::RegisterDepthToSpace(this);
+  ops::RegisterDepthwiseConv2d(this);
+  ops::RegisterDequantize(this);
+  ops::RegisterEltwise(this);
+  ops::RegisterExpandDims(this);
+  ops::RegisterFill(this);
+  ops::RegisterFullyConnected(this);
+  ops::RegisterGather(this);
+  ops::RegisterIdentity(this);
+  ops::RegisterInferConv2dShape(this);
+  ops::RegisterLocalResponseNorm(this);
+  ops::RegisterMatMul(this);
+  ops::RegisterPad(this);
+  ops::RegisterPooling(this);
+  ops::RegisterQuantize(this);
+  ops::RegisterReduceMean(this);
+  ops::RegisterReshape(this);
+  ops::RegisterResizeBicubic(this);
+  ops::RegisterResizeBilinear(this);
+  ops::RegisterReverse(this);
+  ops::RegisterScalarMath(this);
+  ops::RegisterShape(this);
+  ops::RegisterSoftmax(this);
+  ops::RegisterSpaceToBatchND(this);
+  ops::RegisterSpaceToDepth(this);
+  ops::RegisterSplit(this);
+  ops::RegisterSqrDiffMean(this);
+  ops::RegisterSqueeze(this);
+  ops::RegisterStack(this);
+  ops::RegisterStridedSlice(this);
+  ops::RegisterTranspose(this);
+  ops::RegisterUnstack(this);
 
+#ifdef MACE_ENABLE_OPENCL
+  ops::RegisterBufferInverseTransform(this);
+  ops::RegisterBufferTransform(this);
+  ops::RegisterLSTMCell(this);
+  ops::RegisterWinogradInverseTransform(this);
+  ops::RegisterWinogradTransform(this);
 #endif  // MACE_ENABLE_OPENCL
 }
 
diff --git a/mace/kernels/ops_register.h b/mace/ops/ops_registry.h
similarity index 87%
rename from mace/kernels/ops_register.h
rename to mace/ops/ops_registry.h
index e3576adb..beb17c37 100644
--- a/mace/kernels/ops_register.h
+++ b/mace/ops/ops_registry.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_OPS_REGISTER_H_
-#define MACE_KERNELS_OPS_REGISTER_H_
+#ifndef MACE_OPS_OPS_REGISTRY_H_
+#define MACE_OPS_OPS_REGISTRY_H_
 
 #include "mace/core/operator.h"
 
@@ -27,4 +27,4 @@ class OpRegistry : public OpRegistryBase {
 
 }  // namespace mace
 
-#endif  // MACE_KERNELS_OPS_REGISTER_H_
+#endif  // MACE_OPS_OPS_REGISTRY_H_
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 4823bd80..5bf842f3 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -31,9 +31,8 @@
 #include "mace/core/runtime/opencl/gpu_device.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
-#include "mace/kernels/opencl/common.h"
-#include "mace/kernels/ops_register.h"
-#include "mace/ops/ops_def_register.h"
+#include "mace/ops/opencl/common.h"
+#include "mace/ops/ops_registry.h"
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
 #include "mace/utils/quantize.h"
@@ -141,7 +140,6 @@ class OpTestContext {
 class OpsTestNet {
  public:
   OpsTestNet() :
-    op_def_registry_(new OpDefRegistry()),
     op_registry_(new OpRegistry()) {}
 
   template <DeviceType D, typename T>
@@ -455,10 +453,8 @@ class OpsTestNet {
     NetDef net_def;
     for (auto &op_def_ : op_defs_) {
       net_def.add_op()->CopyFrom(op_def_);
-      net_def.add_op_types(op_def_.type());
     }
     net_ = std::unique_ptr<NetBase>(new SerialNet(
-        op_def_registry_.get(),
         op_registry_.get(),
         &net_def,
         &ws_,
@@ -502,7 +498,6 @@ class OpsTestNet {
   MaceStatus RunNet(const NetDef &net_def, const DeviceType device) {
     device_type_ = device;
     auto net = std::unique_ptr<NetBase>(new SerialNet(
-        op_def_registry_.get(),
         op_registry_.get(),
         &net_def,
         &ws_,
@@ -511,7 +506,6 @@ class OpsTestNet {
     MACE_RETURN_IF_ERROR(net->Init());
     MACE_RETURN_IF_ERROR(net->Run());
     net_ = std::unique_ptr<NetBase>(new SerialNet(
-        op_def_registry_.get(),
         op_registry_.get(),
         &net_def,
         &ws_,
@@ -538,7 +532,6 @@ class OpsTestNet {
   }
 
  public:
-  std::shared_ptr<OpDefRegistryBase> op_def_registry_;
   std::shared_ptr<OpRegistryBase> op_registry_;
   Workspace ws_;
   std::vector<OperatorDef> op_defs_;
@@ -784,7 +777,7 @@ template <DeviceType D, typename T>
 void BufferToImage(OpsTestNet *net,
                    const std::string &input_name,
                    const std::string &output_name,
-                   const kernels::BufferType type,
+                   const ops::BufferType type,
                    const int wino_block_size = 2) {
   MACE_CHECK_NOTNULL(net);
 
@@ -806,7 +799,7 @@ template <DeviceType D, typename T>
 void ImageToBuffer(OpsTestNet *net,
                    const std::string &input_name,
                    const std::string &output_name,
-                   const kernels::BufferType type,
+                   const ops::BufferType type,
                    const int wino_block_size = 2) {
   MACE_CHECK_NOTNULL(net);
 
diff --git a/mace/kernels/pad.cc b/mace/ops/pad.cc
similarity index 98%
rename from mace/kernels/pad.cc
rename to mace/ops/pad.cc
index 9024eb0f..60bfbc19 100644
--- a/mace/kernels/pad.cc
+++ b/mace/ops/pad.cc
@@ -17,11 +17,11 @@
 
 #include "mace/core/operator.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/pad.h"
+#include "mace/ops/opencl/image/pad.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class PadOp;
@@ -126,5 +126,5 @@ void RegisterPad(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc
index ad8a1254..e295d2ad 100644
--- a/mace/ops/pad_benchmark.cc
+++ b/mace/ops/pad_benchmark.cc
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -35,7 +33,7 @@ void Pad(int iters, int batch, int height,
   const std::vector<int> paddings = {0, 0, pad, pad, pad, pad, 0, 0};
   if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("Pad", "PadTest")
         .Input("InputImage")
         .Output("OutputImage")
diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc
index a8c2267f..f0eece25 100644
--- a/mace/ops/pad_test.cc
+++ b/mace/ops/pad_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -31,7 +30,7 @@ void Simple() {
   net.AddRepeatedInput<D, float>("Input", {1, 2, 3, 1}, 2);
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("Pad", "PadTest")
         .Input("InputImage")
         .Output("OutputImage")
@@ -43,7 +42,7 @@ void Simple() {
     net.RunOp(D);
 
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else {
     net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
                                                     NCHW);
@@ -138,7 +137,7 @@ void Complex(const std::vector<index_t> &input_shape,
   expected->Copy(*net.GetOutput("Output"));
 
   BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
-                                    kernels::BufferType::IN_OUT_CHANNEL);
+                                    ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Pad", "PadTest")
       .Input("InputImage")
       .Output("OutputImage")
@@ -150,7 +149,7 @@ void Complex(const std::vector<index_t> &input_shape,
   net.RunOp(DeviceType::GPU);
 
   ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OpenCLOutput",
-                                        kernels::BufferType::IN_OUT_CHANNEL);
+                                        ops::BufferType::IN_OUT_CHANNEL);
 
   auto output = net.GetTensor("OpenCLOutput");
 
diff --git a/mace/kernels/pooling.cc b/mace/ops/pooling.cc
similarity index 98%
rename from mace/kernels/pooling.cc
rename to mace/ops/pooling.cc
index 07d41d11..e1800b87 100644
--- a/mace/kernels/pooling.cc
+++ b/mace/ops/pooling.cc
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif
 
-#include "mace/kernels/pooling.h"
+#include "mace/ops/pooling.h"
 
 #include <algorithm>
 #include <limits>
@@ -26,15 +26,15 @@
 #include "mace/core/future.h"
 #include "mace/core/operator.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/conv_pool_2d_base.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_base.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/pooling.h"
-#include "mace/kernels/opencl/buffer/pooling.h"
+#include "mace/ops/opencl/image/pooling.h"
+#include "mace/ops/opencl/buffer/pooling.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 class PoolingOpBase : public ConvPool2dOpBase {
  public:
@@ -72,7 +72,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
 
     std::vector<int> paddings(2);
     if (paddings_.empty()) {
-      kernels::CalcNCHWPaddingAndOutputSize(
+      ops::CalcNCHWPaddingAndOutputSize(
           input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
           strides_.data(), padding_type_, output_shape.data(), paddings.data());
     } else {
@@ -463,5 +463,5 @@ void RegisterPooling(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/pooling.h b/mace/ops/pooling.h
similarity index 87%
rename from mace/kernels/pooling.h
rename to mace/ops/pooling.h
index 9780907c..b974e826 100644
--- a/mace/kernels/pooling.h
+++ b/mace/ops/pooling.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_POOLING_H_
-#define MACE_KERNELS_POOLING_H_
+#ifndef MACE_OPS_POOLING_H_
+#define MACE_OPS_POOLING_H_
 
 
 namespace mace {
@@ -23,4 +23,4 @@ enum PoolingType {
 };
 }  // namespace mace
 
-#endif  // MACE_KERNELS_POOLING_H_
+#endif  // MACE_OPS_POOLING_H_
diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc
index e5199001..ff915ec0 100644
--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/kernels/conv_pool_2d_util.h"
-#include "mace/kernels/pooling.h"
+#include "mace/ops/conv_pool_2d_util.h"
+#include "mace/ops/pooling.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -66,7 +65,7 @@ void Pooling(int iters,
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Pooling", "PoolingTest")
       .Input("InputImage")
diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc
index 4cd432d5..99691db8 100644
--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "gtest/gtest.h"
+#include <vector>
 
-#include "mace/core/op_def_registry.h"
-#include "mace/kernels/pooling.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/pooling.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -192,7 +191,7 @@ void SimpleMaxPooling3S2() {
                                                     "Output", NHWC);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("Pooling", "PoolingTest")
         .Input("InputImage")
         .Output("OutputImage")
@@ -204,7 +203,7 @@ void SimpleMaxPooling3S2() {
         .Finalize(net.NewOperatorDef());
     net.RunOp(D);
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   }
 
   // Check
@@ -252,7 +251,7 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
   expected->Copy(*net.GetOutput("Output"));
 
   BufferToImage<D, T>(&net, "Input", "InputImage",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Pooling", "PoolingTest")
       .Input("InputImage")
       .Output("OutputImage")
@@ -265,7 +264,7 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
       .Finalize(net.NewOperatorDef());
   net.RunOp(D);
   ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
 
   if (DataTypeToEnum<T>::value == DT_HALF) {
     ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-3,
@@ -351,7 +350,7 @@ void SimpleAvgPoolingTest() {
       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
 
   BufferToImage<D, float>(&net, "Input", "InputImage",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Pooling", "PoolingTest")
       .Input("InputImage")
       .Output("OutputImage")
@@ -364,7 +363,7 @@ void SimpleAvgPoolingTest() {
   // Run
   net.RunOp(D);
   ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
 
   // Check
   auto expected = net.CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5});
@@ -410,7 +409,7 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
   expected->Copy(*net.GetOutput("Output"));
 
   BufferToImage<D, T>(&net, "Input", "InputImage",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Pooling", "PoolingTest")
       .Input("InputImage")
       .Output("OutputImage")
@@ -423,7 +422,7 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
       .Finalize(net.NewOperatorDef());
   net.RunOp(D);
   ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
 
   if (DataTypeToEnum<T>::value == DT_HALF) {
     ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-3,
diff --git a/mace/kernels/quantize.cc b/mace/ops/quantize.cc
similarity index 98%
rename from mace/kernels/quantize.cc
rename to mace/ops/quantize.cc
index 2fd9e7c3..3b2ea301 100644
--- a/mace/kernels/quantize.cc
+++ b/mace/ops/quantize.cc
@@ -22,7 +22,7 @@
 #include "mace/utils/quantize.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class QuantizeOp;
@@ -107,5 +107,5 @@ void RegisterDequantize(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "Dequantize", DequantizeOp,
                    DeviceType::CPU, int32_t);
 }
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/quantize_test.cc b/mace/ops/quantize_test.cc
index 207ab4e4..e889f852 100644
--- a/mace/ops/quantize_test.cc
+++ b/mace/ops/quantize_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/reduce_mean.cc b/mace/ops/reduce_mean.cc
similarity index 98%
rename from mace/kernels/reduce_mean.cc
rename to mace/ops/reduce_mean.cc
index d103125b..0857eb3e 100644
--- a/mace/kernels/reduce_mean.cc
+++ b/mace/ops/reduce_mean.cc
@@ -20,11 +20,11 @@
 #include "mace/core/operator.h"
 #include "mace/core/tensor.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/reduce_mean.h"
+#include "mace/ops/opencl/image/reduce_mean.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 class ReduceMeanOpBase : public Operation {
  public:
@@ -271,5 +271,5 @@ void RegisterReduceMean(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/reduce_mean_benchmark.cc b/mace/ops/reduce_mean_benchmark.cc
index 02f6d447..24338ce7 100644
--- a/mace/ops/reduce_mean_benchmark.cc
+++ b/mace/ops/reduce_mean_benchmark.cc
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -33,7 +31,7 @@ void ReduceMean(int iters, int batch, int channels,
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("ReduceMean", "ReduceMeanBM")
         .Input("InputImage")
         .AddIntsArg("axis", {1, 2})
diff --git a/mace/ops/reduce_mean_test.cc b/mace/ops/reduce_mean_test.cc
index 24ff7a4a..bc2577e2 100644
--- a/mace/ops/reduce_mean_test.cc
+++ b/mace/ops/reduce_mean_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -45,7 +44,7 @@ void Simple(const std::vector<index_t> &input_shape,
     net.RunOp(D);
   } else {
     BufferToImage<D, float>(&net, "Input", "InputImg",
-                           kernels::BufferType::IN_OUT_CHANNEL);
+                           ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("ReduceMean", "ReduceMeanTest")
         .Input("InputImg")
         .AddIntsArg("axis", axis)
@@ -55,7 +54,7 @@ void Simple(const std::vector<index_t> &input_shape,
     // Run
     net.RunOp(D);
     ImageToBuffer<D, float>(&net, "OutputImg", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   }
   auto expected = net.CreateTensor<float>(output_shape, output);
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5, 1e-3);
@@ -362,7 +361,7 @@ void RandomTest(const std::vector<index_t> &input_shape,
   net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
                                                   "Output", NHWC);
   BufferToImage<D, T>(&net, "Input", "InputImg",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("ReduceMean", "ReduceMeanTest")
       .Input("InputImg")
       .AddIntsArg("axis", axis)
@@ -372,7 +371,7 @@ void RandomTest(const std::vector<index_t> &input_shape,
   // Run
   net.RunOp(D);
   ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
   if (DataTypeToEnum<T>::value == DT_FLOAT) {
     ExpectTensorNear<float>(*net.GetTensor("Output"),
                             *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
diff --git a/mace/kernels/reshape.cc b/mace/ops/reshape.cc
similarity index 98%
rename from mace/kernels/reshape.cc
rename to mace/ops/reshape.cc
index 2cfef42b..400d1cff 100644
--- a/mace/kernels/reshape.cc
+++ b/mace/ops/reshape.cc
@@ -17,7 +17,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class ReshapeOp : public Operation {
@@ -84,5 +84,5 @@ void RegisterReshape(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/reshape_test.cc b/mace/ops/reshape_test.cc
index bdc7ab97..686975fe 100644
--- a/mace/ops/reshape_test.cc
+++ b/mace/ops/reshape_test.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "gmock/gmock.h"
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/resize_bicubic.cc b/mace/ops/resize_bicubic.cc
similarity index 98%
rename from mace/kernels/resize_bicubic.cc
rename to mace/ops/resize_bicubic.cc
index fe0512ff..28912fae 100644
--- a/mace/kernels/resize_bicubic.cc
+++ b/mace/ops/resize_bicubic.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/resize_bicubic.h"
+#include "mace/ops/resize_bicubic.h"
 
 #include <algorithm>
 #include <memory>
@@ -20,11 +20,11 @@
 
 #include "mace/core/operator.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/resize_bicubic.h"
+#include "mace/ops/opencl/image/resize_bicubic.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 inline const std::shared_ptr<float> InitCoeffsTable() {
   // Allocate and initialize coefficients table using Bicubic
@@ -230,5 +230,5 @@ void RegisterResizeBicubic(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/resize_bicubic.h b/mace/ops/resize_bicubic.h
similarity index 87%
rename from mace/kernels/resize_bicubic.h
rename to mace/ops/resize_bicubic.h
index 5e02edd4..b53f112b 100644
--- a/mace/kernels/resize_bicubic.h
+++ b/mace/ops/resize_bicubic.h
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_RESIZE_BICUBIC_H_
-#define MACE_KERNELS_RESIZE_BICUBIC_H_
+#ifndef MACE_OPS_RESIZE_BICUBIC_H_
+#define MACE_OPS_RESIZE_BICUBIC_H_
 
 #include "mace/core/types.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace resize_bicubic {
 constexpr int64_t kTableSize = (1u << 10);
 
@@ -31,7 +31,7 @@ inline float CalculateResizeScale(index_t in_size,
 }
 }  // namespace resize_bicubic
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_RESIZE_BICUBIC_H_
+#endif  // MACE_OPS_RESIZE_BICUBIC_H_
diff --git a/mace/ops/resize_bicubic_benchmark.cc b/mace/ops/resize_bicubic_benchmark.cc
index f0847e4c..896fb1e0 100644
--- a/mace/ops/resize_bicubic_benchmark.cc
+++ b/mace/ops/resize_bicubic_benchmark.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include <string>
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -57,7 +56,7 @@ void ResizeBicubicBenchmark(int iters,
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("ResizeBicubic", "ResizeBicubicBenchmark")
         .Input("InputImage")
         .Input("OutSize")
diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc
index 8dc1dbf7..3a33eefc 100644
--- a/mace/ops/resize_bicubic_test.cc
+++ b/mace/ops/resize_bicubic_test.cc
@@ -14,7 +14,6 @@
 
 #include <vector>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -153,7 +152,7 @@ void TestRandomResizeBicubic() {
 
     if (D == DeviceType::GPU) {
       BufferToImage<D, float>(&net, "Input", "InputImage",
-                              kernels::BufferType::IN_OUT_CHANNEL);
+                              ops::BufferType::IN_OUT_CHANNEL);
 
       OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
           .Input("InputImage")
@@ -165,7 +164,7 @@ void TestRandomResizeBicubic() {
       net.RunOp(D);
 
       ImageToBuffer<D, float>(&net, "OutputImage", "DeviceOutput",
-                              kernels::BufferType::IN_OUT_CHANNEL);
+                              ops::BufferType::IN_OUT_CHANNEL);
     }
     // Check
     ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-2,
diff --git a/mace/kernels/resize_bilinear.cc b/mace/ops/resize_bilinear.cc
similarity index 98%
rename from mace/kernels/resize_bilinear.cc
rename to mace/ops/resize_bilinear.cc
index 8ea86158..4b3f5a09 100644
--- a/mace/kernels/resize_bilinear.cc
+++ b/mace/ops/resize_bilinear.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/kernels/resize_bilinear.h"
+#include "mace/ops/resize_bilinear.h"
 
 #include <algorithm>
 #include <memory>
@@ -21,11 +21,11 @@
 #include "mace/core/operator.h"
 #include "mace/utils/quantize.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/resize_bilinear.h"
+#include "mace/ops/opencl/image/resize_bilinear.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 struct CachedInterpolation {
   index_t lower;  // Lower source index used in the interpolation
@@ -367,5 +367,5 @@ void RegisterResizeBilinear(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/resize_bilinear.h b/mace/ops/resize_bilinear.h
similarity index 86%
rename from mace/kernels/resize_bilinear.h
rename to mace/ops/resize_bilinear.h
index 1f94e500..cf0d32b4 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/ops/resize_bilinear.h
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_RESIZE_BILINEAR_H_
-#define MACE_KERNELS_RESIZE_BILINEAR_H_
+#ifndef MACE_OPS_RESIZE_BILINEAR_H_
+#define MACE_OPS_RESIZE_BILINEAR_H_
 
 #include "mace/core/types.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace resize_bilinear {
 inline float CalculateResizeScale(index_t in_size,
                                   index_t out_size,
@@ -28,7 +28,7 @@ inline float CalculateResizeScale(index_t in_size,
          : in_size / static_cast<float>(out_size);
 }
 }  // namespace resize_bilinear
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_RESIZE_BILINEAR_H_
+#endif  // MACE_OPS_RESIZE_BILINEAR_H_
diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc
index 2fd6b6c2..345f86bb 100644
--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include <string>
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -64,7 +63,7 @@ void ResizeBilinearBenchmark(int iters,
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
         .Input("InputImage")
         .Input("OutSize")
diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc
index 3ff5372a..b611854f 100644
--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -14,7 +14,6 @@
 
 #include <vector>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -120,7 +119,7 @@ void TestRandomResizeBilinear() {
 
     if (D == DeviceType::GPU) {
       BufferToImage<D, float>(&net, "Input", "InputImage",
-                              kernels::BufferType::IN_OUT_CHANNEL);
+                              ops::BufferType::IN_OUT_CHANNEL);
 
       OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
           .Input("InputImage")
@@ -132,7 +131,7 @@ void TestRandomResizeBilinear() {
       net.RunOp(D);
 
       ImageToBuffer<D, float>(&net, "OutputImage", "DeviceOutput",
-                              kernels::BufferType::IN_OUT_CHANNEL);
+                              ops::BufferType::IN_OUT_CHANNEL);
     }
     // Check
     ExpectTensorNear<float>(*expected, *net.GetOutput("DeviceOutput"), 1e-5,
diff --git a/mace/kernels/reverse.cc b/mace/ops/reverse.cc
similarity index 98%
rename from mace/kernels/reverse.cc
rename to mace/ops/reverse.cc
index f73db418..64d4d44d 100644
--- a/mace/kernels/reverse.cc
+++ b/mace/ops/reverse.cc
@@ -15,7 +15,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class ReverseOp;
@@ -77,5 +77,5 @@ void RegisterReverse(OpRegistryBase *op_registry) {
                    DeviceType::CPU, float);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/reverse_benchmark.cc b/mace/ops/reverse_benchmark.cc
index 40f2f908..9630f696 100644
--- a/mace/ops/reverse_benchmark.cc
+++ b/mace/ops/reverse_benchmark.cc
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
diff --git a/mace/ops/reverse_test.cc b/mace/ops/reverse_test.cc
index 282214fd..9b827461 100644
--- a/mace/ops/reverse_test.cc
+++ b/mace/ops/reverse_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/scalar_math.cc b/mace/ops/scalar_math.cc
similarity index 95%
rename from mace/kernels/scalar_math.cc
rename to mace/ops/scalar_math.cc
index f9f4822a..4a866ae7 100644
--- a/mace/kernels/scalar_math.cc
+++ b/mace/ops/scalar_math.cc
@@ -16,10 +16,10 @@
 #include <vector>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/eltwise.h"
+#include "mace/ops/eltwise.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <typename T, typename DstType>
 void ScalarEltwise(const T* in0,
@@ -88,8 +88,8 @@ class ScalarMathOp : public Operation {
  public:
   explicit ScalarMathOp(OpConstructContext *context)
       : Operation(context),
-        type_(static_cast<kernels::EltwiseType>(Operation::GetOptionalArg<int>(
-            "type", static_cast<int>(kernels::EltwiseType::NONE)))),
+        type_(static_cast<ops::EltwiseType>(Operation::GetOptionalArg<int>(
+            "type", static_cast<int>(ops::EltwiseType::NONE)))),
         coeff_(Operation::GetRepeatedArgs<float>("coeff")),
         scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
         scalar_input_index_(Operation::GetOptionalArg<int32_t>(
@@ -160,5 +160,5 @@ void RegisterScalarMath(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/scalar_math_test.cc b/mace/ops/scalar_math_test.cc
index 99caa07d..b9d8fd0b 100644
--- a/mace/ops/scalar_math_test.cc
+++ b/mace/ops/scalar_math_test.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
-#include "mace/kernels/eltwise.h"
+#include "mace/ops/eltwise.h"
 
 namespace mace {
 namespace ops {
@@ -24,7 +23,7 @@ class ScalarMathOpTest : public OpsTestBase {};
 
 namespace {
 template <DeviceType D, typename T, typename DstType>
-void ScalarMathTest(const kernels::EltwiseType type,
+void ScalarMathTest(const ops::EltwiseType type,
                     const T input0,
                     const T input1,
                     const float x,
@@ -42,7 +41,7 @@ void ScalarMathTest(const kernels::EltwiseType type,
       .AddIntArg("T", DataTypeToEnum<T>::v())
       .AddIntArg("type", static_cast<int>(type))
       .AddFloatArg("scalar_input", x)
-      .OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
+      .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
       .Output("Output")
       .Finalize(net.NewOperatorDef());
   // Run
@@ -57,52 +56,52 @@ void ScalarMathTest(const kernels::EltwiseType type,
 
 TEST_F(ScalarMathOpTest, SimpleCPU) {
   ScalarMathTest<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUM, 1, 2, 3, 3);
+      ops::EltwiseType::SUM, 1, 2, 3, 3);
   ScalarMathTest<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SUB, 1, 2, 3, -1);
+      ops::EltwiseType::SUB, 1, 2, 3, -1);
   ScalarMathTest<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::PROD, 3, -2, 3, -6);
+      ops::EltwiseType::PROD, 3, -2, 3, -6);
   ScalarMathTest<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::DIV, 3, -2, 1, -1.5);
+      ops::EltwiseType::DIV, 3, -2, 1, -1.5);
   ScalarMathTest<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MIN, 3, -2, 1, -2);
+      ops::EltwiseType::MIN, 3, -2, 1, -2);
   ScalarMathTest<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::MAX, 3, -2, 1, 3);
+      ops::EltwiseType::MAX, 3, -2, 1, 3);
   ScalarMathTest<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::NEG, 3, -2, 1, -3);
+      ops::EltwiseType::NEG, 3, -2, 1, -3);
   ScalarMathTest<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::ABS, 3, -2, 1, 3);
+      ops::EltwiseType::ABS, 3, -2, 1, 3);
   ScalarMathTest<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
+      ops::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
   ScalarMathTest<DeviceType::CPU, float, float>(
-      kernels::EltwiseType::POW, 3, 1, 1, 3);
+      ops::EltwiseType::POW, 3, 1, 1, 3);
   ScalarMathTest<DeviceType::CPU, float, int32_t>(
-      kernels::EltwiseType::EQUAL, 3, 3, 1, 1);
+      ops::EltwiseType::EQUAL, 3, 3, 1, 1);
 }
 
 TEST_F(ScalarMathOpTest, SimpleGPU) {
   ScalarMathTest<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUM, 1, 2, 1, 3);
+      ops::EltwiseType::SUM, 1, 2, 1, 3);
   ScalarMathTest<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SUB, 1, 2, 1, -1);
+      ops::EltwiseType::SUB, 1, 2, 1, -1);
   ScalarMathTest<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::PROD, 3, -2, 1, -6);
+      ops::EltwiseType::PROD, 3, -2, 1, -6);
   ScalarMathTest<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::DIV, 3, -2, 1, -1.5);
+      ops::EltwiseType::DIV, 3, -2, 1, -1.5);
   ScalarMathTest<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::MIN, 3, -2, 1, -2);
+      ops::EltwiseType::MIN, 3, -2, 1, -2);
   ScalarMathTest<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::MAX, 3, -2, 1, 3);
+      ops::EltwiseType::MAX, 3, -2, 1, 3);
   ScalarMathTest<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::NEG, 3, -2, 1, -3);
+      ops::EltwiseType::NEG, 3, -2, 1, -3);
   ScalarMathTest<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::ABS, 3, -2, 1, 3);
+      ops::EltwiseType::ABS, 3, -2, 1, 3);
   ScalarMathTest<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
+      ops::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
   ScalarMathTest<DeviceType::GPU, float, float>(
-      kernels::EltwiseType::POW, 3, 1, 1, 3);
+      ops::EltwiseType::POW, 3, 1, 1, 3);
   ScalarMathTest<DeviceType::GPU, float, int32_t>(
-      kernels::EltwiseType::EQUAL, 3, 3, 1, 1);
+      ops::EltwiseType::EQUAL, 3, 3, 1, 1);
 }
 }  // namespace test
 }  // namespace ops
diff --git a/mace/kernels/sgemm.cc b/mace/ops/sgemm.cc
similarity index 99%
rename from mace/kernels/sgemm.cc
rename to mace/ops/sgemm.cc
index 6b20256d..2edb6fe3 100644
--- a/mace/kernels/sgemm.cc
+++ b/mace/ops/sgemm.cc
@@ -14,7 +14,7 @@
 
 #include <memory>
 
-#include "mace/kernels/sgemm.h"
+#include "mace/ops/sgemm.h"
 #include "mace/core/runtime/cpu/cpu_runtime.h"
 
 
@@ -27,7 +27,7 @@
 #endif
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 void SGemm::operator()(const MatrixMap<const float> &lhs,
                        const MatrixMap<const float> &rhs,
@@ -123,14 +123,14 @@ void SGemm::Run(const float *A,
       MatrixMap<const float>(batch,
                              height_a,
                              width_a,
-                             kernels::RowMajor,
+                             ops::RowMajor,
                              A,
                              is_a_weight);
   MatrixMap<const float> matrix_b =
-      kernels::MatrixMap<const float>(batch,
+      ops::MatrixMap<const float>(batch,
                                       height_b,
                                       width_b,
-                                      kernels::RowMajor,
+                                      ops::RowMajor,
                                       B,
                                       is_b_weight);
   if (transpose_a) {
@@ -139,7 +139,7 @@ void SGemm::Run(const float *A,
   if (transpose_b) {
     matrix_b = matrix_b.transpose();
   }
-  MatrixMap<float> matrix_c(batch, height_c, width_c, kernels::RowMajor, C);
+  MatrixMap<float> matrix_c(batch, height_c, width_c, ops::RowMajor, C);
   operator()(matrix_a, matrix_b, &matrix_c, scratch_buffer);
 }
 
@@ -1167,5 +1167,5 @@ void SGemm::UnPackPerBatch(const float *packed_data,
   }
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/kernels/sgemm.h b/mace/ops/sgemm.h
similarity index 96%
rename from mace/kernels/sgemm.h
rename to mace/ops/sgemm.h
index 92a185db..f233820f 100644
--- a/mace/kernels/sgemm.h
+++ b/mace/ops/sgemm.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_SGEMM_H_
-#define MACE_KERNELS_SGEMM_H_
+#ifndef MACE_OPS_SGEMM_H_
+#define MACE_OPS_SGEMM_H_
 
 #include <memory>
 #include <utility>
@@ -27,7 +27,7 @@
 #include "mace/core/tensor.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 enum Major {
   RowMajor,
@@ -178,7 +178,7 @@ class SGemm {
   bool packed_;
 };
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_KERNELS_SGEMM_H_
+#endif  // MACE_OPS_SGEMM_H_
diff --git a/mace/kernels/sgemm_pack_test.cc b/mace/ops/sgemm_pack_test.cc
similarity index 98%
rename from mace/kernels/sgemm_pack_test.cc
rename to mace/ops/sgemm_pack_test.cc
index 3e7aaa98..3c0f9a20 100644
--- a/mace/kernels/sgemm_pack_test.cc
+++ b/mace/ops/sgemm_pack_test.cc
@@ -17,10 +17,10 @@
 #include <random>
 #include <vector>
 
-#include "mace/kernels/sgemm.h"
+#include "mace/ops/sgemm.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 namespace test {
 
 namespace {
@@ -162,6 +162,6 @@ TEST(SGemmPackTest, UnPack) {
 }
 
 }  // namespace test
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
 
diff --git a/mace/kernels/shape.cc b/mace/ops/shape.cc
similarity index 98%
rename from mace/kernels/shape.cc
rename to mace/ops/shape.cc
index 1775f0a0..b981267a 100644
--- a/mace/kernels/shape.cc
+++ b/mace/ops/shape.cc
@@ -15,7 +15,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class ShapeOp : public Operation {
@@ -70,5 +70,5 @@ void RegisterShape(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/shape_test.cc b/mace/ops/shape_test.cc
index 2b66c7eb..0ad72ead 100644
--- a/mace/ops/shape_test.cc
+++ b/mace/ops/shape_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/softmax.cc b/mace/ops/softmax.cc
similarity index 98%
rename from mace/kernels/softmax.cc
rename to mace/ops/softmax.cc
index 1ac3ab4d..8f6fb7fd 100644
--- a/mace/kernels/softmax.cc
+++ b/mace/ops/softmax.cc
@@ -18,15 +18,15 @@
 #include <vector>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/fixpoint.h"
-#include "mace/kernels/gemmlowp_util.h"
+#include "mace/ops/fixpoint.h"
+#include "mace/ops/gemmlowp_util.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/softmax.h"
-#include "mace/kernels/opencl/buffer/softmax.h"
+#include "mace/ops/opencl/image/softmax.h"
+#include "mace/ops/opencl/buffer/softmax.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class SoftmaxOp;
@@ -394,5 +394,5 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc
index 482709ad..66e27434 100644
--- a/mace/ops/softmax_benchmark.cc
+++ b/mace/ops/softmax_benchmark.cc
@@ -14,7 +14,6 @@
 
 #include <string>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -46,7 +45,7 @@ void SoftmaxBenchmark(
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Softmax", "SoftmaxBM")
         .Input("InputImage")
diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc
index 98b0ad97..69b5dafd 100644
--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -61,7 +60,7 @@ void Simple() {
     ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Softmax", "SoftmaxTest")
         .Input("InputImage")
@@ -73,7 +72,7 @@ void Simple() {
 
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
   } else {
@@ -117,7 +116,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
   expected->Copy(*net.GetOutput("Output"));
 
   BufferToImage<D, float>(&net, "Input", "InputImage",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
 
   OpDefBuilder("Softmax", "SoftmaxTest")
       .Input("InputImage")
@@ -129,7 +128,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
 
   // Transfer output
   ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
 }
diff --git a/mace/kernels/space_to_batch.cc b/mace/ops/space_to_batch.cc
similarity index 99%
rename from mace/kernels/space_to_batch.cc
rename to mace/ops/space_to_batch.cc
index 41c731c5..f391bc79 100644
--- a/mace/kernels/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -17,11 +17,11 @@
 
 #include "mace/core/operator.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/space_to_batch.h"
+#include "mace/ops/opencl/image/space_to_batch.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 class SpaceToBatchOpBase : public Operation {
  public:
@@ -343,5 +343,5 @@ void RegisterSpaceToBatchND(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index 565ad5dc..f6d5ad1a 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -49,7 +48,7 @@ void BMSpaceToBatch(
         .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
         .Input("InputImage")
         .Output("OutputImage")
diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc
index 1d07ecfc..956dedc2 100644
--- a/mace/ops/space_to_batch_test.cc
+++ b/mace/ops/space_to_batch_test.cc
@@ -33,7 +33,7 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
 
   if (D == GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
         .Input("InputImage")
         .Output("OutputImage")
@@ -56,7 +56,7 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
 
   if (D == GPU) {
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else if (D == CPU) {
     net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
                                                     "Output", NHWC);
@@ -77,7 +77,7 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
 
   if (D == GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
         .Input("InputImage")
         .Output("OutputImage")
@@ -100,7 +100,7 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
 
   if (D == GPU) {
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   } else if (D == CPU) {
     net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
                                                     "Output", NHWC);
@@ -157,7 +157,7 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
 
   // run gpu
   BufferToImage<GPU, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
       .Input("InputImage")
       .Output("OutputImage")
@@ -166,7 +166,7 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
       .Finalize(net.NewOperatorDef());
   net.RunOp(GPU);
   ImageToBuffer<GPU, float>(&net, "OutputImage", "OutputGPU",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
   // run cpu
   net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
@@ -194,7 +194,7 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
 
   // run gpu
   BufferToImage<GPU, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
       .Input("InputImage")
       .Output("OutputImage")
@@ -203,7 +203,7 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
       .Finalize(net.NewOperatorDef());
   net.RunOp(GPU);
   ImageToBuffer<GPU, float>(&net, "OutputImage", "OutputGPU",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
   // run cpu
   net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
diff --git a/mace/kernels/space_to_depth.cc b/mace/ops/space_to_depth.cc
similarity index 97%
rename from mace/kernels/space_to_depth.cc
rename to mace/ops/space_to_depth.cc
index e2e302e6..11e5ade3 100644
--- a/mace/kernels/space_to_depth.cc
+++ b/mace/ops/space_to_depth.cc
@@ -17,11 +17,11 @@
 
 #include "mace/core/operator.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/space_to_depth.h"
+#include "mace/ops/opencl/image/space_to_depth.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, class T>
 class SpaceToDepthOp : public Operation {
@@ -125,5 +125,5 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc
index 480a0421..04760c54 100644
--- a/mace/ops/space_to_depth_benchmark.cc
+++ b/mace/ops/space_to_depth_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -44,7 +43,7 @@ void SpaceToDepth(
       .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("SpaceToDepth", "SpaceToDepthBM")
         .Input("InputImage")
diff --git a/mace/ops/space_to_depth_test.cc b/mace/ops/space_to_depth_test.cc
index 76569492..e7ae77d6 100644
--- a/mace/ops/space_to_depth_test.cc
+++ b/mace/ops/space_to_depth_test.cc
@@ -15,7 +15,6 @@
 #include <fstream>
 
 #include <vector>
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -47,7 +46,7 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape,
 
   } else {
     BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
         .Input("InputImage")
         .Output("OutputImage")
@@ -59,7 +58,7 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape,
 
   if (D == DeviceType::GPU) {
     ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
-                                          kernels::BufferType::IN_OUT_CHANNEL);
+                                          ops::BufferType::IN_OUT_CHANNEL);
   }
   auto expected = net.CreateTensor<float>(expected_shape, expected_data);
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
@@ -129,7 +128,7 @@ void RandomTest(const int block_size,
                                                   NHWC);
 
   BufferToImage<D, T>(&net, "Input", "InputImg",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
 
   OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
       .Input("InputImg")
@@ -142,7 +141,7 @@ void RandomTest(const int block_size,
   net.RunOp(D);
 
   ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
 
   if (DataTypeToEnum<T>::value == DT_FLOAT) {
     ExpectTensorNear<float>(*net.GetTensor("Output"),
diff --git a/mace/kernels/split.cc b/mace/ops/split.cc
similarity index 98%
rename from mace/kernels/split.cc
rename to mace/ops/split.cc
index 68f5f274..1d632329 100644
--- a/mace/kernels/split.cc
+++ b/mace/ops/split.cc
@@ -17,11 +17,11 @@
 
 #include "mace/core/operator.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/split.h"
+#include "mace/ops/opencl/image/split.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class SplitOp;
@@ -129,5 +129,5 @@ void RegisterSplit(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc
index aa0e8fba..687fc573 100644
--- a/mace/ops/split_benchmark.cc
+++ b/mace/ops/split_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -40,7 +39,7 @@ void BMSplitHelper(int iters,
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
 
     auto builder = OpDefBuilder("Split", "SplitTest");
     builder.Input("InputImage");
diff --git a/mace/ops/split_test.cc b/mace/ops/split_test.cc
index d42b3716..906a47dd 100644
--- a/mace/ops/split_test.cc
+++ b/mace/ops/split_test.cc
@@ -50,7 +50,7 @@ void RandomTest(const int num_outputs, const int axis) {
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
 
     auto builder = OpDefBuilder("Split", "SplitTest");
     builder.Input("InputImage");
@@ -75,7 +75,7 @@ void RandomTest(const int num_outputs, const int axis) {
     for (int i = 0; i < num_outputs; ++i) {
       ImageToBuffer<D, float>(&net, MakeString("OutputImage", i),
                               MakeString("Output", i),
-                              kernels::BufferType::IN_OUT_CHANNEL);
+                              ops::BufferType::IN_OUT_CHANNEL);
     }
   }
 
diff --git a/mace/kernels/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc
similarity index 97%
rename from mace/kernels/sqrdiff_mean.cc
rename to mace/ops/sqrdiff_mean.cc
index e9c7bde0..f25d66c1 100644
--- a/mace/kernels/sqrdiff_mean.cc
+++ b/mace/ops/sqrdiff_mean.cc
@@ -17,11 +17,11 @@
 
 #include "mace/core/operator.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/kernels/opencl/image/sqrdiff_mean.h"
+#include "mace/ops/opencl/image/sqrdiff_mean.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class SqrDiffMeanOp : public Operation {
@@ -114,5 +114,5 @@ void RegisterSqrDiffMean(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/sqrdiff_mean_benchmark.cc b/mace/ops/sqrdiff_mean_benchmark.cc
index f3bfd44c..bcf07500 100644
--- a/mace/ops/sqrdiff_mean_benchmark.cc
+++ b/mace/ops/sqrdiff_mean_benchmark.cc
@@ -34,9 +34,9 @@ void SqrDiffMean(int iters, int batch, int channels,
 
   if (D == DeviceType::GPU) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(&net, "Input1", "InputImage1",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+                        ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("SqrDiffMean", "SqrDiffMeanBM")
         .Input("InputImage")
         .Input("InputImage1")
diff --git a/mace/ops/sqrdiff_mean_test.cc b/mace/ops/sqrdiff_mean_test.cc
index e88810bc..66f852b7 100644
--- a/mace/ops/sqrdiff_mean_test.cc
+++ b/mace/ops/sqrdiff_mean_test.cc
@@ -59,9 +59,9 @@ void Simple(const std::vector<index_t> &input_shape0,
                                                     NHWC);
   } else {
     BufferToImage<D, float>(&net, "Input0", "InputImg0",
-                           kernels::BufferType::IN_OUT_CHANNEL);
+                           ops::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, float>(&net, "Input1", "InputImg1",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest")
         .Input("InputImg0")
         .Input("InputImg1")
@@ -70,7 +70,7 @@ void Simple(const std::vector<index_t> &input_shape0,
     // Run
     net.RunOp(D);
     ImageToBuffer<D, float>(&net, "OutputImg", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                            ops::BufferType::IN_OUT_CHANNEL);
   }
   auto expected = net.CreateTensor<float>(output_shape, output);
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5, 1e-3);
@@ -127,9 +127,9 @@ void RandomTest(const std::vector<index_t> &input_shape0,
   net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
                                                   "Output", NHWC);
   BufferToImage<D, T>(&net, "Input0", "InputImg0",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<D, T>(&net, "Input1", "InputImg1",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest")
       .Input("InputImg0")
       .Input("InputImg1")
@@ -138,7 +138,7 @@ void RandomTest(const std::vector<index_t> &input_shape0,
   // Run
   net.RunOp(D);
   ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
   if (DataTypeToEnum<T>::value == DT_FLOAT) {
     ExpectTensorNear<float>(*net.GetTensor("Output"),
                             *net.GetOutput("OPENCLOutput"), 1e-4, 1e-3);
diff --git a/mace/kernels/squeeze.cc b/mace/ops/squeeze.cc
similarity index 97%
rename from mace/kernels/squeeze.cc
rename to mace/ops/squeeze.cc
index 8221bccb..37ee3d70 100644
--- a/mace/kernels/squeeze.cc
+++ b/mace/ops/squeeze.cc
@@ -18,7 +18,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class SqueezeOp : public Operation {
@@ -59,5 +59,5 @@ void RegisterSqueeze(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/squeeze_test.cc b/mace/ops/squeeze_test.cc
index 166d9868..1bcd6c37 100644
--- a/mace/ops/squeeze_test.cc
+++ b/mace/ops/squeeze_test.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "gmock/gmock.h"
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/stack.cc b/mace/ops/stack.cc
similarity index 98%
rename from mace/kernels/stack.cc
rename to mace/ops/stack.cc
index b3fc8bea..de795965 100644
--- a/mace/kernels/stack.cc
+++ b/mace/ops/stack.cc
@@ -18,7 +18,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class StackOp : public Operation {
@@ -85,5 +85,5 @@ void RegisterStack(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/stack_test.cc b/mace/ops/stack_test.cc
index e55ff278..d63de70a 100644
--- a/mace/ops/stack_test.cc
+++ b/mace/ops/stack_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/strided_slice.cc b/mace/ops/strided_slice.cc
similarity index 99%
rename from mace/kernels/strided_slice.cc
rename to mace/ops/strided_slice.cc
index b030661b..7c60bfe8 100644
--- a/mace/kernels/strided_slice.cc
+++ b/mace/ops/strided_slice.cc
@@ -18,7 +18,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class StridedSliceOp : public Operation {
@@ -225,5 +225,5 @@ void RegisterStridedSlice(OpRegistryBase *op_registry) {
 #endif  // MACE_ENABLE_OPENCL
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/strided_slice_test.cc b/mace/ops/strided_slice_test.cc
index c13a813c..3ecbedc9 100644
--- a/mace/ops/strided_slice_test.cc
+++ b/mace/ops/strided_slice_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/transpose.cc b/mace/ops/transpose.cc
similarity index 99%
rename from mace/kernels/transpose.cc
rename to mace/ops/transpose.cc
index 2ec38015..4e98944c 100644
--- a/mace/kernels/transpose.cc
+++ b/mace/ops/transpose.cc
@@ -22,7 +22,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 static void TransposeNHWCToNCHWC3(const float *input,
                                   float *output,
@@ -232,5 +232,5 @@ void RegisterTranspose(OpRegistryBase *op_registry) {
                    DeviceType::CPU, float);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/transpose_benchmark.cc b/mace/ops/transpose_benchmark.cc
index 6d37b93c..f584239a 100644
--- a/mace/ops/transpose_benchmark.cc
+++ b/mace/ops/transpose_benchmark.cc
@@ -15,7 +15,6 @@
 #include <string>
 #include <vector>
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
diff --git a/mace/ops/transpose_test.cc b/mace/ops/transpose_test.cc
index 44ef0ec2..d9f227c3 100644
--- a/mace/ops/transpose_test.cc
+++ b/mace/ops/transpose_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/kernels/unstack.cc b/mace/ops/unstack.cc
similarity index 98%
rename from mace/kernels/unstack.cc
rename to mace/ops/unstack.cc
index 8403b8f6..ee0a4369 100644
--- a/mace/kernels/unstack.cc
+++ b/mace/ops/unstack.cc
@@ -18,7 +18,7 @@
 #include "mace/core/operator.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class UnstackOp : public Operation {
@@ -80,5 +80,5 @@ void RegisterUnstack(OpRegistryBase *op_registry) {
                    DeviceType::CPU, int32_t);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/unstack_test.cc b/mace/ops/unstack_test.cc
index 4c9774ff..92a6bd61 100644
--- a/mace/ops/unstack_test.cc
+++ b/mace/ops/unstack_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/mace/ops/winograd_convolution_benchmark.cc b/mace/ops/winograd_convolution_benchmark.cc
index 3b126f07..62485165 100644
--- a/mace/ops/winograd_convolution_benchmark.cc
+++ b/mace/ops/winograd_convolution_benchmark.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -34,15 +33,15 @@ void BMWinogradConvolution(
   net.AddRandomInput<D, T>("Bias", {out_channels});
 
   BufferToImage<D, T>(&net, "Input", "InputImage",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                      kernels::BufferType::CONV2D_FILTER);
-  BufferToImage<D, T>(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+                      ops::BufferType::CONV2D_FILTER);
+  BufferToImage<D, T>(&net, "Bias", "BiasImage", ops::BufferType::ARGUMENT);
 
   // Winograd convolution
   // transform filter
     BufferToImage<D, T>(&net, "Filter", "WinoFilter",
-                        kernels::BufferType::WINOGRAD_FILTER, block_size);
+                        ops::BufferType::WINOGRAD_FILTER, block_size);
 
   // Inference convolution output shape
   OpDefBuilder("InferConv2dShape", "InferConv2dShapeTest")
diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc
index 1c82a189..4d015194 100644
--- a/mace/ops/winograd_convolution_test.cc
+++ b/mace/ops/winograd_convolution_test.cc
@@ -14,8 +14,7 @@
 
 #include <fstream>
 
-#include "mace/core/op_def_registry.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -44,10 +43,10 @@ void WinogradConvolution(const index_t batch,
   net.AddRandomInput<D, T>("Bias", {out_channels});
 
   BufferToImage<D, T>(&net, "Input", "InputImage",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                      kernels::BufferType::CONV2D_FILTER);
-  BufferToImage<D, T>(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+                      ops::BufferType::CONV2D_FILTER);
+  BufferToImage<D, T>(&net, "Bias", "BiasImage", ops::BufferType::ARGUMENT);
   OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("InputImage")
       .Input("FilterImage")
@@ -63,7 +62,7 @@ void WinogradConvolution(const index_t batch,
 
   // Transfer output
   ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
 
   auto expected = net.CreateTensor<float>();
   expected->Copy(*net.GetOutput("ConvOutput"));
@@ -72,7 +71,7 @@ void WinogradConvolution(const index_t batch,
   // Winograd convolution
   // transform filter
   BufferToImage<D, T>(&net, "Filter", "WinoFilter",
-                      kernels::BufferType::WINOGRAD_FILTER, block_size);
+                      ops::BufferType::WINOGRAD_FILTER, block_size);
   // transform input
   OpDefBuilder("WinogradTransform", "WinogradTransformTest")
       .Input("InputImage")
@@ -123,7 +122,7 @@ void WinogradConvolution(const index_t batch,
   net.Sync();
 
   ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
   if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
     ExpectTensorNear<float>(*expected, *net.GetOutput("WinoOutput"),
                             1e-2, 1e-2);
@@ -195,10 +194,10 @@ void WinogradConvolutionWithPad(const index_t batch,
   net.AddRandomInput<D, float>("Bias", {out_channels});
 
   BufferToImage<D, T>(&net, "Input", "InputImage",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                      kernels::BufferType::CONV2D_FILTER);
-  BufferToImage<D, T>(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+                      ops::BufferType::CONV2D_FILTER);
+  BufferToImage<D, T>(&net, "Bias", "BiasImage", ops::BufferType::ARGUMENT);
   OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("InputImage")
       .Input("FilterImage")
@@ -214,7 +213,7 @@ void WinogradConvolutionWithPad(const index_t batch,
 
   // Transfer output
   ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
   auto expected = net.CreateTensor<float>();
   expected->Copy(*net.GetOutput("ConvOutput"));
   auto output_shape = expected->shape();
@@ -222,7 +221,7 @@ void WinogradConvolutionWithPad(const index_t batch,
   // Winograd convolution
   // transform filter
   BufferToImage<D, T>(&net, "Filter", "WinoFilter",
-                      kernels::BufferType::WINOGRAD_FILTER, block_size);
+                      ops::BufferType::WINOGRAD_FILTER, block_size);
   // transform input
   OpDefBuilder("WinogradTransform", "WinogradTransformTest")
       .Input("InputImage")
@@ -273,7 +272,7 @@ void WinogradConvolutionWithPad(const index_t batch,
   net.Sync();
 
   ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                          ops::BufferType::IN_OUT_CHANNEL);
   if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
     ExpectTensorNear<float>(*expected, *net.GetOutput("WinoOutput"),
                             1e-2, 1e-2);
diff --git a/mace/kernels/winograd_transform.cc b/mace/ops/winograd_transform.cc
similarity index 93%
rename from mace/kernels/winograd_transform.cc
rename to mace/ops/winograd_transform.cc
index 286bff95..b2635f4d 100644
--- a/mace/kernels/winograd_transform.cc
+++ b/mace/ops/winograd_transform.cc
@@ -16,12 +16,12 @@
 #include <string>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/activation.h"
-#include "mace/kernels/conv_pool_2d_util.h"
-#include "mace/kernels/opencl/image/winograd_transform.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/conv_pool_2d_util.h"
+#include "mace/ops/opencl/image/winograd_transform.h"
 
 namespace mace {
-namespace kernels {
+namespace ops {
 
 template <DeviceType D, typename T>
 class WinogradTransformOp;
@@ -62,7 +62,7 @@ class WinogradInverseTransformOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit WinogradInverseTransformOp(OpConstructContext *context)
       : Operation(context) {
-    ActivationType activation = kernels::StringToActivationType(
+    ActivationType activation = ops::StringToActivationType(
         Operation::GetOptionalArg<std::string>("activation", "NOOP"));
     float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
     int block_size = Operation::GetOptionalArg<int>("wino_block_size", 2);
@@ -98,5 +98,5 @@ void RegisterWinogradInverseTransform(
                    WinogradInverseTransformOp, DeviceType::GPU, half);
 }
 
-}  // namespace kernels
+}  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc
index 5c21c9ad..bb6679bb 100644
--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/op_def_registry.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -30,7 +29,7 @@ void BMWinogradTransform(
   net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
 
   BufferToImage<D, T>(&net, "Input", "InputImage",
-                      kernels::BufferType::IN_OUT_CHANNEL);
+                      ops::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("WinogradTransform", "WinogradTransformTest")
       .Input("InputImage")
       .Output("OutputImage")
@@ -88,7 +87,7 @@ void BMWinogradInverseTransform(
       (block_size + 2), channels, p, 1});
 
   BufferToImage<D, T>(&net, "Input", "InputImage",
-                      kernels::BufferType::IN_OUT_HEIGHT);
+                      ops::BufferType::IN_OUT_HEIGHT);
   OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest")
       .Input("InputImage")
       .AddIntArg("batch", batch)
@@ -155,7 +154,7 @@ void WinoFilterBufferToImage(int iters,
   OpDefBuilder("BufferToImage", "BufferToImageTest")
       .Input("Input")
       .Output("Output")
-      .AddIntArg("buffer_type", kernels::BufferType::WINOGRAD_FILTER)
+      .AddIntArg("buffer_type", ops::BufferType::WINOGRAD_FILTER)
       .AddIntArg("wino_block_size", wino_block_size)
       .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
       .Finalize(net.NewOperatorDef());
@@ -215,9 +214,9 @@ void WinoMatMulBenchmark(
   net.AddRandomInput<D, float>("B", {batch, in_channels, out_width});
 
   if (D == DeviceType::GPU) {
-    BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
+    BufferToImage<D, T>(&net, "A", "AImage", ops::BufferType::IN_OUT_WIDTH);
     BufferToImage<D, T>(&net, "B", "BImage",
-                        kernels::BufferType::IN_OUT_HEIGHT);
+                        ops::BufferType::IN_OUT_HEIGHT);
 
     OpDefBuilder("MatMul", "MatMulBM")
         .Input("AImage")
diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto
index 4b789000..a3064df0 100644
--- a/mace/proto/mace.proto
+++ b/mace/proto/mace.proto
@@ -122,7 +122,6 @@ message NetDef {
   repeated OperatorDef op = 1;
   repeated Argument arg = 2;
   repeated ConstTensor tensors = 3;
-  repeated string op_types = 4;
 
   // for mem optimization
   optional MemoryArena mem_arena = 10;
diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py
index 92a6b12d..a89e3abd 100644
--- a/mace/python/tools/converter.py
+++ b/mace/python/tools/converter.py
@@ -214,9 +214,6 @@ def main(unused_args):
             for arg in cpu_graph_def.arg:
                 if arg.name not in output_graph_arg_names:
                     output_graph_def.arg.extend(arg)
-            for op_type in cpu_graph_def.op_types:
-                if op_type not in output_graph_def.op_types:
-                    output_graph_def.op_types.extend([op_type])
             print("Merge done")
         else:
             option.device = device_type_map[FLAGS.runtime]
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 8d3a3b64..1ab81452 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -1712,14 +1712,6 @@ class Transformer(base_converter.ConverterInterface):
 
             ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT)
 
-    def add_op_types(self):
-        net = self._model
-        op_types = set()
-        for op in net.op:
-            op_types.add(op.type)
-        for op_type in op_types:
-            net.op_types.extend([op_type])
-
     def sort_by_execution(self):
         print("Sort by execution")
         net = self._model
@@ -1736,8 +1728,6 @@ class Transformer(base_converter.ConverterInterface):
         del net.op[:]
         net.op.extend(sorted_nodes)
 
-        self.add_op_types()
-
         print("Final ops:")
         for op in net.op:
             print("%s (%s): %s" % (op.name, op.type, [
diff --git a/mace/python/tools/encrypt_opencl_codegen.py b/mace/python/tools/encrypt_opencl_codegen.py
index 957c8a51..776dc506 100644
--- a/mace/python/tools/encrypt_opencl_codegen.py
+++ b/mace/python/tools/encrypt_opencl_codegen.py
@@ -19,7 +19,7 @@ import sys
 
 import jinja2
 
-# python encrypt_opencl_codegen.py --cl_kernel_dir=./mace/kernels/opencl/cl/  \
+# python encrypt_opencl_codegen.py --cl_kernel_dir=./mace/ops/opencl/cl/  \
 #     --output_path=./mace/codegen/opencl_encrypt/opencl_encrypted_program.cc
 
 FLAGS = None
@@ -96,7 +96,7 @@ def parse_args():
     parser.add_argument(
         "--cl_kernel_dir",
         type=str,
-        default="./mace/kernels/opencl/cl/",
+        default="./mace/ops/opencl/cl/",
         help="The cl kernels directory.")
     parser.add_argument(
         "--output_path",
diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2
index ec1ba284..3f4ba1c4 100644
--- a/mace/python/tools/model.jinja2
+++ b/mace/python/tools/model.jinja2
@@ -122,12 +122,6 @@ void CreateTensors(NetDef *net_def) {
   {% endfor %}
 }
 
-void CreateOpTypes(NetDef *net_def) {
-  {% for op_type in net.op_types %}
-  net_def->add_op_types({{ op_type|tojson }});
-  {% endfor %}
-}
-
 {% if net.mem_arena.mem_block|length != 0 %}
 void CreateMemoryArena(mace::MemoryArena *mem_arena) {
   mem_arena->mutable_mem_block()->Reserve({{ net.mem_arena.mem_block|length }});
@@ -168,9 +162,6 @@ const std::shared_ptr<NetDef> CreateNet() {
   {% if net.output_info | length > 0 %}
   CreateOutputInfo(net_def.get());
   {% endif %}
-  {% if net.op_types|length > 0 %}
-  CreateOpTypes(net_def.get());
-  {% endif %}
 
   return net_def;
 }
diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc
index 0f8d1f49..0bb8342d 100644
--- a/mace/test/mace_api_mt_test.cc
+++ b/mace/test/mace_api_mt_test.cc
@@ -15,8 +15,8 @@
 #include <fstream>
 #include <thread>  // NOLINT(build/c++11)
 
-#include "mace/core/op_def_registry.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/core/operator.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -306,7 +306,7 @@ void MaceRunFunc(const int in_out_size) {
     std::string input_name = MakeString("mace_input_node_",
                                         input_names[i]);
     BufferToImage<half>(input_name, input_names[i],
-                        mace::kernels::IN_OUT_CHANNEL,
+                        mace::ops::IN_OUT_CHANNEL,
                         {mem_map[input_names[i]]},
                         device,
                         net_def.get());
@@ -314,7 +314,7 @@ void MaceRunFunc(const int in_out_size) {
     info->set_name(input_names[i]);
   }
   BufferToImage<half>(filter_tensor_name, filter_tensor_img_name,
-                      mace::kernels::CONV2D_FILTER, {}, device,
+                      mace::ops::CONV2D_FILTER, {}, device,
                       net_def.get(), NetMode::INIT);
   for (size_t i = 0; i < output_names.size(); ++i) {
     Conv3x3<half>(input_names[i], filter_tensor_img_name,
@@ -326,15 +326,12 @@ void MaceRunFunc(const int in_out_size) {
     std::string output_name = MakeString("mace_output_node_",
                                          output_names[i]);
     ImageToBuffer<float>(output_names[i], output_name,
-                         mace::kernels::IN_OUT_CHANNEL,
+                         mace::ops::IN_OUT_CHANNEL,
                          device,
                          net_def.get());
     OutputInfo *info = net_def->add_output_info();
     info->set_name(output_names[i]);
   }
-  for (int i = 0; i < net_def->op_size(); ++i) {
-    net_def->add_op_types(net_def->op(i).type());
-  }
 
   MaceEngineConfig config(DeviceType::GPU);
 
diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc
index 54dd99b7..127e5849 100644
--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
@@ -15,8 +15,7 @@
 
 #include <fstream>
 
-#include "mace/core/op_def_registry.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
 #include "mace/public/mace.h"
 
@@ -308,7 +307,7 @@ void MaceRun(const int in_out_size,
     std::string input_name = MakeString("mace_input_node_",
                                         input_names[i]);
     BufferToImage<half>(input_name, input_names[i],
-                        mace::kernels::IN_OUT_CHANNEL,
+                        mace::ops::IN_OUT_CHANNEL,
                         {mem_map[input_names[i]]},
                         device,
                         net_def.get());
@@ -316,7 +315,7 @@ void MaceRun(const int in_out_size,
     info->set_name(input_names[i]);
   }
   BufferToImage<half>(filter_tensor_name, filter_tensor_img_name,
-                      mace::kernels::CONV2D_FILTER, {}, device,
+                      mace::ops::CONV2D_FILTER, {}, device,
                       net_def.get(), NetMode::INIT);
   for (size_t i = 0; i < output_names.size(); ++i) {
     Conv3x3<half>(input_names[i], filter_tensor_img_name,
@@ -327,17 +326,13 @@ void MaceRun(const int in_out_size,
     std::string output_name = MakeString("mace_output_node_",
                                          output_names[i]);
     ImageToBuffer<float>(output_names[i], output_name,
-                         mace::kernels::IN_OUT_CHANNEL,
+                         mace::ops::IN_OUT_CHANNEL,
                          device,
                          net_def.get());
     OutputInfo *info = net_def->add_output_info();
     info->set_name(output_names[i]);
   }
 
-  for (int i = 0; i < net_def->op_size(); ++i) {
-    net_def->add_op_types(net_def->op(i).type());
-  }
-
   MaceEngineConfig config(DeviceType::GPU);
 
   MaceEngine engine(config);
diff --git a/repository/opencl-kernel/opencl_kernel_configure.bzl b/repository/opencl-kernel/opencl_kernel_configure.bzl
index 97c9639c..c844a5d3 100644
--- a/repository/opencl-kernel/opencl_kernel_configure.bzl
+++ b/repository/opencl-kernel/opencl_kernel_configure.bzl
@@ -18,51 +18,51 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
     unused_var = repository_ctx.path(Label("//:.git/refs/heads/master"))
 
   ret = repository_ctx.execute(
-      ["test", "-f", "%s/mace/kernels/opencl/cl/common.h" % mace_root_path])
+      ["test", "-f", "%s/mace/ops/opencl/cl/common.h" % mace_root_path])
   if ret.return_code == 0:
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/activation.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/addn.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/batch_norm.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/batch_to_space.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/bias_add.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/buffer_to_image.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/buffer_transform.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/channel_shuffle.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/common.h"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/concat.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_1x1.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_3x3.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_buffer.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/crop.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/deconv_2d.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depth_to_space.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depthwise_conv2d.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/eltwise.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/fully_connected.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/lstmcell.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/matmul.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pad.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pooling.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pooling_buffer.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/reduce_mean.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/resize_bicubic.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/resize_bilinear.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/split.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/softmax.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/softmax_buffer.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/space_to_batch.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/space_to_depth.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/sqrdiff_mean.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/winograd_transform.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/activation.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/addn.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/batch_norm.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/batch_to_space.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/bias_add.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/buffer_to_image.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/buffer_transform.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/channel_shuffle.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/common.h"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/concat.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_1x1.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_1x1_buffer.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_3x3.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_buffer.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/crop.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/deconv_2d.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depth_to_space.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d_buffer.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/eltwise.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/fully_connected.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/lstmcell.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/matmul.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pad.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pooling.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pooling_buffer.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/reduce_mean.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/resize_bicubic.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/resize_bilinear.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/split.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/softmax.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/softmax_buffer.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/space_to_batch.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/space_to_depth.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/sqrdiff_mean.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/winograd_transform.cl"))
 
   python_bin_path = repository_ctx.which("python")
 
   repository_ctx.execute([
       python_bin_path, '%s/mace/python/tools/encrypt_opencl_codegen.py' % mace_root_path,
-      '--cl_kernel_dir=%s/mace/kernels/opencl/cl' % mace_root_path,
+      '--cl_kernel_dir=%s/mace/ops/opencl/cl' % mace_root_path,
       '--output_path=%s/encrypt_opencl_kernel' % generated_files_path
   ], quiet=False)
 
diff --git a/tools/bazel.rc b/tools/bazel.rc
index bd1f79c7..1863738e 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -32,6 +32,7 @@ build:arm_linux --copt -Wno-ignored-attributes
 build:arm_linux --copt -Wno-unused-function
 build:arm_linux --copt -Wno-sequence-point
 build:arm_linux --copt -Wno-implicit-fallthrough
+build:arm_linux --copt -Wno-psabi
 
 # Usage example: bazel build --config aarch64_linux
 build:aarch64_linux --config=cross_compile
diff --git a/tools/converter.py b/tools/converter.py
index 86bedae9..4ad1a8b5 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -673,7 +673,7 @@ def download_file(url, dst, num_retries=3):
     try:
         urllib.request.urlretrieve(url, dst)
         MaceLogger.info('\nDownloaded successfully.')
-    except (urllib.URLError, urllib.ContentTooShortError) as e:
+    except urllib.ContentTooShortError as e:
         MaceLogger.warning('Download error:', e.reason)
         if num_retries > 0:
             return download_file(url, dst, num_retries - 1)
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 267dccfc..9c8045f1 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -410,7 +410,7 @@ def bazel_target_to_bin(target):
 ################################
 def gen_encrypted_opencl_source(codegen_path="mace/codegen"):
     sh.mkdir("-p", "%s/opencl" % codegen_path)
-    encrypt_opencl_codegen("./mace/kernels/opencl/cl/",
+    encrypt_opencl_codegen("./mace/ops/opencl/cl/",
                            "mace/codegen/opencl/opencl_encrypt_program.cc")
 
 
@@ -680,7 +680,7 @@ def create_internal_storage_dir(serialno, phone_data_dir):
 
 def push_depended_so_libs(libmace_dynamic_library_path,
                           abi, phone_data_dir, serialno):
-    dep_so_libs = sh.bash("/opt/android-ndk/ndk-depends",
+    dep_so_libs = sh.bash(os.environ["ANDROID_NDK_HOME"] + "/ndk-depends",
                           libmace_dynamic_library_path)
     for dep in split_stdout(dep_so_libs):
         if dep == "libgnustl_shared.so":
-- 
GitLab